contrib/llvm/tools/clang/lib/AST/CommentLexer.cpp

   1 #include "clang/AST/CommentLexer.h"
   2 #include "clang/AST/CommentCommandTraits.h"
   3 #include "clang/Basic/ConvertUTF.h"
   4 #include "llvm/ADT/StringSwitch.h"
   5 #include "llvm/Support/ErrorHandling.h"
   6
   7 namespace clang {
   8 namespace comments {
   9
  10 void Token::dump(const Lexer &L, const SourceManager &SM) const {
  11   llvm::errs() << "comments::Token Kind=" << Kind << " ";
  12   Loc.dump(SM);
  13   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
  14 }
  15
  16 namespace {
  17 bool isHTMLNamedCharacterReferenceCharacter(char C) {
  18   return (C >= 'a' && C <= 'z') ||
  19          (C >= 'A' && C <= 'Z');
  20 }
  21
  22 bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  23   return C >= '0' && C <= '9';
  24 }
  25
  26 bool isHTMLHexCharacterReferenceCharacter(char C) {
  27   return (C >= '0' && C <= '9') ||
  28          (C >= 'a' && C <= 'f') ||
  29          (C >= 'A' && C <= 'F');
  30 }
  31 } // unnamed namespace
  32
  33 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  34   return llvm::StringSwitch<StringRef>(Name)
  35       .Case("amp", "&")
  36       .Case("lt", "<")
  37       .Case("gt", ">")
  38       .Case("quot", "\"")
  39       .Case("apos", "\'")
  40       .Default("");
  41 }
  42
  43 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  44   unsigned CodePoint = 0;
  45   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  46     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
  47     CodePoint *= 10;
  48     CodePoint += Name[i] - '0';
  49   }
  50
  51   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  52   char *ResolvedPtr = Resolved;
  53   if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
  54     return StringRef(Resolved, ResolvedPtr - Resolved);
  55   else
  56     return StringRef();
  57 }
  58
  59 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  60   unsigned CodePoint = 0;
  61   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  62     CodePoint *= 16;
  63     const char C = Name[i];
  64     assert(isHTMLHexCharacterReferenceCharacter(C));
  65     if (C >= '0' && C <= '9')
  66       CodePoint += Name[i] - '0';
  67     else if (C >= 'a' && C <= 'f')
  68       CodePoint += Name[i] - 'a' + 10;
  69     else
  70       CodePoint += Name[i] - 'A' + 10;
  71   }
  72
  73   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  74   char *ResolvedPtr = Resolved;
  75   if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
  76     return StringRef(Resolved, ResolvedPtr - Resolved);
  77   else
  78     return StringRef();
  79 }
  80
  81 void Lexer::skipLineStartingDecorations() {
  82   // This function should be called only for C comments
  83   assert(CommentState == LCS_InsideCComment);
  84
  85   if (BufferPtr == CommentEnd)
  86     return;
  87
  88   switch (*BufferPtr) {
  89   case ' ':
  90   case '\t':
  91   case '\f':
  92   case '\v': {
  93     const char *NewBufferPtr = BufferPtr;
  94     NewBufferPtr++;
  95     if (NewBufferPtr == CommentEnd)
  96       return;
  97
  98     char C = *NewBufferPtr;
  99     while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
 100       NewBufferPtr++;
 101       if (NewBufferPtr == CommentEnd)
 102         return;
 103       C = *NewBufferPtr;
 104     }
 105     if (C == '*')
 106       BufferPtr = NewBufferPtr + 1;
 107     break;
 108   }
 109   case '*':
 110     BufferPtr++;
 111     break;
 112   }
 113 }
 114
 115 namespace {
 116 /// Returns pointer to the first newline character in the string.
 117 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
 118   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 119     const char C = *BufferPtr;
 120     if (C == '\n' || C == '\r')
 121       return BufferPtr;
 122   }
 123   return BufferEnd;
 124 }
 125
 126 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
 127   if (BufferPtr == BufferEnd)
 128     return BufferPtr;
 129
 130   if (*BufferPtr == '\n')
 131     BufferPtr++;
 132   else {
 133     assert(*BufferPtr == '\r');
 134     BufferPtr++;
 135     if (BufferPtr != BufferEnd && *BufferPtr == '\n')
 136       BufferPtr++;
 137   }
 138   return BufferPtr;
 139 }
 140
 141 const char *skipNamedCharacterReference(const char *BufferPtr,
 142                                         const char *BufferEnd) {
 143   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 144     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
 145       return BufferPtr;
 146   }
 147   return BufferEnd;
 148 }
 149
 150 const char *skipDecimalCharacterReference(const char *BufferPtr,
 151                                           const char *BufferEnd) {
 152   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 153     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
 154       return BufferPtr;
 155   }
 156   return BufferEnd;
 157 }
 158
 159 const char *skipHexCharacterReference(const char *BufferPtr,
 160                                           const char *BufferEnd) {
 161   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 162     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
 163       return BufferPtr;
 164   }
 165   return BufferEnd;
 166 }
 167
 168 bool isHTMLIdentifierStartingCharacter(char C) {
 169   return (C >= 'a' && C <= 'z') ||
 170          (C >= 'A' && C <= 'Z');
 171 }
 172
 173 bool isHTMLIdentifierCharacter(char C) {
 174   return (C >= 'a' && C <= 'z') ||
 175          (C >= 'A' && C <= 'Z') ||
 176          (C >= '0' && C <= '9');
 177 }
 178
 179 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
 180   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 181     if (!isHTMLIdentifierCharacter(*BufferPtr))
 182       return BufferPtr;
 183   }
 184   return BufferEnd;
 185 }
 186
 187 /// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
 188 /// string allowed.
 189 ///
 190 /// Returns pointer to closing quote.
 191 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
 192 {
 193   const char Quote = *BufferPtr;
 194   assert(Quote == '\"' || Quote == '\'');
 195
 196   BufferPtr++;
 197   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 198     const char C = *BufferPtr;
 199     if (C == Quote && BufferPtr[-1] != '\\')
 200       return BufferPtr;
 201   }
 202   return BufferEnd;
 203 }
 204
 205 bool isHorizontalWhitespace(char C) {
 206   return C == ' ' || C == '\t' || C == '\f' || C == '\v';
 207 }
 208
 209 bool isWhitespace(char C) {
 210   return C == ' ' || C == '\n' || C == '\r' ||
 211          C == '\t' || C == '\f' || C == '\v';
 212 }
 213
 214 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
 215   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 216     if (!isWhitespace(*BufferPtr))
 217       return BufferPtr;
 218   }
 219   return BufferEnd;
 220 }
 221
 222 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
 223   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
 224 }
 225
 226 bool isCommandNameCharacter(char C) {
 227   return (C >= 'a' && C <= 'z') ||
 228          (C >= 'A' && C <= 'Z') ||
 229          (C >= '0' && C <= '9');
 230 }
 231
 232 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
 233   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 234     if (!isCommandNameCharacter(*BufferPtr))
 235       return BufferPtr;
 236   }
 237   return BufferEnd;
 238 }
 239
 240 /// Return the one past end pointer for BCPL comments.
 241 /// Handles newlines escaped with backslash or trigraph for backslahs.
 242 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
 243   const char *CurPtr = BufferPtr;
 244   while (CurPtr != BufferEnd) {
 245     char C = *CurPtr;
 246     while (C != '\n' && C != '\r') {
 247       CurPtr++;
 248       if (CurPtr == BufferEnd)
 249         return BufferEnd;
 250       C = *CurPtr;
 251     }
 252     // We found a newline, check if it is escaped.
 253     const char *EscapePtr = CurPtr - 1;
 254     while(isHorizontalWhitespace(*EscapePtr))
 255       EscapePtr--;
 256
 257     if (*EscapePtr == '\\' ||
 258         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
 259          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
 260       // We found an escaped newline.
 261       CurPtr = skipNewline(CurPtr, BufferEnd);
 262     } else
 263       return CurPtr; // Not an escaped newline.
 264   }
 265   return BufferEnd;
 266 }
 267
 268 /// Return the one past end pointer for C comments.
 269 /// Very dumb, does not handle escaped newlines or trigraphs.
 270 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
 271   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 272     if (*BufferPtr == '*') {
 273       assert(BufferPtr + 1 != BufferEnd);
 274       if (*(BufferPtr + 1) == '/')
 275         return BufferPtr;
 276     }
 277   }
 278   llvm_unreachable("buffer end hit before '*/' was seen");
 279 }
 280 } // unnamed namespace
 281
 282 void Lexer::lexCommentText(Token &T) {
 283   assert(CommentState == LCS_InsideBCPLComment ||
 284          CommentState == LCS_InsideCComment);
 285
 286   switch (State) {
 287   case LS_Normal:
 288     break;
 289   case LS_VerbatimBlockFirstLine:
 290     lexVerbatimBlockFirstLine(T);
 291     return;
 292   case LS_VerbatimBlockBody:
 293     lexVerbatimBlockBody(T);
 294     return;
 295   case LS_VerbatimLineText:
 296     lexVerbatimLineText(T);
 297     return;
 298   case LS_HTMLStartTag:
 299     lexHTMLStartTag(T);
 300     return;
 301   case LS_HTMLEndTag:
 302     lexHTMLEndTag(T);
 303     return;
 304   }
 305
 306   assert(State == LS_Normal);
 307
 308   const char *TokenPtr = BufferPtr;
 309   assert(TokenPtr < CommentEnd);
 310   while (TokenPtr != CommentEnd) {
 311     switch(*TokenPtr) {
 312       case '\\':
 313       case '@': {
 314         TokenPtr++;
 315         if (TokenPtr == CommentEnd) {
 316           formTextToken(T, TokenPtr);
 317           return;
 318         }
 319         char C = *TokenPtr;
 320         switch (C) {
 321         default:
 322           break;
 323
 324         case '\\': case '@': case '&': case '$':
 325         case '#':  case '<': case '>': case '%':
 326         case '\"': case '.': case ':':
 327           // This is one of \\ \@ \& \$ etc escape sequences.
 328           TokenPtr++;
 329           if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
 330             // This is the \:: escape sequence.
 331             TokenPtr++;
 332           }
 333           StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
 334           formTokenWithChars(T, TokenPtr, tok::text);
 335           T.setText(UnescapedText);
 336           return;
 337         }
 338
 339         // Don't make zero-length commands.
 340         if (!isCommandNameCharacter(*TokenPtr)) {
 341           formTextToken(T, TokenPtr);
 342           return;
 343         }
 344
 345         TokenPtr = skipCommandName(TokenPtr, CommentEnd);
 346         unsigned Length = TokenPtr - (BufferPtr + 1);
 347
 348         // Hardcoded support for lexing LaTeX formula commands
 349         // \f$ \f[ \f] \f{ \f} as a single command.
 350         if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
 351           C = *TokenPtr;
 352           if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
 353             TokenPtr++;
 354             Length++;
 355           }
 356         }
 357
 358         const StringRef CommandName(BufferPtr + 1, Length);
 359         StringRef EndName;
 360
 361         if (Traits.isVerbatimBlockCommand(CommandName, EndName)) {
 362           setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
 363           return;
 364         }
 365         if (Traits.isVerbatimLineCommand(CommandName)) {
 366           setupAndLexVerbatimLine(T, TokenPtr);
 367           return;
 368         }
 369         formTokenWithChars(T, TokenPtr, tok::command);
 370         T.setCommandName(CommandName);
 371         return;
 372       }
 373
 374       case '&':
 375         lexHTMLCharacterReference(T);
 376         return;
 377
 378       case '<': {
 379         TokenPtr++;
 380         if (TokenPtr == CommentEnd) {
 381           formTextToken(T, TokenPtr);
 382           return;
 383         }
 384         const char C = *TokenPtr;
 385         if (isHTMLIdentifierStartingCharacter(C))
 386           setupAndLexHTMLStartTag(T);
 387         else if (C == '/')
 388           setupAndLexHTMLEndTag(T);
 389         else
 390           formTextToken(T, TokenPtr);
 391
 392         return;
 393       }
 394
 395       case '\n':
 396       case '\r':
 397         TokenPtr = skipNewline(TokenPtr, CommentEnd);
 398         formTokenWithChars(T, TokenPtr, tok::newline);
 399
 400         if (CommentState == LCS_InsideCComment)
 401           skipLineStartingDecorations();
 402         return;
 403
 404       default: {
 405         while (true) {
 406           TokenPtr++;
 407           if (TokenPtr == CommentEnd)
 408             break;
 409           const char C = *TokenPtr;
 410           if(C == '\n' || C == '\r' ||
 411              C == '\\' || C == '@' || C == '&' || C == '<')
 412             break;
 413         }
 414         formTextToken(T, TokenPtr);
 415         return;
 416       }
 417     }
 418   }
 419 }
 420
 421 void Lexer::setupAndLexVerbatimBlock(Token &T,
 422                                      const char *TextBegin,
 423                                      char Marker, StringRef EndName) {
 424   VerbatimBlockEndCommandName.clear();
 425   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
 426   VerbatimBlockEndCommandName.append(EndName);
 427
 428   StringRef Name(BufferPtr + 1, TextBegin - (BufferPtr + 1));
 429   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
 430   T.setVerbatimBlockName(Name);
 431
 432   // If there is a newline following the verbatim opening command, skip the
 433   // newline so that we don't create an tok::verbatim_block_line with empty
 434   // text content.
 435   if (BufferPtr != CommentEnd) {
 436     const char C = *BufferPtr;
 437     if (C == '\n' || C == '\r') {
 438       BufferPtr = skipNewline(BufferPtr, CommentEnd);
 439       State = LS_VerbatimBlockBody;
 440       return;
 441     }
 442   }
 443
 444   State = LS_VerbatimBlockFirstLine;
 445 }
 446
 447 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
 448 again:
 449   assert(BufferPtr < CommentEnd);
 450
 451   // FIXME: It would be better to scan the text once, finding either the block
 452   // end command or newline.
 453   //
 454   // Extract current line.
 455   const char *Newline = findNewline(BufferPtr, CommentEnd);
 456   StringRef Line(BufferPtr, Newline - BufferPtr);
 457
 458   // Look for end command in current line.
 459   size_t Pos = Line.find(VerbatimBlockEndCommandName);
 460   const char *TextEnd;
 461   const char *NextLine;
 462   if (Pos == StringRef::npos) {
 463     // Current line is completely verbatim.
 464     TextEnd = Newline;
 465     NextLine = skipNewline(Newline, CommentEnd);
 466   } else if (Pos == 0) {
 467     // Current line contains just an end command.
 468     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
 469     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
 470     formTokenWithChars(T, End, tok::verbatim_block_end);
 471     T.setVerbatimBlockName(Name);
 472     State = LS_Normal;
 473     return;
 474   } else {
 475     // There is some text, followed by end command.  Extract text first.
 476     TextEnd = BufferPtr + Pos;
 477     NextLine = TextEnd;
 478     // If there is only whitespace before end command, skip whitespace.
 479     if (isWhitespace(BufferPtr, TextEnd)) {
 480       BufferPtr = TextEnd;
 481       goto again;
 482     }
 483   }
 484
 485   StringRef Text(BufferPtr, TextEnd - BufferPtr);
 486   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
 487   T.setVerbatimBlockText(Text);
 488
 489   State = LS_VerbatimBlockBody;
 490 }
 491
 492 void Lexer::lexVerbatimBlockBody(Token &T) {
 493   assert(State == LS_VerbatimBlockBody);
 494
 495   if (CommentState == LCS_InsideCComment)
 496     skipLineStartingDecorations();
 497
 498   lexVerbatimBlockFirstLine(T);
 499 }
 500
 501 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin) {
 502   const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1);
 503   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
 504   T.setVerbatimLineName(Name);
 505
 506   State = LS_VerbatimLineText;
 507 }
 508
 509 void Lexer::lexVerbatimLineText(Token &T) {
 510   assert(State == LS_VerbatimLineText);
 511
 512   // Extract current line.
 513   const char *Newline = findNewline(BufferPtr, CommentEnd);
 514   const StringRef Text(BufferPtr, Newline - BufferPtr);
 515   formTokenWithChars(T, Newline, tok::verbatim_line_text);
 516   T.setVerbatimLineText(Text);
 517
 518   State = LS_Normal;
 519 }
 520
 521 void Lexer::lexHTMLCharacterReference(Token &T) {
 522   const char *TokenPtr = BufferPtr;
 523   assert(*TokenPtr == '&');
 524   TokenPtr++;
 525   if (TokenPtr == CommentEnd) {
 526     formTextToken(T, TokenPtr);
 527     return;
 528   }
 529   const char *NamePtr;
 530   bool isNamed = false;
 531   bool isDecimal = false;
 532   char C = *TokenPtr;
 533   if (isHTMLNamedCharacterReferenceCharacter(C)) {
 534     NamePtr = TokenPtr;
 535     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
 536     isNamed = true;
 537   } else if (C == '#') {
 538     TokenPtr++;
 539     if (TokenPtr == CommentEnd) {
 540       formTextToken(T, TokenPtr);
 541       return;
 542     }
 543     C = *TokenPtr;
 544     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
 545       NamePtr = TokenPtr;
 546       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
 547       isDecimal = true;
 548     } else if (C == 'x' || C == 'X') {
 549       TokenPtr++;
 550       NamePtr = TokenPtr;
 551       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
 552     } else {
 553       formTextToken(T, TokenPtr);
 554       return;
 555     }
 556   } else {
 557     formTextToken(T, TokenPtr);
 558     return;
 559   }
 560   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
 561       *TokenPtr != ';') {
 562     formTextToken(T, TokenPtr);
 563     return;
 564   }
 565   StringRef Name(NamePtr, TokenPtr - NamePtr);
 566   TokenPtr++; // Skip semicolon.
 567   StringRef Resolved;
 568   if (isNamed)
 569     Resolved = resolveHTMLNamedCharacterReference(Name);
 570   else if (isDecimal)
 571     Resolved = resolveHTMLDecimalCharacterReference(Name);
 572   else
 573     Resolved = resolveHTMLHexCharacterReference(Name);
 574
 575   if (Resolved.empty()) {
 576     formTextToken(T, TokenPtr);
 577     return;
 578   }
 579   formTokenWithChars(T, TokenPtr, tok::text);
 580   T.setText(Resolved);
 581   return;
 582 }
 583
 584 void Lexer::setupAndLexHTMLStartTag(Token &T) {
 585   assert(BufferPtr[0] == '<' &&
 586          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
 587   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
 588
 589   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
 590   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
 591   T.setHTMLTagStartName(Name);
 592
 593   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
 594
 595   const char C = *BufferPtr;
 596   if (BufferPtr != CommentEnd &&
 597       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
 598     State = LS_HTMLStartTag;
 599 }
 600
 601 void Lexer::lexHTMLStartTag(Token &T) {
 602   assert(State == LS_HTMLStartTag);
 603
 604   const char *TokenPtr = BufferPtr;
 605   char C = *TokenPtr;
 606   if (isHTMLIdentifierCharacter(C)) {
 607     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
 608     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
 609     formTokenWithChars(T, TokenPtr, tok::html_ident);
 610     T.setHTMLIdent(Ident);
 611   } else {
 612     switch (C) {
 613     case '=':
 614       TokenPtr++;
 615       formTokenWithChars(T, TokenPtr, tok::html_equals);
 616       break;
 617     case '\"':
 618     case '\'': {
 619       const char *OpenQuote = TokenPtr;
 620       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
 621       const char *ClosingQuote = TokenPtr;
 622       if (TokenPtr != CommentEnd) // Skip closing quote.
 623         TokenPtr++;
 624       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
 625       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
 626                                       ClosingQuote - (OpenQuote + 1)));
 627       break;
 628     }
 629     case '>':
 630       TokenPtr++;
 631       formTokenWithChars(T, TokenPtr, tok::html_greater);
 632       State = LS_Normal;
 633       return;
 634     case '/':
 635       TokenPtr++;
 636       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
 637         TokenPtr++;
 638         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
 639       } else
 640         formTextToken(T, TokenPtr);
 641
 642       State = LS_Normal;
 643       return;
 644     }
 645   }
 646
 647   // Now look ahead and return to normal state if we don't see any HTML tokens
 648   // ahead.
 649   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
 650   if (BufferPtr == CommentEnd) {
 651     State = LS_Normal;
 652     return;
 653   }
 654
 655   C = *BufferPtr;
 656   if (!isHTMLIdentifierStartingCharacter(C) &&
 657       C != '=' && C != '\"' && C != '\'' && C != '>') {
 658     State = LS_Normal;
 659     return;
 660   }
 661 }
 662
 663 void Lexer::setupAndLexHTMLEndTag(Token &T) {
 664   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
 665
 666   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
 667   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
 668
 669   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
 670
 671   formTokenWithChars(T, End, tok::html_end_tag);
 672   T.setHTMLTagEndName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin));
 673
 674   if (BufferPtr != CommentEnd && *BufferPtr == '>')
 675     State = LS_HTMLEndTag;
 676 }
 677
 678 void Lexer::lexHTMLEndTag(Token &T) {
 679   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
 680
 681   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
 682   State = LS_Normal;
 683 }
 684
 685 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
 686              SourceLocation FileLoc, const CommentOptions &CommOpts,
 687              const char *BufferStart, const char *BufferEnd):
 688     Allocator(Allocator), Traits(Traits),
 689     BufferStart(BufferStart), BufferEnd(BufferEnd),
 690     FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
 691     CommentState(LCS_BeforeComment), State(LS_Normal) {
 692 }
 693
 694 void Lexer::lex(Token &T) {
 695 again:
 696   switch (CommentState) {
 697   case LCS_BeforeComment:
 698     if (BufferPtr == BufferEnd) {
 699       formTokenWithChars(T, BufferPtr, tok::eof);
 700       return;
 701     }
 702
 703     assert(*BufferPtr == '/');
 704     BufferPtr++; // Skip first slash.
 705     switch(*BufferPtr) {
 706     case '/': { // BCPL comment.
 707       BufferPtr++; // Skip second slash.
 708
 709       if (BufferPtr != BufferEnd) {
 710         // Skip Doxygen magic marker, if it is present.
 711         // It might be missing because of a typo //< or /*<, or because we
 712         // merged this non-Doxygen comment into a bunch of Doxygen comments
 713         // around it: /** ... */ /* ... */ /** ... */
 714         const char C = *BufferPtr;
 715         if (C == '/' || C == '!')
 716           BufferPtr++;
 717       }
 718
 719       // Skip less-than symbol that marks trailing comments.
 720       // Skip it even if the comment is not a Doxygen one, because //< and /*<
 721       // are frequent typos.
 722       if (BufferPtr != BufferEnd && *BufferPtr == '<')
 723         BufferPtr++;
 724
 725       CommentState = LCS_InsideBCPLComment;
 726       if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
 727         State = LS_Normal;
 728       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
 729       goto again;
 730     }
 731     case '*': { // C comment.
 732       BufferPtr++; // Skip star.
 733
 734       // Skip Doxygen magic marker.
 735       const char C = *BufferPtr;
 736       if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
 737         BufferPtr++;
 738
 739       // Skip less-than symbol that marks trailing comments.
 740       if (BufferPtr != BufferEnd && *BufferPtr == '<')
 741         BufferPtr++;
 742
 743       CommentState = LCS_InsideCComment;
 744       State = LS_Normal;
 745       CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
 746       goto again;
 747     }
 748     default:
 749       llvm_unreachable("second character of comment should be '/' or '*'");
 750     }
 751
 752   case LCS_BetweenComments: {
 753     // Consecutive comments are extracted only if there is only whitespace
 754     // between them.  So we can search for the start of the next comment.
 755     const char *EndWhitespace = BufferPtr;
 756     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
 757       EndWhitespace++;
 758
 759     // Turn any whitespace between comments (and there is only whitespace
 760     // between them -- guaranteed by comment extraction) into a newline.  We
 761     // have two newlines between C comments in total (first one was synthesized
 762     // after a comment).
 763     formTokenWithChars(T, EndWhitespace, tok::newline);
 764
 765     CommentState = LCS_BeforeComment;
 766     break;
 767   }
 768
 769   case LCS_InsideBCPLComment:
 770   case LCS_InsideCComment:
 771     if (BufferPtr != CommentEnd) {
 772       lexCommentText(T);
 773       break;
 774     } else {
 775       // Skip C comment closing sequence.
 776       if (CommentState == LCS_InsideCComment) {
 777         assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
 778         BufferPtr += 2;
 779         assert(BufferPtr <= BufferEnd);
 780
 781         // Synthenize newline just after the C comment, regardless if there is
 782         // actually a newline.
 783         formTokenWithChars(T, BufferPtr, tok::newline);
 784
 785         CommentState = LCS_BetweenComments;
 786         break;
 787       } else {
 788         // Don't synthesized a newline after BCPL comment.
 789         CommentState = LCS_BetweenComments;
 790         goto again;
 791       }
 792     }
 793   }
 794 }
 795
 796 StringRef Lexer::getSpelling(const Token &Tok,
 797                              const SourceManager &SourceMgr,
 798                              bool *Invalid) const {
 799   SourceLocation Loc = Tok.getLocation();
 800   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
 801
 802   bool InvalidTemp = false;
 803   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
 804   if (InvalidTemp) {
 805     *Invalid = true;
 806     return StringRef();
 807   }
 808
 809   const char *Begin = File.data() + LocInfo.second;
 810   return StringRef(Begin, Tok.getLength());
 811 }
 812
 813 } // end namespace comments
 814 } // end namespace clang
 815