contrib/llvm/tools/clang/lib/AST/CommentLexer.cpp

   1 #include "clang/AST/CommentLexer.h"
   2 #include "clang/AST/CommentCommandTraits.h"
   3 #include "clang/AST/CommentDiagnostic.h"
   4 #include "clang/Basic/CharInfo.h"
   5 #include "llvm/ADT/StringExtras.h"
   6 #include "llvm/ADT/StringSwitch.h"
   7 #include "llvm/Support/ConvertUTF.h"
   8 #include "llvm/Support/ErrorHandling.h"
   9
  10 namespace clang {
  11 namespace comments {
  12
  13 void Token::dump(const Lexer &L, const SourceManager &SM) const {
  14   llvm::errs() << "comments::Token Kind=" << Kind << " ";
  15   Loc.dump(SM);
  16   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
  17 }
  18
  19 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  20   return isLetter(C);
  21 }
  22
  23 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  24   return isDigit(C);
  25 }
  26
  27 static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  28   return isHexDigit(C);
  29 }
  30
  31 static inline StringRef convertCodePointToUTF8(
  32                                       llvm::BumpPtrAllocator &Allocator,
  33                                       unsigned CodePoint) {
  34   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  35   char *ResolvedPtr = Resolved;
  36   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
  37     return StringRef(Resolved, ResolvedPtr - Resolved);
  38   else
  39     return StringRef();
  40 }
  41
  42 namespace {
  43
  44 #include "clang/AST/CommentHTMLTags.inc"
  45 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
  46
  47 } // unnamed namespace
  48
  49 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  50   // Fast path, first check a few most widely used named character references.
  51   return llvm::StringSwitch<StringRef>(Name)
  52       .Case("amp", "&")
  53       .Case("lt", "<")
  54       .Case("gt", ">")
  55       .Case("quot", "\"")
  56       .Case("apos", "\'")
  57       // Slow path.
  58       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
  59 }
  60
  61 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  62   unsigned CodePoint = 0;
  63   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  64     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
  65     CodePoint *= 10;
  66     CodePoint += Name[i] - '0';
  67   }
  68   return convertCodePointToUTF8(Allocator, CodePoint);
  69 }
  70
  71 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  72   unsigned CodePoint = 0;
  73   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  74     CodePoint *= 16;
  75     const char C = Name[i];
  76     assert(isHTMLHexCharacterReferenceCharacter(C));
  77     CodePoint += llvm::hexDigitValue(C);
  78   }
  79   return convertCodePointToUTF8(Allocator, CodePoint);
  80 }
  81
  82 void Lexer::skipLineStartingDecorations() {
  83   // This function should be called only for C comments
  84   assert(CommentState == LCS_InsideCComment);
  85
  86   if (BufferPtr == CommentEnd)
  87     return;
  88
  89   switch (*BufferPtr) {
  90   case ' ':
  91   case '\t':
  92   case '\f':
  93   case '\v': {
  94     const char *NewBufferPtr = BufferPtr;
  95     NewBufferPtr++;
  96     if (NewBufferPtr == CommentEnd)
  97       return;
  98
  99     char C = *NewBufferPtr;
 100     while (isHorizontalWhitespace(C)) {
 101       NewBufferPtr++;
 102       if (NewBufferPtr == CommentEnd)
 103         return;
 104       C = *NewBufferPtr;
 105     }
 106     if (C == '*')
 107       BufferPtr = NewBufferPtr + 1;
 108     break;
 109   }
 110   case '*':
 111     BufferPtr++;
 112     break;
 113   }
 114 }
 115
 116 namespace {
 117 /// Returns pointer to the first newline character in the string.
 118 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
 119   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 120     if (isVerticalWhitespace(*BufferPtr))
 121       return BufferPtr;
 122   }
 123   return BufferEnd;
 124 }
 125
 126 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
 127   if (BufferPtr == BufferEnd)
 128     return BufferPtr;
 129
 130   if (*BufferPtr == '\n')
 131     BufferPtr++;
 132   else {
 133     assert(*BufferPtr == '\r');
 134     BufferPtr++;
 135     if (BufferPtr != BufferEnd && *BufferPtr == '\n')
 136       BufferPtr++;
 137   }
 138   return BufferPtr;
 139 }
 140
 141 const char *skipNamedCharacterReference(const char *BufferPtr,
 142                                         const char *BufferEnd) {
 143   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 144     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
 145       return BufferPtr;
 146   }
 147   return BufferEnd;
 148 }
 149
 150 const char *skipDecimalCharacterReference(const char *BufferPtr,
 151                                           const char *BufferEnd) {
 152   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 153     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
 154       return BufferPtr;
 155   }
 156   return BufferEnd;
 157 }
 158
 159 const char *skipHexCharacterReference(const char *BufferPtr,
 160                                           const char *BufferEnd) {
 161   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 162     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
 163       return BufferPtr;
 164   }
 165   return BufferEnd;
 166 }
 167
 168 bool isHTMLIdentifierStartingCharacter(char C) {
 169   return isLetter(C);
 170 }
 171
 172 bool isHTMLIdentifierCharacter(char C) {
 173   return isAlphanumeric(C);
 174 }
 175
 176 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
 177   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 178     if (!isHTMLIdentifierCharacter(*BufferPtr))
 179       return BufferPtr;
 180   }
 181   return BufferEnd;
 182 }
 183
 184 /// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
 185 /// string allowed.
 186 ///
 187 /// Returns pointer to closing quote.
 188 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
 189 {
 190   const char Quote = *BufferPtr;
 191   assert(Quote == '\"' || Quote == '\'');
 192
 193   BufferPtr++;
 194   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 195     const char C = *BufferPtr;
 196     if (C == Quote && BufferPtr[-1] != '\\')
 197       return BufferPtr;
 198   }
 199   return BufferEnd;
 200 }
 201
 202 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
 203   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 204     if (!isWhitespace(*BufferPtr))
 205       return BufferPtr;
 206   }
 207   return BufferEnd;
 208 }
 209
 210 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
 211   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
 212 }
 213
 214 bool isCommandNameStartCharacter(char C) {
 215   return isLetter(C);
 216 }
 217
 218 bool isCommandNameCharacter(char C) {
 219   return isAlphanumeric(C);
 220 }
 221
 222 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
 223   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 224     if (!isCommandNameCharacter(*BufferPtr))
 225       return BufferPtr;
 226   }
 227   return BufferEnd;
 228 }
 229
 230 /// Return the one past end pointer for BCPL comments.
 231 /// Handles newlines escaped with backslash or trigraph for backslahs.
 232 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
 233   const char *CurPtr = BufferPtr;
 234   while (CurPtr != BufferEnd) {
 235     while (!isVerticalWhitespace(*CurPtr)) {
 236       CurPtr++;
 237       if (CurPtr == BufferEnd)
 238         return BufferEnd;
 239     }
 240     // We found a newline, check if it is escaped.
 241     const char *EscapePtr = CurPtr - 1;
 242     while(isHorizontalWhitespace(*EscapePtr))
 243       EscapePtr--;
 244
 245     if (*EscapePtr == '\\' ||
 246         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
 247          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
 248       // We found an escaped newline.
 249       CurPtr = skipNewline(CurPtr, BufferEnd);
 250     } else
 251       return CurPtr; // Not an escaped newline.
 252   }
 253   return BufferEnd;
 254 }
 255
 256 /// Return the one past end pointer for C comments.
 257 /// Very dumb, does not handle escaped newlines or trigraphs.
 258 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
 259   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 260     if (*BufferPtr == '*') {
 261       assert(BufferPtr + 1 != BufferEnd);
 262       if (*(BufferPtr + 1) == '/')
 263         return BufferPtr;
 264     }
 265   }
 266   llvm_unreachable("buffer end hit before '*/' was seen");
 267 }
 268 } // unnamed namespace
 269
 270 void Lexer::lexCommentText(Token &T) {
 271   assert(CommentState == LCS_InsideBCPLComment ||
 272          CommentState == LCS_InsideCComment);
 273
 274   switch (State) {
 275   case LS_Normal:
 276     break;
 277   case LS_VerbatimBlockFirstLine:
 278     lexVerbatimBlockFirstLine(T);
 279     return;
 280   case LS_VerbatimBlockBody:
 281     lexVerbatimBlockBody(T);
 282     return;
 283   case LS_VerbatimLineText:
 284     lexVerbatimLineText(T);
 285     return;
 286   case LS_HTMLStartTag:
 287     lexHTMLStartTag(T);
 288     return;
 289   case LS_HTMLEndTag:
 290     lexHTMLEndTag(T);
 291     return;
 292   }
 293
 294   assert(State == LS_Normal);
 295
 296   const char *TokenPtr = BufferPtr;
 297   assert(TokenPtr < CommentEnd);
 298   while (TokenPtr != CommentEnd) {
 299     switch(*TokenPtr) {
 300       case '\\':
 301       case '@': {
 302         // Commands that start with a backslash and commands that start with
 303         // 'at' have equivalent semantics.  But we keep information about the
 304         // exact syntax in AST for comments.
 305         tok::TokenKind CommandKind =
 306             (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
 307         TokenPtr++;
 308         if (TokenPtr == CommentEnd) {
 309           formTextToken(T, TokenPtr);
 310           return;
 311         }
 312         char C = *TokenPtr;
 313         switch (C) {
 314         default:
 315           break;
 316
 317         case '\\': case '@': case '&': case '$':
 318         case '#':  case '<': case '>': case '%':
 319         case '\"': case '.': case ':':
 320           // This is one of \\ \@ \& \$ etc escape sequences.
 321           TokenPtr++;
 322           if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
 323             // This is the \:: escape sequence.
 324             TokenPtr++;
 325           }
 326           StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
 327           formTokenWithChars(T, TokenPtr, tok::text);
 328           T.setText(UnescapedText);
 329           return;
 330         }
 331
 332         // Don't make zero-length commands.
 333         if (!isCommandNameStartCharacter(*TokenPtr)) {
 334           formTextToken(T, TokenPtr);
 335           return;
 336         }
 337
 338         TokenPtr = skipCommandName(TokenPtr, CommentEnd);
 339         unsigned Length = TokenPtr - (BufferPtr + 1);
 340
 341         // Hardcoded support for lexing LaTeX formula commands
 342         // \f$ \f[ \f] \f{ \f} as a single command.
 343         if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
 344           C = *TokenPtr;
 345           if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
 346             TokenPtr++;
 347             Length++;
 348           }
 349         }
 350
 351         const StringRef CommandName(BufferPtr + 1, Length);
 352
 353         const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
 354         if (!Info) {
 355           formTokenWithChars(T, TokenPtr, tok::unknown_command);
 356           T.setUnknownCommandName(CommandName);
 357           Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
 358           return;
 359         }
 360         if (Info->IsVerbatimBlockCommand) {
 361           setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
 362           return;
 363         }
 364         if (Info->IsVerbatimLineCommand) {
 365           setupAndLexVerbatimLine(T, TokenPtr, Info);
 366           return;
 367         }
 368         formTokenWithChars(T, TokenPtr, CommandKind);
 369         T.setCommandID(Info->getID());
 370         return;
 371       }
 372
 373       case '&':
 374         lexHTMLCharacterReference(T);
 375         return;
 376
 377       case '<': {
 378         TokenPtr++;
 379         if (TokenPtr == CommentEnd) {
 380           formTextToken(T, TokenPtr);
 381           return;
 382         }
 383         const char C = *TokenPtr;
 384         if (isHTMLIdentifierStartingCharacter(C))
 385           setupAndLexHTMLStartTag(T);
 386         else if (C == '/')
 387           setupAndLexHTMLEndTag(T);
 388         else
 389           formTextToken(T, TokenPtr);
 390
 391         return;
 392       }
 393
 394       case '\n':
 395       case '\r':
 396         TokenPtr = skipNewline(TokenPtr, CommentEnd);
 397         formTokenWithChars(T, TokenPtr, tok::newline);
 398
 399         if (CommentState == LCS_InsideCComment)
 400           skipLineStartingDecorations();
 401         return;
 402
 403       default: {
 404         size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
 405                          find_first_of("\n\r\\@&<");
 406         if (End != StringRef::npos)
 407           TokenPtr += End;
 408         else
 409           TokenPtr = CommentEnd;
 410         formTextToken(T, TokenPtr);
 411         return;
 412       }
 413     }
 414   }
 415 }
 416
 417 void Lexer::setupAndLexVerbatimBlock(Token &T,
 418                                      const char *TextBegin,
 419                                      char Marker, const CommandInfo *Info) {
 420   assert(Info->IsVerbatimBlockCommand);
 421
 422   VerbatimBlockEndCommandName.clear();
 423   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
 424   VerbatimBlockEndCommandName.append(Info->EndCommandName);
 425
 426   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
 427   T.setVerbatimBlockID(Info->getID());
 428
 429   // If there is a newline following the verbatim opening command, skip the
 430   // newline so that we don't create an tok::verbatim_block_line with empty
 431   // text content.
 432   if (BufferPtr != CommentEnd &&
 433       isVerticalWhitespace(*BufferPtr)) {
 434     BufferPtr = skipNewline(BufferPtr, CommentEnd);
 435     State = LS_VerbatimBlockBody;
 436     return;
 437   }
 438
 439   State = LS_VerbatimBlockFirstLine;
 440 }
 441
 442 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
 443 again:
 444   assert(BufferPtr < CommentEnd);
 445
 446   // FIXME: It would be better to scan the text once, finding either the block
 447   // end command or newline.
 448   //
 449   // Extract current line.
 450   const char *Newline = findNewline(BufferPtr, CommentEnd);
 451   StringRef Line(BufferPtr, Newline - BufferPtr);
 452
 453   // Look for end command in current line.
 454   size_t Pos = Line.find(VerbatimBlockEndCommandName);
 455   const char *TextEnd;
 456   const char *NextLine;
 457   if (Pos == StringRef::npos) {
 458     // Current line is completely verbatim.
 459     TextEnd = Newline;
 460     NextLine = skipNewline(Newline, CommentEnd);
 461   } else if (Pos == 0) {
 462     // Current line contains just an end command.
 463     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
 464     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
 465     formTokenWithChars(T, End, tok::verbatim_block_end);
 466     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
 467     State = LS_Normal;
 468     return;
 469   } else {
 470     // There is some text, followed by end command.  Extract text first.
 471     TextEnd = BufferPtr + Pos;
 472     NextLine = TextEnd;
 473     // If there is only whitespace before end command, skip whitespace.
 474     if (isWhitespace(BufferPtr, TextEnd)) {
 475       BufferPtr = TextEnd;
 476       goto again;
 477     }
 478   }
 479
 480   StringRef Text(BufferPtr, TextEnd - BufferPtr);
 481   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
 482   T.setVerbatimBlockText(Text);
 483
 484   State = LS_VerbatimBlockBody;
 485 }
 486
 487 void Lexer::lexVerbatimBlockBody(Token &T) {
 488   assert(State == LS_VerbatimBlockBody);
 489
 490   if (CommentState == LCS_InsideCComment)
 491     skipLineStartingDecorations();
 492
 493   lexVerbatimBlockFirstLine(T);
 494 }
 495
 496 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
 497                                     const CommandInfo *Info) {
 498   assert(Info->IsVerbatimLineCommand);
 499   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
 500   T.setVerbatimLineID(Info->getID());
 501
 502   State = LS_VerbatimLineText;
 503 }
 504
 505 void Lexer::lexVerbatimLineText(Token &T) {
 506   assert(State == LS_VerbatimLineText);
 507
 508   // Extract current line.
 509   const char *Newline = findNewline(BufferPtr, CommentEnd);
 510   const StringRef Text(BufferPtr, Newline - BufferPtr);
 511   formTokenWithChars(T, Newline, tok::verbatim_line_text);
 512   T.setVerbatimLineText(Text);
 513
 514   State = LS_Normal;
 515 }
 516
 517 void Lexer::lexHTMLCharacterReference(Token &T) {
 518   const char *TokenPtr = BufferPtr;
 519   assert(*TokenPtr == '&');
 520   TokenPtr++;
 521   if (TokenPtr == CommentEnd) {
 522     formTextToken(T, TokenPtr);
 523     return;
 524   }
 525   const char *NamePtr;
 526   bool isNamed = false;
 527   bool isDecimal = false;
 528   char C = *TokenPtr;
 529   if (isHTMLNamedCharacterReferenceCharacter(C)) {
 530     NamePtr = TokenPtr;
 531     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
 532     isNamed = true;
 533   } else if (C == '#') {
 534     TokenPtr++;
 535     if (TokenPtr == CommentEnd) {
 536       formTextToken(T, TokenPtr);
 537       return;
 538     }
 539     C = *TokenPtr;
 540     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
 541       NamePtr = TokenPtr;
 542       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
 543       isDecimal = true;
 544     } else if (C == 'x' || C == 'X') {
 545       TokenPtr++;
 546       NamePtr = TokenPtr;
 547       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
 548     } else {
 549       formTextToken(T, TokenPtr);
 550       return;
 551     }
 552   } else {
 553     formTextToken(T, TokenPtr);
 554     return;
 555   }
 556   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
 557       *TokenPtr != ';') {
 558     formTextToken(T, TokenPtr);
 559     return;
 560   }
 561   StringRef Name(NamePtr, TokenPtr - NamePtr);
 562   TokenPtr++; // Skip semicolon.
 563   StringRef Resolved;
 564   if (isNamed)
 565     Resolved = resolveHTMLNamedCharacterReference(Name);
 566   else if (isDecimal)
 567     Resolved = resolveHTMLDecimalCharacterReference(Name);
 568   else
 569     Resolved = resolveHTMLHexCharacterReference(Name);
 570
 571   if (Resolved.empty()) {
 572     formTextToken(T, TokenPtr);
 573     return;
 574   }
 575   formTokenWithChars(T, TokenPtr, tok::text);
 576   T.setText(Resolved);
 577   return;
 578 }
 579
 580 void Lexer::setupAndLexHTMLStartTag(Token &T) {
 581   assert(BufferPtr[0] == '<' &&
 582          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
 583   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
 584   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
 585   if (!isHTMLTagName(Name)) {
 586     formTextToken(T, TagNameEnd);
 587     return;
 588   }
 589
 590   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
 591   T.setHTMLTagStartName(Name);
 592
 593   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
 594
 595   const char C = *BufferPtr;
 596   if (BufferPtr != CommentEnd &&
 597       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
 598     State = LS_HTMLStartTag;
 599 }
 600
 601 void Lexer::lexHTMLStartTag(Token &T) {
 602   assert(State == LS_HTMLStartTag);
 603
 604   const char *TokenPtr = BufferPtr;
 605   char C = *TokenPtr;
 606   if (isHTMLIdentifierCharacter(C)) {
 607     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
 608     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
 609     formTokenWithChars(T, TokenPtr, tok::html_ident);
 610     T.setHTMLIdent(Ident);
 611   } else {
 612     switch (C) {
 613     case '=':
 614       TokenPtr++;
 615       formTokenWithChars(T, TokenPtr, tok::html_equals);
 616       break;
 617     case '\"':
 618     case '\'': {
 619       const char *OpenQuote = TokenPtr;
 620       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
 621       const char *ClosingQuote = TokenPtr;
 622       if (TokenPtr != CommentEnd) // Skip closing quote.
 623         TokenPtr++;
 624       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
 625       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
 626                                       ClosingQuote - (OpenQuote + 1)));
 627       break;
 628     }
 629     case '>':
 630       TokenPtr++;
 631       formTokenWithChars(T, TokenPtr, tok::html_greater);
 632       State = LS_Normal;
 633       return;
 634     case '/':
 635       TokenPtr++;
 636       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
 637         TokenPtr++;
 638         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
 639       } else
 640         formTextToken(T, TokenPtr);
 641
 642       State = LS_Normal;
 643       return;
 644     }
 645   }
 646
 647   // Now look ahead and return to normal state if we don't see any HTML tokens
 648   // ahead.
 649   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
 650   if (BufferPtr == CommentEnd) {
 651     State = LS_Normal;
 652     return;
 653   }
 654
 655   C = *BufferPtr;
 656   if (!isHTMLIdentifierStartingCharacter(C) &&
 657       C != '=' && C != '\"' && C != '\'' && C != '>') {
 658     State = LS_Normal;
 659     return;
 660   }
 661 }
 662
 663 void Lexer::setupAndLexHTMLEndTag(Token &T) {
 664   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
 665
 666   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
 667   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
 668   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
 669   if (!isHTMLTagName(Name)) {
 670     formTextToken(T, TagNameEnd);
 671     return;
 672   }
 673
 674   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
 675
 676   formTokenWithChars(T, End, tok::html_end_tag);
 677   T.setHTMLTagEndName(Name);
 678
 679   if (BufferPtr != CommentEnd && *BufferPtr == '>')
 680     State = LS_HTMLEndTag;
 681 }
 682
 683 void Lexer::lexHTMLEndTag(Token &T) {
 684   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
 685
 686   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
 687   State = LS_Normal;
 688 }
 689
 690 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
 691              const CommandTraits &Traits,
 692              SourceLocation FileLoc,
 693              const char *BufferStart, const char *BufferEnd):
 694     Allocator(Allocator), Diags(Diags), Traits(Traits),
 695     BufferStart(BufferStart), BufferEnd(BufferEnd),
 696     FileLoc(FileLoc), BufferPtr(BufferStart),
 697     CommentState(LCS_BeforeComment), State(LS_Normal) {
 698 }
 699
 700 void Lexer::lex(Token &T) {
 701 again:
 702   switch (CommentState) {
 703   case LCS_BeforeComment:
 704     if (BufferPtr == BufferEnd) {
 705       formTokenWithChars(T, BufferPtr, tok::eof);
 706       return;
 707     }
 708
 709     assert(*BufferPtr == '/');
 710     BufferPtr++; // Skip first slash.
 711     switch(*BufferPtr) {
 712     case '/': { // BCPL comment.
 713       BufferPtr++; // Skip second slash.
 714
 715       if (BufferPtr != BufferEnd) {
 716         // Skip Doxygen magic marker, if it is present.
 717         // It might be missing because of a typo //< or /*<, or because we
 718         // merged this non-Doxygen comment into a bunch of Doxygen comments
 719         // around it: /** ... */ /* ... */ /** ... */
 720         const char C = *BufferPtr;
 721         if (C == '/' || C == '!')
 722           BufferPtr++;
 723       }
 724
 725       // Skip less-than symbol that marks trailing comments.
 726       // Skip it even if the comment is not a Doxygen one, because //< and /*<
 727       // are frequent typos.
 728       if (BufferPtr != BufferEnd && *BufferPtr == '<')
 729         BufferPtr++;
 730
 731       CommentState = LCS_InsideBCPLComment;
 732       if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
 733         State = LS_Normal;
 734       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
 735       goto again;
 736     }
 737     case '*': { // C comment.
 738       BufferPtr++; // Skip star.
 739
 740       // Skip Doxygen magic marker.
 741       const char C = *BufferPtr;
 742       if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
 743         BufferPtr++;
 744
 745       // Skip less-than symbol that marks trailing comments.
 746       if (BufferPtr != BufferEnd && *BufferPtr == '<')
 747         BufferPtr++;
 748
 749       CommentState = LCS_InsideCComment;
 750       State = LS_Normal;
 751       CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
 752       goto again;
 753     }
 754     default:
 755       llvm_unreachable("second character of comment should be '/' or '*'");
 756     }
 757
 758   case LCS_BetweenComments: {
 759     // Consecutive comments are extracted only if there is only whitespace
 760     // between them.  So we can search for the start of the next comment.
 761     const char *EndWhitespace = BufferPtr;
 762     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
 763       EndWhitespace++;
 764
 765     // Turn any whitespace between comments (and there is only whitespace
 766     // between them -- guaranteed by comment extraction) into a newline.  We
 767     // have two newlines between C comments in total (first one was synthesized
 768     // after a comment).
 769     formTokenWithChars(T, EndWhitespace, tok::newline);
 770
 771     CommentState = LCS_BeforeComment;
 772     break;
 773   }
 774
 775   case LCS_InsideBCPLComment:
 776   case LCS_InsideCComment:
 777     if (BufferPtr != CommentEnd) {
 778       lexCommentText(T);
 779       break;
 780     } else {
 781       // Skip C comment closing sequence.
 782       if (CommentState == LCS_InsideCComment) {
 783         assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
 784         BufferPtr += 2;
 785         assert(BufferPtr <= BufferEnd);
 786
 787         // Synthenize newline just after the C comment, regardless if there is
 788         // actually a newline.
 789         formTokenWithChars(T, BufferPtr, tok::newline);
 790
 791         CommentState = LCS_BetweenComments;
 792         break;
 793       } else {
 794         // Don't synthesized a newline after BCPL comment.
 795         CommentState = LCS_BetweenComments;
 796         goto again;
 797       }
 798     }
 799   }
 800 }
 801
 802 StringRef Lexer::getSpelling(const Token &Tok,
 803                              const SourceManager &SourceMgr,
 804                              bool *Invalid) const {
 805   SourceLocation Loc = Tok.getLocation();
 806   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
 807
 808   bool InvalidTemp = false;
 809   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
 810   if (InvalidTemp) {
 811     *Invalid = true;
 812     return StringRef();
 813   }
 814
 815   const char *Begin = File.data() + LocInfo.second;
 816   return StringRef(Begin, Tok.getLength());
 817 }
 818
 819 } // end namespace comments
 820 } // end namespace clang
 821