contrib/llvm/tools/clang/lib/AST/CommentLexer.cpp

   1 #include "clang/AST/CommentLexer.h"
   2 #include "clang/AST/CommentCommandTraits.h"
   3 #include "clang/Basic/ConvertUTF.h"
   4 #include "llvm/ADT/StringSwitch.h"
   5 #include "llvm/Support/ErrorHandling.h"
   6
   7 namespace clang {
   8 namespace comments {
   9
  10 void Token::dump(const Lexer &L, const SourceManager &SM) const {
  11   llvm::errs() << "comments::Token Kind=" << Kind << " ";
  12   Loc.dump(SM);
  13   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
  14 }
  15
  16 namespace {
  17 bool isHTMLNamedCharacterReferenceCharacter(char C) {
  18   return (C >= 'a' && C <= 'z') ||
  19          (C >= 'A' && C <= 'Z');
  20 }
  21
  22 bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  23   return C >= '0' && C <= '9';
  24 }
  25
  26 bool isHTMLHexCharacterReferenceCharacter(char C) {
  27   return (C >= '0' && C <= '9') ||
  28          (C >= 'a' && C <= 'f') ||
  29          (C >= 'A' && C <= 'F');
  30 }
  31
  32 #include "clang/AST/CommentHTMLTags.inc"
  33
  34 } // unnamed namespace
  35
  36 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  37   return llvm::StringSwitch<StringRef>(Name)
  38       .Case("amp", "&")
  39       .Case("lt", "<")
  40       .Case("gt", ">")
  41       .Case("quot", "\"")
  42       .Case("apos", "\'")
  43       .Default("");
  44 }
  45
  46 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  47   unsigned CodePoint = 0;
  48   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  49     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
  50     CodePoint *= 10;
  51     CodePoint += Name[i] - '0';
  52   }
  53
  54   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  55   char *ResolvedPtr = Resolved;
  56   if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
  57     return StringRef(Resolved, ResolvedPtr - Resolved);
  58   else
  59     return StringRef();
  60 }
  61
  62 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  63   unsigned CodePoint = 0;
  64   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  65     CodePoint *= 16;
  66     const char C = Name[i];
  67     assert(isHTMLHexCharacterReferenceCharacter(C));
  68     if (C >= '0' && C <= '9')
  69       CodePoint += Name[i] - '0';
  70     else if (C >= 'a' && C <= 'f')
  71       CodePoint += Name[i] - 'a' + 10;
  72     else
  73       CodePoint += Name[i] - 'A' + 10;
  74   }
  75
  76   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  77   char *ResolvedPtr = Resolved;
  78   if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
  79     return StringRef(Resolved, ResolvedPtr - Resolved);
  80   else
  81     return StringRef();
  82 }
  83
  84 void Lexer::skipLineStartingDecorations() {
  85   // This function should be called only for C comments
  86   assert(CommentState == LCS_InsideCComment);
  87
  88   if (BufferPtr == CommentEnd)
  89     return;
  90
  91   switch (*BufferPtr) {
  92   case ' ':
  93   case '\t':
  94   case '\f':
  95   case '\v': {
  96     const char *NewBufferPtr = BufferPtr;
  97     NewBufferPtr++;
  98     if (NewBufferPtr == CommentEnd)
  99       return;
 100
 101     char C = *NewBufferPtr;
 102     while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
 103       NewBufferPtr++;
 104       if (NewBufferPtr == CommentEnd)
 105         return;
 106       C = *NewBufferPtr;
 107     }
 108     if (C == '*')
 109       BufferPtr = NewBufferPtr + 1;
 110     break;
 111   }
 112   case '*':
 113     BufferPtr++;
 114     break;
 115   }
 116 }
 117
 118 namespace {
 119 /// Returns pointer to the first newline character in the string.
 120 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
 121   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 122     const char C = *BufferPtr;
 123     if (C == '\n' || C == '\r')
 124       return BufferPtr;
 125   }
 126   return BufferEnd;
 127 }
 128
 129 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
 130   if (BufferPtr == BufferEnd)
 131     return BufferPtr;
 132
 133   if (*BufferPtr == '\n')
 134     BufferPtr++;
 135   else {
 136     assert(*BufferPtr == '\r');
 137     BufferPtr++;
 138     if (BufferPtr != BufferEnd && *BufferPtr == '\n')
 139       BufferPtr++;
 140   }
 141   return BufferPtr;
 142 }
 143
 144 const char *skipNamedCharacterReference(const char *BufferPtr,
 145                                         const char *BufferEnd) {
 146   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 147     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
 148       return BufferPtr;
 149   }
 150   return BufferEnd;
 151 }
 152
 153 const char *skipDecimalCharacterReference(const char *BufferPtr,
 154                                           const char *BufferEnd) {
 155   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 156     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
 157       return BufferPtr;
 158   }
 159   return BufferEnd;
 160 }
 161
 162 const char *skipHexCharacterReference(const char *BufferPtr,
 163                                           const char *BufferEnd) {
 164   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 165     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
 166       return BufferPtr;
 167   }
 168   return BufferEnd;
 169 }
 170
 171 bool isHTMLIdentifierStartingCharacter(char C) {
 172   return (C >= 'a' && C <= 'z') ||
 173          (C >= 'A' && C <= 'Z');
 174 }
 175
 176 bool isHTMLIdentifierCharacter(char C) {
 177   return (C >= 'a' && C <= 'z') ||
 178          (C >= 'A' && C <= 'Z') ||
 179          (C >= '0' && C <= '9');
 180 }
 181
 182 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
 183   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 184     if (!isHTMLIdentifierCharacter(*BufferPtr))
 185       return BufferPtr;
 186   }
 187   return BufferEnd;
 188 }
 189
 190 /// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
 191 /// string allowed.
 192 ///
 193 /// Returns pointer to closing quote.
 194 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
 195 {
 196   const char Quote = *BufferPtr;
 197   assert(Quote == '\"' || Quote == '\'');
 198
 199   BufferPtr++;
 200   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 201     const char C = *BufferPtr;
 202     if (C == Quote && BufferPtr[-1] != '\\')
 203       return BufferPtr;
 204   }
 205   return BufferEnd;
 206 }
 207
 208 bool isHorizontalWhitespace(char C) {
 209   return C == ' ' || C == '\t' || C == '\f' || C == '\v';
 210 }
 211
 212 bool isWhitespace(char C) {
 213   return C == ' ' || C == '\n' || C == '\r' ||
 214          C == '\t' || C == '\f' || C == '\v';
 215 }
 216
 217 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
 218   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 219     if (!isWhitespace(*BufferPtr))
 220       return BufferPtr;
 221   }
 222   return BufferEnd;
 223 }
 224
 225 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
 226   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
 227 }
 228
 229 bool isCommandNameStartCharacter(char C) {
 230   return (C >= 'a' && C <= 'z') ||
 231          (C >= 'A' && C <= 'Z');
 232 }
 233
 234 bool isCommandNameCharacter(char C) {
 235   return (C >= 'a' && C <= 'z') ||
 236          (C >= 'A' && C <= 'Z') ||
 237          (C >= '0' && C <= '9');
 238 }
 239
 240 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
 241   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 242     if (!isCommandNameCharacter(*BufferPtr))
 243       return BufferPtr;
 244   }
 245   return BufferEnd;
 246 }
 247
 248 /// Return the one past end pointer for BCPL comments.
 249 /// Handles newlines escaped with backslash or trigraph for backslahs.
 250 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
 251   const char *CurPtr = BufferPtr;
 252   while (CurPtr != BufferEnd) {
 253     char C = *CurPtr;
 254     while (C != '\n' && C != '\r') {
 255       CurPtr++;
 256       if (CurPtr == BufferEnd)
 257         return BufferEnd;
 258       C = *CurPtr;
 259     }
 260     // We found a newline, check if it is escaped.
 261     const char *EscapePtr = CurPtr - 1;
 262     while(isHorizontalWhitespace(*EscapePtr))
 263       EscapePtr--;
 264
 265     if (*EscapePtr == '\\' ||
 266         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
 267          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
 268       // We found an escaped newline.
 269       CurPtr = skipNewline(CurPtr, BufferEnd);
 270     } else
 271       return CurPtr; // Not an escaped newline.
 272   }
 273   return BufferEnd;
 274 }
 275
 276 /// Return the one past end pointer for C comments.
 277 /// Very dumb, does not handle escaped newlines or trigraphs.
 278 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
 279   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 280     if (*BufferPtr == '*') {
 281       assert(BufferPtr + 1 != BufferEnd);
 282       if (*(BufferPtr + 1) == '/')
 283         return BufferPtr;
 284     }
 285   }
 286   llvm_unreachable("buffer end hit before '*/' was seen");
 287 }
 288 } // unnamed namespace
 289
 290 void Lexer::lexCommentText(Token &T) {
 291   assert(CommentState == LCS_InsideBCPLComment ||
 292          CommentState == LCS_InsideCComment);
 293
 294   switch (State) {
 295   case LS_Normal:
 296     break;
 297   case LS_VerbatimBlockFirstLine:
 298     lexVerbatimBlockFirstLine(T);
 299     return;
 300   case LS_VerbatimBlockBody:
 301     lexVerbatimBlockBody(T);
 302     return;
 303   case LS_VerbatimLineText:
 304     lexVerbatimLineText(T);
 305     return;
 306   case LS_HTMLStartTag:
 307     lexHTMLStartTag(T);
 308     return;
 309   case LS_HTMLEndTag:
 310     lexHTMLEndTag(T);
 311     return;
 312   }
 313
 314   assert(State == LS_Normal);
 315
 316   const char *TokenPtr = BufferPtr;
 317   assert(TokenPtr < CommentEnd);
 318   while (TokenPtr != CommentEnd) {
 319     switch(*TokenPtr) {
 320       case '\\':
 321       case '@': {
 322         TokenPtr++;
 323         if (TokenPtr == CommentEnd) {
 324           formTextToken(T, TokenPtr);
 325           return;
 326         }
 327         char C = *TokenPtr;
 328         switch (C) {
 329         default:
 330           break;
 331
 332         case '\\': case '@': case '&': case '$':
 333         case '#':  case '<': case '>': case '%':
 334         case '\"': case '.': case ':':
 335           // This is one of \\ \@ \& \$ etc escape sequences.
 336           TokenPtr++;
 337           if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
 338             // This is the \:: escape sequence.
 339             TokenPtr++;
 340           }
 341           StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
 342           formTokenWithChars(T, TokenPtr, tok::text);
 343           T.setText(UnescapedText);
 344           return;
 345         }
 346
 347         // Don't make zero-length commands.
 348         if (!isCommandNameStartCharacter(*TokenPtr)) {
 349           formTextToken(T, TokenPtr);
 350           return;
 351         }
 352
 353         TokenPtr = skipCommandName(TokenPtr, CommentEnd);
 354         unsigned Length = TokenPtr - (BufferPtr + 1);
 355
 356         // Hardcoded support for lexing LaTeX formula commands
 357         // \f$ \f[ \f] \f{ \f} as a single command.
 358         if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
 359           C = *TokenPtr;
 360           if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
 361             TokenPtr++;
 362             Length++;
 363           }
 364         }
 365
 366         const StringRef CommandName(BufferPtr + 1, Length);
 367
 368         const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
 369         if (!Info) {
 370           formTokenWithChars(T, TokenPtr, tok::unknown_command);
 371           T.setUnknownCommandName(CommandName);
 372           return;
 373         }
 374         if (Info->IsVerbatimBlockCommand) {
 375           setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
 376           return;
 377         }
 378         if (Info->IsVerbatimLineCommand) {
 379           setupAndLexVerbatimLine(T, TokenPtr, Info);
 380           return;
 381         }
 382         formTokenWithChars(T, TokenPtr, tok::command);
 383         T.setCommandID(Info->getID());
 384         return;
 385       }
 386
 387       case '&':
 388         lexHTMLCharacterReference(T);
 389         return;
 390
 391       case '<': {
 392         TokenPtr++;
 393         if (TokenPtr == CommentEnd) {
 394           formTextToken(T, TokenPtr);
 395           return;
 396         }
 397         const char C = *TokenPtr;
 398         if (isHTMLIdentifierStartingCharacter(C))
 399           setupAndLexHTMLStartTag(T);
 400         else if (C == '/')
 401           setupAndLexHTMLEndTag(T);
 402         else
 403           formTextToken(T, TokenPtr);
 404
 405         return;
 406       }
 407
 408       case '\n':
 409       case '\r':
 410         TokenPtr = skipNewline(TokenPtr, CommentEnd);
 411         formTokenWithChars(T, TokenPtr, tok::newline);
 412
 413         if (CommentState == LCS_InsideCComment)
 414           skipLineStartingDecorations();
 415         return;
 416
 417       default: {
 418         while (true) {
 419           TokenPtr++;
 420           if (TokenPtr == CommentEnd)
 421             break;
 422           const char C = *TokenPtr;
 423           if(C == '\n' || C == '\r' ||
 424              C == '\\' || C == '@' || C == '&' || C == '<')
 425             break;
 426         }
 427         formTextToken(T, TokenPtr);
 428         return;
 429       }
 430     }
 431   }
 432 }
 433
 434 void Lexer::setupAndLexVerbatimBlock(Token &T,
 435                                      const char *TextBegin,
 436                                      char Marker, const CommandInfo *Info) {
 437   assert(Info->IsVerbatimBlockCommand);
 438
 439   VerbatimBlockEndCommandName.clear();
 440   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
 441   VerbatimBlockEndCommandName.append(Info->EndCommandName);
 442
 443   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
 444   T.setVerbatimBlockID(Info->getID());
 445
 446   // If there is a newline following the verbatim opening command, skip the
 447   // newline so that we don't create an tok::verbatim_block_line with empty
 448   // text content.
 449   if (BufferPtr != CommentEnd) {
 450     const char C = *BufferPtr;
 451     if (C == '\n' || C == '\r') {
 452       BufferPtr = skipNewline(BufferPtr, CommentEnd);
 453       State = LS_VerbatimBlockBody;
 454       return;
 455     }
 456   }
 457
 458   State = LS_VerbatimBlockFirstLine;
 459 }
 460
 461 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
 462 again:
 463   assert(BufferPtr < CommentEnd);
 464
 465   // FIXME: It would be better to scan the text once, finding either the block
 466   // end command or newline.
 467   //
 468   // Extract current line.
 469   const char *Newline = findNewline(BufferPtr, CommentEnd);
 470   StringRef Line(BufferPtr, Newline - BufferPtr);
 471
 472   // Look for end command in current line.
 473   size_t Pos = Line.find(VerbatimBlockEndCommandName);
 474   const char *TextEnd;
 475   const char *NextLine;
 476   if (Pos == StringRef::npos) {
 477     // Current line is completely verbatim.
 478     TextEnd = Newline;
 479     NextLine = skipNewline(Newline, CommentEnd);
 480   } else if (Pos == 0) {
 481     // Current line contains just an end command.
 482     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
 483     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
 484     formTokenWithChars(T, End, tok::verbatim_block_end);
 485     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
 486     State = LS_Normal;
 487     return;
 488   } else {
 489     // There is some text, followed by end command.  Extract text first.
 490     TextEnd = BufferPtr + Pos;
 491     NextLine = TextEnd;
 492     // If there is only whitespace before end command, skip whitespace.
 493     if (isWhitespace(BufferPtr, TextEnd)) {
 494       BufferPtr = TextEnd;
 495       goto again;
 496     }
 497   }
 498
 499   StringRef Text(BufferPtr, TextEnd - BufferPtr);
 500   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
 501   T.setVerbatimBlockText(Text);
 502
 503   State = LS_VerbatimBlockBody;
 504 }
 505
 506 void Lexer::lexVerbatimBlockBody(Token &T) {
 507   assert(State == LS_VerbatimBlockBody);
 508
 509   if (CommentState == LCS_InsideCComment)
 510     skipLineStartingDecorations();
 511
 512   lexVerbatimBlockFirstLine(T);
 513 }
 514
 515 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
 516                                     const CommandInfo *Info) {
 517   assert(Info->IsVerbatimLineCommand);
 518   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
 519   T.setVerbatimLineID(Info->getID());
 520
 521   State = LS_VerbatimLineText;
 522 }
 523
 524 void Lexer::lexVerbatimLineText(Token &T) {
 525   assert(State == LS_VerbatimLineText);
 526
 527   // Extract current line.
 528   const char *Newline = findNewline(BufferPtr, CommentEnd);
 529   const StringRef Text(BufferPtr, Newline - BufferPtr);
 530   formTokenWithChars(T, Newline, tok::verbatim_line_text);
 531   T.setVerbatimLineText(Text);
 532
 533   State = LS_Normal;
 534 }
 535
 536 void Lexer::lexHTMLCharacterReference(Token &T) {
 537   const char *TokenPtr = BufferPtr;
 538   assert(*TokenPtr == '&');
 539   TokenPtr++;
 540   if (TokenPtr == CommentEnd) {
 541     formTextToken(T, TokenPtr);
 542     return;
 543   }
 544   const char *NamePtr;
 545   bool isNamed = false;
 546   bool isDecimal = false;
 547   char C = *TokenPtr;
 548   if (isHTMLNamedCharacterReferenceCharacter(C)) {
 549     NamePtr = TokenPtr;
 550     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
 551     isNamed = true;
 552   } else if (C == '#') {
 553     TokenPtr++;
 554     if (TokenPtr == CommentEnd) {
 555       formTextToken(T, TokenPtr);
 556       return;
 557     }
 558     C = *TokenPtr;
 559     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
 560       NamePtr = TokenPtr;
 561       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
 562       isDecimal = true;
 563     } else if (C == 'x' || C == 'X') {
 564       TokenPtr++;
 565       NamePtr = TokenPtr;
 566       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
 567     } else {
 568       formTextToken(T, TokenPtr);
 569       return;
 570     }
 571   } else {
 572     formTextToken(T, TokenPtr);
 573     return;
 574   }
 575   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
 576       *TokenPtr != ';') {
 577     formTextToken(T, TokenPtr);
 578     return;
 579   }
 580   StringRef Name(NamePtr, TokenPtr - NamePtr);
 581   TokenPtr++; // Skip semicolon.
 582   StringRef Resolved;
 583   if (isNamed)
 584     Resolved = resolveHTMLNamedCharacterReference(Name);
 585   else if (isDecimal)
 586     Resolved = resolveHTMLDecimalCharacterReference(Name);
 587   else
 588     Resolved = resolveHTMLHexCharacterReference(Name);
 589
 590   if (Resolved.empty()) {
 591     formTextToken(T, TokenPtr);
 592     return;
 593   }
 594   formTokenWithChars(T, TokenPtr, tok::text);
 595   T.setText(Resolved);
 596   return;
 597 }
 598
 599 void Lexer::setupAndLexHTMLStartTag(Token &T) {
 600   assert(BufferPtr[0] == '<' &&
 601          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
 602   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
 603   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
 604   if (!isHTMLTagName(Name)) {
 605     formTextToken(T, TagNameEnd);
 606     return;
 607   }
 608
 609   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
 610   T.setHTMLTagStartName(Name);
 611
 612   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
 613
 614   const char C = *BufferPtr;
 615   if (BufferPtr != CommentEnd &&
 616       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
 617     State = LS_HTMLStartTag;
 618 }
 619
 620 void Lexer::lexHTMLStartTag(Token &T) {
 621   assert(State == LS_HTMLStartTag);
 622
 623   const char *TokenPtr = BufferPtr;
 624   char C = *TokenPtr;
 625   if (isHTMLIdentifierCharacter(C)) {
 626     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
 627     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
 628     formTokenWithChars(T, TokenPtr, tok::html_ident);
 629     T.setHTMLIdent(Ident);
 630   } else {
 631     switch (C) {
 632     case '=':
 633       TokenPtr++;
 634       formTokenWithChars(T, TokenPtr, tok::html_equals);
 635       break;
 636     case '\"':
 637     case '\'': {
 638       const char *OpenQuote = TokenPtr;
 639       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
 640       const char *ClosingQuote = TokenPtr;
 641       if (TokenPtr != CommentEnd) // Skip closing quote.
 642         TokenPtr++;
 643       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
 644       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
 645                                       ClosingQuote - (OpenQuote + 1)));
 646       break;
 647     }
 648     case '>':
 649       TokenPtr++;
 650       formTokenWithChars(T, TokenPtr, tok::html_greater);
 651       State = LS_Normal;
 652       return;
 653     case '/':
 654       TokenPtr++;
 655       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
 656         TokenPtr++;
 657         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
 658       } else
 659         formTextToken(T, TokenPtr);
 660
 661       State = LS_Normal;
 662       return;
 663     }
 664   }
 665
 666   // Now look ahead and return to normal state if we don't see any HTML tokens
 667   // ahead.
 668   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
 669   if (BufferPtr == CommentEnd) {
 670     State = LS_Normal;
 671     return;
 672   }
 673
 674   C = *BufferPtr;
 675   if (!isHTMLIdentifierStartingCharacter(C) &&
 676       C != '=' && C != '\"' && C != '\'' && C != '>') {
 677     State = LS_Normal;
 678     return;
 679   }
 680 }
 681
 682 void Lexer::setupAndLexHTMLEndTag(Token &T) {
 683   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
 684
 685   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
 686   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
 687   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
 688   if (!isHTMLTagName(Name)) {
 689     formTextToken(T, TagNameEnd);
 690     return;
 691   }
 692
 693   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
 694
 695   formTokenWithChars(T, End, tok::html_end_tag);
 696   T.setHTMLTagEndName(Name);
 697
 698   if (BufferPtr != CommentEnd && *BufferPtr == '>')
 699     State = LS_HTMLEndTag;
 700 }
 701
 702 void Lexer::lexHTMLEndTag(Token &T) {
 703   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
 704
 705   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
 706   State = LS_Normal;
 707 }
 708
 709 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
 710              SourceLocation FileLoc,
 711              const char *BufferStart, const char *BufferEnd):
 712     Allocator(Allocator), Traits(Traits),
 713     BufferStart(BufferStart), BufferEnd(BufferEnd),
 714     FileLoc(FileLoc), BufferPtr(BufferStart),
 715     CommentState(LCS_BeforeComment), State(LS_Normal) {
 716 }
 717
 718 void Lexer::lex(Token &T) {
 719 again:
 720   switch (CommentState) {
 721   case LCS_BeforeComment:
 722     if (BufferPtr == BufferEnd) {
 723       formTokenWithChars(T, BufferPtr, tok::eof);
 724       return;
 725     }
 726
 727     assert(*BufferPtr == '/');
 728     BufferPtr++; // Skip first slash.
 729     switch(*BufferPtr) {
 730     case '/': { // BCPL comment.
 731       BufferPtr++; // Skip second slash.
 732
 733       if (BufferPtr != BufferEnd) {
 734         // Skip Doxygen magic marker, if it is present.
 735         // It might be missing because of a typo //< or /*<, or because we
 736         // merged this non-Doxygen comment into a bunch of Doxygen comments
 737         // around it: /** ... */ /* ... */ /** ... */
 738         const char C = *BufferPtr;
 739         if (C == '/' || C == '!')
 740           BufferPtr++;
 741       }
 742
 743       // Skip less-than symbol that marks trailing comments.
 744       // Skip it even if the comment is not a Doxygen one, because //< and /*<
 745       // are frequent typos.
 746       if (BufferPtr != BufferEnd && *BufferPtr == '<')
 747         BufferPtr++;
 748
 749       CommentState = LCS_InsideBCPLComment;
 750       if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
 751         State = LS_Normal;
 752       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
 753       goto again;
 754     }
 755     case '*': { // C comment.
 756       BufferPtr++; // Skip star.
 757
 758       // Skip Doxygen magic marker.
 759       const char C = *BufferPtr;
 760       if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
 761         BufferPtr++;
 762
 763       // Skip less-than symbol that marks trailing comments.
 764       if (BufferPtr != BufferEnd && *BufferPtr == '<')
 765         BufferPtr++;
 766
 767       CommentState = LCS_InsideCComment;
 768       State = LS_Normal;
 769       CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
 770       goto again;
 771     }
 772     default:
 773       llvm_unreachable("second character of comment should be '/' or '*'");
 774     }
 775
 776   case LCS_BetweenComments: {
 777     // Consecutive comments are extracted only if there is only whitespace
 778     // between them.  So we can search for the start of the next comment.
 779     const char *EndWhitespace = BufferPtr;
 780     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
 781       EndWhitespace++;
 782
 783     // Turn any whitespace between comments (and there is only whitespace
 784     // between them -- guaranteed by comment extraction) into a newline.  We
 785     // have two newlines between C comments in total (first one was synthesized
 786     // after a comment).
 787     formTokenWithChars(T, EndWhitespace, tok::newline);
 788
 789     CommentState = LCS_BeforeComment;
 790     break;
 791   }
 792
 793   case LCS_InsideBCPLComment:
 794   case LCS_InsideCComment:
 795     if (BufferPtr != CommentEnd) {
 796       lexCommentText(T);
 797       break;
 798     } else {
 799       // Skip C comment closing sequence.
 800       if (CommentState == LCS_InsideCComment) {
 801         assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
 802         BufferPtr += 2;
 803         assert(BufferPtr <= BufferEnd);
 804
 805         // Synthenize newline just after the C comment, regardless if there is
 806         // actually a newline.
 807         formTokenWithChars(T, BufferPtr, tok::newline);
 808
 809         CommentState = LCS_BetweenComments;
 810         break;
 811       } else {
 812         // Don't synthesized a newline after BCPL comment.
 813         CommentState = LCS_BetweenComments;
 814         goto again;
 815       }
 816     }
 817   }
 818 }
 819
 820 StringRef Lexer::getSpelling(const Token &Tok,
 821                              const SourceManager &SourceMgr,
 822                              bool *Invalid) const {
 823   SourceLocation Loc = Tok.getLocation();
 824   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
 825
 826   bool InvalidTemp = false;
 827   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
 828   if (InvalidTemp) {
 829     *Invalid = true;
 830     return StringRef();
 831   }
 832
 833   const char *Begin = File.data() + LocInfo.second;
 834   return StringRef(Begin, Tok.getLength());
 835 }
 836
 837 } // end namespace comments
 838 } // end namespace clang
 839