contrib/llvm/tools/clang/lib/AST/CommentLexer.cpp

   1 #include "clang/AST/CommentLexer.h"
   2 #include "clang/AST/CommentCommandTraits.h"
   3 #include "clang/AST/CommentDiagnostic.h"
   4 #include "clang/Basic/CharInfo.h"
   5 #include "llvm/ADT/StringExtras.h"
   6 #include "llvm/ADT/StringSwitch.h"
   7 #include "llvm/Support/ConvertUTF.h"
   8 #include "llvm/Support/ErrorHandling.h"
   9
  10 namespace clang {
  11 namespace comments {
  12
  13 void Token::dump(const Lexer &L, const SourceManager &SM) const {
  14   llvm::errs() << "comments::Token Kind=" << Kind << " ";
  15   Loc.dump(SM);
  16   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
  17 }
  18
  19 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  20   return isLetter(C);
  21 }
  22
  23 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  24   return isDigit(C);
  25 }
  26
  27 static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  28   return isHexDigit(C);
  29 }
  30
  31 static inline StringRef convertCodePointToUTF8(
  32                                       llvm::BumpPtrAllocator &Allocator,
  33                                       unsigned CodePoint) {
  34   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  35   char *ResolvedPtr = Resolved;
  36   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
  37     return StringRef(Resolved, ResolvedPtr - Resolved);
  38   else
  39     return StringRef();
  40 }
  41
  42 namespace {
  43
  44 #include "clang/AST/CommentHTMLTags.inc"
  45 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
  46
  47 } // unnamed namespace
  48
  49 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  50   // Fast path, first check a few most widely used named character references.
  51   return llvm::StringSwitch<StringRef>(Name)
  52       .Case("amp", "&")
  53       .Case("lt", "<")
  54       .Case("gt", ">")
  55       .Case("quot", "\"")
  56       .Case("apos", "\'")
  57       // Slow path.
  58       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
  59 }
  60
  61 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  62   unsigned CodePoint = 0;
  63   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  64     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
  65     CodePoint *= 10;
  66     CodePoint += Name[i] - '0';
  67   }
  68   return convertCodePointToUTF8(Allocator, CodePoint);
  69 }
  70
  71 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  72   unsigned CodePoint = 0;
  73   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  74     CodePoint *= 16;
  75     const char C = Name[i];
  76     assert(isHTMLHexCharacterReferenceCharacter(C));
  77     CodePoint += llvm::hexDigitValue(C);
  78   }
  79   return convertCodePointToUTF8(Allocator, CodePoint);
  80 }
  81
  82 void Lexer::skipLineStartingDecorations() {
  83   // This function should be called only for C comments
  84   assert(CommentState == LCS_InsideCComment);
  85
  86   if (BufferPtr == CommentEnd)
  87     return;
  88
  89   switch (*BufferPtr) {
  90   case ' ':
  91   case '\t':
  92   case '\f':
  93   case '\v': {
  94     const char *NewBufferPtr = BufferPtr;
  95     NewBufferPtr++;
  96     if (NewBufferPtr == CommentEnd)
  97       return;
  98
  99     char C = *NewBufferPtr;
 100     while (isHorizontalWhitespace(C)) {
 101       NewBufferPtr++;
 102       if (NewBufferPtr == CommentEnd)
 103         return;
 104       C = *NewBufferPtr;
 105     }
 106     if (C == '*')
 107       BufferPtr = NewBufferPtr + 1;
 108     break;
 109   }
 110   case '*':
 111     BufferPtr++;
 112     break;
 113   }
 114 }
 115
 116 namespace {
 117 /// Returns pointer to the first newline character in the string.
 118 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
 119   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 120     if (isVerticalWhitespace(*BufferPtr))
 121       return BufferPtr;
 122   }
 123   return BufferEnd;
 124 }
 125
 126 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
 127   if (BufferPtr == BufferEnd)
 128     return BufferPtr;
 129
 130   if (*BufferPtr == '\n')
 131     BufferPtr++;
 132   else {
 133     assert(*BufferPtr == '\r');
 134     BufferPtr++;
 135     if (BufferPtr != BufferEnd && *BufferPtr == '\n')
 136       BufferPtr++;
 137   }
 138   return BufferPtr;
 139 }
 140
 141 const char *skipNamedCharacterReference(const char *BufferPtr,
 142                                         const char *BufferEnd) {
 143   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 144     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
 145       return BufferPtr;
 146   }
 147   return BufferEnd;
 148 }
 149
 150 const char *skipDecimalCharacterReference(const char *BufferPtr,
 151                                           const char *BufferEnd) {
 152   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 153     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
 154       return BufferPtr;
 155   }
 156   return BufferEnd;
 157 }
 158
 159 const char *skipHexCharacterReference(const char *BufferPtr,
 160                                       const char *BufferEnd) {
 161   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 162     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
 163       return BufferPtr;
 164   }
 165   return BufferEnd;
 166 }
 167
 168 bool isHTMLIdentifierStartingCharacter(char C) {
 169   return isLetter(C);
 170 }
 171
 172 bool isHTMLIdentifierCharacter(char C) {
 173   return isAlphanumeric(C);
 174 }
 175
 176 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
 177   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 178     if (!isHTMLIdentifierCharacter(*BufferPtr))
 179       return BufferPtr;
 180   }
 181   return BufferEnd;
 182 }
 183
 184 /// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
 185 /// string allowed.
 186 ///
 187 /// Returns pointer to closing quote.
 188 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
 189 {
 190   const char Quote = *BufferPtr;
 191   assert(Quote == '\"' || Quote == '\'');
 192
 193   BufferPtr++;
 194   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 195     const char C = *BufferPtr;
 196     if (C == Quote && BufferPtr[-1] != '\\')
 197       return BufferPtr;
 198   }
 199   return BufferEnd;
 200 }
 201
 202 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
 203   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 204     if (!isWhitespace(*BufferPtr))
 205       return BufferPtr;
 206   }
 207   return BufferEnd;
 208 }
 209
 210 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
 211   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
 212 }
 213
 214 bool isCommandNameStartCharacter(char C) {
 215   return isLetter(C);
 216 }
 217
 218 bool isCommandNameCharacter(char C) {
 219   return isAlphanumeric(C);
 220 }
 221
 222 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
 223   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 224     if (!isCommandNameCharacter(*BufferPtr))
 225       return BufferPtr;
 226   }
 227   return BufferEnd;
 228 }
 229
 230 /// Return the one past end pointer for BCPL comments.
 231 /// Handles newlines escaped with backslash or trigraph for backslahs.
 232 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
 233   const char *CurPtr = BufferPtr;
 234   while (CurPtr != BufferEnd) {
 235     while (!isVerticalWhitespace(*CurPtr)) {
 236       CurPtr++;
 237       if (CurPtr == BufferEnd)
 238         return BufferEnd;
 239     }
 240     // We found a newline, check if it is escaped.
 241     const char *EscapePtr = CurPtr - 1;
 242     while(isHorizontalWhitespace(*EscapePtr))
 243       EscapePtr--;
 244
 245     if (*EscapePtr == '\\' ||
 246         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
 247          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
 248       // We found an escaped newline.
 249       CurPtr = skipNewline(CurPtr, BufferEnd);
 250     } else
 251       return CurPtr; // Not an escaped newline.
 252   }
 253   return BufferEnd;
 254 }
 255
 256 /// Return the one past end pointer for C comments.
 257 /// Very dumb, does not handle escaped newlines or trigraphs.
 258 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
 259   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
 260     if (*BufferPtr == '*') {
 261       assert(BufferPtr + 1 != BufferEnd);
 262       if (*(BufferPtr + 1) == '/')
 263         return BufferPtr;
 264     }
 265   }
 266   llvm_unreachable("buffer end hit before '*/' was seen");
 267 }
 268
 269 } // unnamed namespace
 270
 271 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
 272                                tok::TokenKind Kind) {
 273   const unsigned TokLen = TokEnd - BufferPtr;
 274   Result.setLocation(getSourceLocation(BufferPtr));
 275   Result.setKind(Kind);
 276   Result.setLength(TokLen);
 277 #ifndef NDEBUG
 278   Result.TextPtr = "<UNSET>";
 279   Result.IntVal = 7;
 280 #endif
 281   BufferPtr = TokEnd;
 282 }
 283
 284 void Lexer::lexCommentText(Token &T) {
 285   assert(CommentState == LCS_InsideBCPLComment ||
 286          CommentState == LCS_InsideCComment);
 287
 288   switch (State) {
 289   case LS_Normal:
 290     break;
 291   case LS_VerbatimBlockFirstLine:
 292     lexVerbatimBlockFirstLine(T);
 293     return;
 294   case LS_VerbatimBlockBody:
 295     lexVerbatimBlockBody(T);
 296     return;
 297   case LS_VerbatimLineText:
 298     lexVerbatimLineText(T);
 299     return;
 300   case LS_HTMLStartTag:
 301     lexHTMLStartTag(T);
 302     return;
 303   case LS_HTMLEndTag:
 304     lexHTMLEndTag(T);
 305     return;
 306   }
 307
 308   assert(State == LS_Normal);
 309
 310   const char *TokenPtr = BufferPtr;
 311   assert(TokenPtr < CommentEnd);
 312   while (TokenPtr != CommentEnd) {
 313     switch(*TokenPtr) {
 314       case '\\':
 315       case '@': {
 316         // Commands that start with a backslash and commands that start with
 317         // 'at' have equivalent semantics.  But we keep information about the
 318         // exact syntax in AST for comments.
 319         tok::TokenKind CommandKind =
 320             (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
 321         TokenPtr++;
 322         if (TokenPtr == CommentEnd) {
 323           formTextToken(T, TokenPtr);
 324           return;
 325         }
 326         char C = *TokenPtr;
 327         switch (C) {
 328         default:
 329           break;
 330
 331         case '\\': case '@': case '&': case '$':
 332         case '#':  case '<': case '>': case '%':
 333         case '\"': case '.': case ':':
 334           // This is one of \\ \@ \& \$ etc escape sequences.
 335           TokenPtr++;
 336           if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
 337             // This is the \:: escape sequence.
 338             TokenPtr++;
 339           }
 340           StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
 341           formTokenWithChars(T, TokenPtr, tok::text);
 342           T.setText(UnescapedText);
 343           return;
 344         }
 345
 346         // Don't make zero-length commands.
 347         if (!isCommandNameStartCharacter(*TokenPtr)) {
 348           formTextToken(T, TokenPtr);
 349           return;
 350         }
 351
 352         TokenPtr = skipCommandName(TokenPtr, CommentEnd);
 353         unsigned Length = TokenPtr - (BufferPtr + 1);
 354
 355         // Hardcoded support for lexing LaTeX formula commands
 356         // \f$ \f[ \f] \f{ \f} as a single command.
 357         if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
 358           C = *TokenPtr;
 359           if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
 360             TokenPtr++;
 361             Length++;
 362           }
 363         }
 364
 365         StringRef CommandName(BufferPtr + 1, Length);
 366
 367         const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
 368         if (!Info) {
 369           if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
 370             StringRef CorrectedName = Info->Name;
 371             SourceLocation Loc = getSourceLocation(BufferPtr);
 372             SourceRange CommandRange(Loc.getLocWithOffset(1),
 373                                      getSourceLocation(TokenPtr));
 374             Diag(Loc, diag::warn_correct_comment_command_name)
 375               << CommandName << CorrectedName
 376               << FixItHint::CreateReplacement(CommandRange, CorrectedName);
 377           } else {
 378             formTokenWithChars(T, TokenPtr, tok::unknown_command);
 379             T.setUnknownCommandName(CommandName);
 380             Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
 381             return;
 382           }
 383         }
 384         if (Info->IsVerbatimBlockCommand) {
 385           setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
 386           return;
 387         }
 388         if (Info->IsVerbatimLineCommand) {
 389           setupAndLexVerbatimLine(T, TokenPtr, Info);
 390           return;
 391         }
 392         formTokenWithChars(T, TokenPtr, CommandKind);
 393         T.setCommandID(Info->getID());
 394         return;
 395       }
 396
 397       case '&':
 398         lexHTMLCharacterReference(T);
 399         return;
 400
 401       case '<': {
 402         TokenPtr++;
 403         if (TokenPtr == CommentEnd) {
 404           formTextToken(T, TokenPtr);
 405           return;
 406         }
 407         const char C = *TokenPtr;
 408         if (isHTMLIdentifierStartingCharacter(C))
 409           setupAndLexHTMLStartTag(T);
 410         else if (C == '/')
 411           setupAndLexHTMLEndTag(T);
 412         else
 413           formTextToken(T, TokenPtr);
 414
 415         return;
 416       }
 417
 418       case '\n':
 419       case '\r':
 420         TokenPtr = skipNewline(TokenPtr, CommentEnd);
 421         formTokenWithChars(T, TokenPtr, tok::newline);
 422
 423         if (CommentState == LCS_InsideCComment)
 424           skipLineStartingDecorations();
 425         return;
 426
 427       default: {
 428         size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
 429                          find_first_of("\n\r\\@&<");
 430         if (End != StringRef::npos)
 431           TokenPtr += End;
 432         else
 433           TokenPtr = CommentEnd;
 434         formTextToken(T, TokenPtr);
 435         return;
 436       }
 437     }
 438   }
 439 }
 440
 441 void Lexer::setupAndLexVerbatimBlock(Token &T,
 442                                      const char *TextBegin,
 443                                      char Marker, const CommandInfo *Info) {
 444   assert(Info->IsVerbatimBlockCommand);
 445
 446   VerbatimBlockEndCommandName.clear();
 447   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
 448   VerbatimBlockEndCommandName.append(Info->EndCommandName);
 449
 450   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
 451   T.setVerbatimBlockID(Info->getID());
 452
 453   // If there is a newline following the verbatim opening command, skip the
 454   // newline so that we don't create an tok::verbatim_block_line with empty
 455   // text content.
 456   if (BufferPtr != CommentEnd &&
 457       isVerticalWhitespace(*BufferPtr)) {
 458     BufferPtr = skipNewline(BufferPtr, CommentEnd);
 459     State = LS_VerbatimBlockBody;
 460     return;
 461   }
 462
 463   State = LS_VerbatimBlockFirstLine;
 464 }
 465
 466 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
 467 again:
 468   assert(BufferPtr < CommentEnd);
 469
 470   // FIXME: It would be better to scan the text once, finding either the block
 471   // end command or newline.
 472   //
 473   // Extract current line.
 474   const char *Newline = findNewline(BufferPtr, CommentEnd);
 475   StringRef Line(BufferPtr, Newline - BufferPtr);
 476
 477   // Look for end command in current line.
 478   size_t Pos = Line.find(VerbatimBlockEndCommandName);
 479   const char *TextEnd;
 480   const char *NextLine;
 481   if (Pos == StringRef::npos) {
 482     // Current line is completely verbatim.
 483     TextEnd = Newline;
 484     NextLine = skipNewline(Newline, CommentEnd);
 485   } else if (Pos == 0) {
 486     // Current line contains just an end command.
 487     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
 488     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
 489     formTokenWithChars(T, End, tok::verbatim_block_end);
 490     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
 491     State = LS_Normal;
 492     return;
 493   } else {
 494     // There is some text, followed by end command.  Extract text first.
 495     TextEnd = BufferPtr + Pos;
 496     NextLine = TextEnd;
 497     // If there is only whitespace before end command, skip whitespace.
 498     if (isWhitespace(BufferPtr, TextEnd)) {
 499       BufferPtr = TextEnd;
 500       goto again;
 501     }
 502   }
 503
 504   StringRef Text(BufferPtr, TextEnd - BufferPtr);
 505   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
 506   T.setVerbatimBlockText(Text);
 507
 508   State = LS_VerbatimBlockBody;
 509 }
 510
 511 void Lexer::lexVerbatimBlockBody(Token &T) {
 512   assert(State == LS_VerbatimBlockBody);
 513
 514   if (CommentState == LCS_InsideCComment)
 515     skipLineStartingDecorations();
 516
 517   if (BufferPtr == CommentEnd) {
 518     formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
 519     T.setVerbatimBlockText("");
 520     return;
 521   }
 522
 523   lexVerbatimBlockFirstLine(T);
 524 }
 525
 526 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
 527                                     const CommandInfo *Info) {
 528   assert(Info->IsVerbatimLineCommand);
 529   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
 530   T.setVerbatimLineID(Info->getID());
 531
 532   State = LS_VerbatimLineText;
 533 }
 534
 535 void Lexer::lexVerbatimLineText(Token &T) {
 536   assert(State == LS_VerbatimLineText);
 537
 538   // Extract current line.
 539   const char *Newline = findNewline(BufferPtr, CommentEnd);
 540   StringRef Text(BufferPtr, Newline - BufferPtr);
 541   formTokenWithChars(T, Newline, tok::verbatim_line_text);
 542   T.setVerbatimLineText(Text);
 543
 544   State = LS_Normal;
 545 }
 546
 547 void Lexer::lexHTMLCharacterReference(Token &T) {
 548   const char *TokenPtr = BufferPtr;
 549   assert(*TokenPtr == '&');
 550   TokenPtr++;
 551   if (TokenPtr == CommentEnd) {
 552     formTextToken(T, TokenPtr);
 553     return;
 554   }
 555   const char *NamePtr;
 556   bool isNamed = false;
 557   bool isDecimal = false;
 558   char C = *TokenPtr;
 559   if (isHTMLNamedCharacterReferenceCharacter(C)) {
 560     NamePtr = TokenPtr;
 561     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
 562     isNamed = true;
 563   } else if (C == '#') {
 564     TokenPtr++;
 565     if (TokenPtr == CommentEnd) {
 566       formTextToken(T, TokenPtr);
 567       return;
 568     }
 569     C = *TokenPtr;
 570     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
 571       NamePtr = TokenPtr;
 572       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
 573       isDecimal = true;
 574     } else if (C == 'x' || C == 'X') {
 575       TokenPtr++;
 576       NamePtr = TokenPtr;
 577       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
 578     } else {
 579       formTextToken(T, TokenPtr);
 580       return;
 581     }
 582   } else {
 583     formTextToken(T, TokenPtr);
 584     return;
 585   }
 586   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
 587       *TokenPtr != ';') {
 588     formTextToken(T, TokenPtr);
 589     return;
 590   }
 591   StringRef Name(NamePtr, TokenPtr - NamePtr);
 592   TokenPtr++; // Skip semicolon.
 593   StringRef Resolved;
 594   if (isNamed)
 595     Resolved = resolveHTMLNamedCharacterReference(Name);
 596   else if (isDecimal)
 597     Resolved = resolveHTMLDecimalCharacterReference(Name);
 598   else
 599     Resolved = resolveHTMLHexCharacterReference(Name);
 600
 601   if (Resolved.empty()) {
 602     formTextToken(T, TokenPtr);
 603     return;
 604   }
 605   formTokenWithChars(T, TokenPtr, tok::text);
 606   T.setText(Resolved);
 607   return;
 608 }
 609
 610 void Lexer::setupAndLexHTMLStartTag(Token &T) {
 611   assert(BufferPtr[0] == '<' &&
 612          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
 613   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
 614   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
 615   if (!isHTMLTagName(Name)) {
 616     formTextToken(T, TagNameEnd);
 617     return;
 618   }
 619
 620   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
 621   T.setHTMLTagStartName(Name);
 622
 623   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
 624
 625   const char C = *BufferPtr;
 626   if (BufferPtr != CommentEnd &&
 627       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
 628     State = LS_HTMLStartTag;
 629 }
 630
 631 void Lexer::lexHTMLStartTag(Token &T) {
 632   assert(State == LS_HTMLStartTag);
 633
 634   const char *TokenPtr = BufferPtr;
 635   char C = *TokenPtr;
 636   if (isHTMLIdentifierCharacter(C)) {
 637     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
 638     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
 639     formTokenWithChars(T, TokenPtr, tok::html_ident);
 640     T.setHTMLIdent(Ident);
 641   } else {
 642     switch (C) {
 643     case '=':
 644       TokenPtr++;
 645       formTokenWithChars(T, TokenPtr, tok::html_equals);
 646       break;
 647     case '\"':
 648     case '\'': {
 649       const char *OpenQuote = TokenPtr;
 650       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
 651       const char *ClosingQuote = TokenPtr;
 652       if (TokenPtr != CommentEnd) // Skip closing quote.
 653         TokenPtr++;
 654       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
 655       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
 656                                       ClosingQuote - (OpenQuote + 1)));
 657       break;
 658     }
 659     case '>':
 660       TokenPtr++;
 661       formTokenWithChars(T, TokenPtr, tok::html_greater);
 662       State = LS_Normal;
 663       return;
 664     case '/':
 665       TokenPtr++;
 666       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
 667         TokenPtr++;
 668         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
 669       } else
 670         formTextToken(T, TokenPtr);
 671
 672       State = LS_Normal;
 673       return;
 674     }
 675   }
 676
 677   // Now look ahead and return to normal state if we don't see any HTML tokens
 678   // ahead.
 679   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
 680   if (BufferPtr == CommentEnd) {
 681     State = LS_Normal;
 682     return;
 683   }
 684
 685   C = *BufferPtr;
 686   if (!isHTMLIdentifierStartingCharacter(C) &&
 687       C != '=' && C != '\"' && C != '\'' && C != '>') {
 688     State = LS_Normal;
 689     return;
 690   }
 691 }
 692
 693 void Lexer::setupAndLexHTMLEndTag(Token &T) {
 694   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
 695
 696   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
 697   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
 698   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
 699   if (!isHTMLTagName(Name)) {
 700     formTextToken(T, TagNameEnd);
 701     return;
 702   }
 703
 704   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
 705
 706   formTokenWithChars(T, End, tok::html_end_tag);
 707   T.setHTMLTagEndName(Name);
 708
 709   if (BufferPtr != CommentEnd && *BufferPtr == '>')
 710     State = LS_HTMLEndTag;
 711 }
 712
 713 void Lexer::lexHTMLEndTag(Token &T) {
 714   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
 715
 716   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
 717   State = LS_Normal;
 718 }
 719
 720 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
 721              const CommandTraits &Traits,
 722              SourceLocation FileLoc,
 723              const char *BufferStart, const char *BufferEnd):
 724     Allocator(Allocator), Diags(Diags), Traits(Traits),
 725     BufferStart(BufferStart), BufferEnd(BufferEnd),
 726     FileLoc(FileLoc), BufferPtr(BufferStart),
 727     CommentState(LCS_BeforeComment), State(LS_Normal) {
 728 }
 729
 730 void Lexer::lex(Token &T) {
 731 again:
 732   switch (CommentState) {
 733   case LCS_BeforeComment:
 734     if (BufferPtr == BufferEnd) {
 735       formTokenWithChars(T, BufferPtr, tok::eof);
 736       return;
 737     }
 738
 739     assert(*BufferPtr == '/');
 740     BufferPtr++; // Skip first slash.
 741     switch(*BufferPtr) {
 742     case '/': { // BCPL comment.
 743       BufferPtr++; // Skip second slash.
 744
 745       if (BufferPtr != BufferEnd) {
 746         // Skip Doxygen magic marker, if it is present.
 747         // It might be missing because of a typo //< or /*<, or because we
 748         // merged this non-Doxygen comment into a bunch of Doxygen comments
 749         // around it: /** ... */ /* ... */ /** ... */
 750         const char C = *BufferPtr;
 751         if (C == '/' || C == '!')
 752           BufferPtr++;
 753       }
 754
 755       // Skip less-than symbol that marks trailing comments.
 756       // Skip it even if the comment is not a Doxygen one, because //< and /*<
 757       // are frequent typos.
 758       if (BufferPtr != BufferEnd && *BufferPtr == '<')
 759         BufferPtr++;
 760
 761       CommentState = LCS_InsideBCPLComment;
 762       if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
 763         State = LS_Normal;
 764       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
 765       goto again;
 766     }
 767     case '*': { // C comment.
 768       BufferPtr++; // Skip star.
 769
 770       // Skip Doxygen magic marker.
 771       const char C = *BufferPtr;
 772       if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
 773         BufferPtr++;
 774
 775       // Skip less-than symbol that marks trailing comments.
 776       if (BufferPtr != BufferEnd && *BufferPtr == '<')
 777         BufferPtr++;
 778
 779       CommentState = LCS_InsideCComment;
 780       State = LS_Normal;
 781       CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
 782       goto again;
 783     }
 784     default:
 785       llvm_unreachable("second character of comment should be '/' or '*'");
 786     }
 787
 788   case LCS_BetweenComments: {
 789     // Consecutive comments are extracted only if there is only whitespace
 790     // between them.  So we can search for the start of the next comment.
 791     const char *EndWhitespace = BufferPtr;
 792     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
 793       EndWhitespace++;
 794
 795     // Turn any whitespace between comments (and there is only whitespace
 796     // between them -- guaranteed by comment extraction) into a newline.  We
 797     // have two newlines between C comments in total (first one was synthesized
 798     // after a comment).
 799     formTokenWithChars(T, EndWhitespace, tok::newline);
 800
 801     CommentState = LCS_BeforeComment;
 802     break;
 803   }
 804
 805   case LCS_InsideBCPLComment:
 806   case LCS_InsideCComment:
 807     if (BufferPtr != CommentEnd) {
 808       lexCommentText(T);
 809       break;
 810     } else {
 811       // Skip C comment closing sequence.
 812       if (CommentState == LCS_InsideCComment) {
 813         assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
 814         BufferPtr += 2;
 815         assert(BufferPtr <= BufferEnd);
 816
 817         // Synthenize newline just after the C comment, regardless if there is
 818         // actually a newline.
 819         formTokenWithChars(T, BufferPtr, tok::newline);
 820
 821         CommentState = LCS_BetweenComments;
 822         break;
 823       } else {
 824         // Don't synthesized a newline after BCPL comment.
 825         CommentState = LCS_BetweenComments;
 826         goto again;
 827       }
 828     }
 829   }
 830 }
 831
 832 StringRef Lexer::getSpelling(const Token &Tok,
 833                              const SourceManager &SourceMgr,
 834                              bool *Invalid) const {
 835   SourceLocation Loc = Tok.getLocation();
 836   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
 837
 838   bool InvalidTemp = false;
 839   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
 840   if (InvalidTemp) {
 841     *Invalid = true;
 842     return StringRef();
 843   }
 844
 845   const char *Begin = File.data() + LocInfo.second;
 846   return StringRef(Begin, Tok.getLength());
 847 }
 848
 849 } // end namespace comments
 850 } // end namespace clang
 851