1 #include "clang/AST/CommentLexer.h"
2 #include "clang/AST/CommentCommandTraits.h"
3 #include "clang/Basic/ConvertUTF.h"
4 #include "llvm/ADT/StringSwitch.h"
5 #include "llvm/Support/ErrorHandling.h"
/// Debug helper: writes a one-line description of this token to llvm::errs()
/// -- its Kind, its Length and its spelling as returned by L.getSpelling().
///
/// NOTE(review): the embedded numbering skips a line between the two output
/// statements and no closing brace is visible, so this view of the function
/// is incomplete -- compare against the full source before relying on it.
10 void Token::dump(const Lexer &L, const SourceManager &SM) const {
11 llvm::errs() << "comments::Token Kind=" << Kind << " ";
13 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
/// Returns true if \p C may appear in the name of an HTML named character
/// reference, i.e. if it is an ASCII letter of either case.
bool isHTMLNamedCharacterReferenceCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z');
}
/// Returns true if \p C may appear in the digits of an HTML decimal character
/// reference, i.e. if it is an ASCII digit '0'..'9'.
bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return C >= '0' && C <= '9';
}
/// Returns true if \p C may appear in the digits of an HTML hexadecimal
/// character reference: '0'..'9', 'a'..'f' or 'A'..'F'.
bool isHTMLHexCharacterReferenceCharacter(char C) {
  return (C >= '0' && C <= '9') ||
         (C >= 'a' && C <= 'f') ||
         (C >= 'A' && C <= 'F');
}
32 #include "clang/AST/CommentHTMLTags.inc"
34 } // unnamed namespace
/// Maps the name of an HTML named character reference to its replacement
/// text using an llvm::StringSwitch keyed on \p Name.
///
/// NOTE(review): the individual cases and the default value are not visible
/// in this view of the file -- consult the full source for the set of
/// supported names and for what is returned for an unknown name.
36 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
37 return llvm::StringSwitch<StringRef>(Name)
46 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
47 unsigned CodePoint = 0;
48 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
49 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
51 CodePoint += Name[i] - '0';
54 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
55 char *ResolvedPtr = Resolved;
56 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
57 return StringRef(Resolved, ResolvedPtr - Resolved);
62 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
63 unsigned CodePoint = 0;
64 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
66 const char C = Name[i];
67 assert(isHTMLHexCharacterReferenceCharacter(C));
68 if (C >= '0' && C <= '9')
69 CodePoint += Name[i] - '0';
70 else if (C >= 'a' && C <= 'f')
71 CodePoint += Name[i] - 'a' + 10;
73 CodePoint += Name[i] - 'A' + 10;
76 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
77 char *ResolvedPtr = Resolved;
78 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
79 return StringRef(Resolved, ResolvedPtr - Resolved);
/// Adjusts BufferPtr at the start of a line inside a C comment so that
/// leading decoration is not lexed as text.  In the code visible here a run
/// of horizontal whitespace is scanned with a scratch pointer and BufferPtr
/// is then moved one past the character following that run.
///
/// NOTE(review): several lines are missing from this view (the early returns,
/// the pointer advance inside the loop and the condition guarding the final
/// assignment) -- confirm the exact behaviour against the full source.
84 void Lexer::skipLineStartingDecorations() {
85 // This function should be called only for C comments
86 assert(CommentState == LCS_InsideCComment);
// Nothing to skip when the comment text is exhausted.
88 if (BufferPtr == CommentEnd)
// Scan with a scratch pointer; BufferPtr is only updated at the very end.
96 const char *NewBufferPtr = BufferPtr;
98 if (NewBufferPtr == CommentEnd)
101 char C = *NewBufferPtr;
// Spaces, tabs, form feeds and vertical tabs count as horizontal whitespace.
102 while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
104 if (NewBufferPtr == CommentEnd)
// Commit: continue lexing after the character the scan stopped on.
109 BufferPtr = NewBufferPtr + 1;
/// Returns pointer to the first newline character ('\n' or '\r') in
/// [BufferPtr, BufferEnd), or BufferEnd if the range contains none.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    if (C == '\n' || C == '\r')
      return BufferPtr;
  }
  return BufferEnd;
}
/// Skips a single newline sequence starting at \p BufferPtr: "\n", "\r" or
/// "\r\n" (the latter is consumed as one unit).
///
/// \p BufferPtr must either equal \p BufferEnd or point at '\n' or '\r'
/// (asserted).
/// \returns pointer just past the newline sequence, or \p BufferPtr unchanged
///          when the range is empty.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    BufferPtr++;
  else {
    assert(*BufferPtr == '\r');
    BufferPtr++;
    // Treat "\r\n" as one newline.
    if (BufferPtr != BufferEnd && *BufferPtr == '\n')
      BufferPtr++;
  }
  return BufferPtr;
}
144 const char *skipNamedCharacterReference(const char *BufferPtr,
145 const char *BufferEnd) {
146 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
147 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
153 const char *skipDecimalCharacterReference(const char *BufferPtr,
154 const char *BufferEnd) {
155 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
156 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
162 const char *skipHexCharacterReference(const char *BufferPtr,
163 const char *BufferEnd) {
164 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
165 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
/// Returns true if \p C may begin an HTML identifier, i.e. if it is an ASCII
/// letter of either case.
bool isHTMLIdentifierStartingCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z');
}
/// Returns true if \p C may appear inside an HTML identifier, i.e. if it is
/// an ASCII letter of either case or an ASCII digit.
bool isHTMLIdentifierCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z') ||
         (C >= '0' && C <= '9');
}
182 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
183 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
184 if (!isHTMLIdentifierCharacter(*BufferPtr))
/// Skip HTML string quoted in single or double quotes.  A quote character
/// that is immediately preceded by a backslash does not end the string.
///
/// \p BufferPtr must point at the opening quote (asserted), so the range must
/// not be empty.
///
/// Returns pointer to closing quote, or \p BufferEnd if the string is not
/// terminated within the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  // Step over the opening quote so that it is not mistaken for the closing
  // one; this also makes the BufferPtr[-1] access below always in range.
  BufferPtr++;
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    if (C == Quote && BufferPtr[-1] != '\\')
      return BufferPtr;
  }
  return BufferEnd;
}
/// Returns true if \p C is whitespace that does not end a line: space, tab,
/// form feed or vertical tab.
bool isHorizontalWhitespace(char C) {
  return C == ' ' || C == '\t' || C == '\f' || C == '\v';
}
/// Returns true if \p C is a whitespace character: space, '\n', '\r', tab,
/// form feed or vertical tab.
bool isWhitespace(char C) {
  return C == ' ' || C == '\n' || C == '\r' ||
         C == '\t' || C == '\f' || C == '\v';
}
217 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
218 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
219 if (!isWhitespace(*BufferPtr))
225 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
226 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
/// Returns true if \p C may begin a command name, i.e. if it is an ASCII
/// letter of either case.
bool isCommandNameStartCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z');
}
/// Returns true if \p C may appear inside a command name, i.e. if it is an
/// ASCII letter of either case or an ASCII digit.
bool isCommandNameCharacter(char C) {
  return (C >= 'a' && C <= 'z') ||
         (C >= 'A' && C <= 'Z') ||
         (C >= '0' && C <= '9');
}
240 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
241 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
242 if (!isCommandNameCharacter(*BufferPtr))
248 /// Return the one past end pointer for BCPL comments.
249 /// Handles newlines escaped with backslash or trigraph for backslash.
///
/// NOTE(review): some lines of this function are missing from this view (the
/// initial read of C, the pointer updates inside both inner loops, and the
/// returns taken when BufferEnd is reached) -- confirm against the full
/// source before changing the logic.
250 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
251 const char *CurPtr = BufferPtr;
252 while (CurPtr != BufferEnd) {
// Scan forward to the next newline character, stopping at BufferEnd.
254 while (C != '\n' && C != '\r') {
256 if (CurPtr == BufferEnd)
260 // We found a newline, check if it is escaped.
261 const char *EscapePtr = CurPtr - 1;
// Horizontal whitespace may sit between the escape and the newline.
262 while(isHorizontalWhitespace(*EscapePtr))
// The escape is either a backslash or its three-character trigraph; the
// "EscapePtr - 2 >= BufferPtr" test keeps the trigraph look-behind inside
// the comment.
265 if (*EscapePtr == '\\' ||
266 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
267 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
268 // We found an escaped newline.
269 CurPtr = skipNewline(CurPtr, BufferEnd);
271 return CurPtr; // Not an escaped newline.
276 /// Return the one past end pointer for C comments.
277 /// Very dumb, does not handle escaped newlines or trigraphs.
278 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
279 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
280 if (*BufferPtr == '*') {
281 assert(BufferPtr + 1 != BufferEnd);
282 if (*(BufferPtr + 1) == '/')
286 llvm_unreachable("buffer end hit before '*/' was seen");
288 } // unnamed namespace
/// Lexes the next token from the text of the current comment.
///
/// Must only be called while inside a BCPL or C comment (asserted).  The
/// current lexer State first selects one of the specialised sub-lexers
/// (verbatim block, verbatim line, HTML tag); in LS_Normal the function
/// inspects the character at BufferPtr and forms a command, escape-sequence,
/// HTML, newline or plain text token into \p T.
///
/// NOTE(review): many lines of this function are missing from this view (the
/// switch headers, returns, pointer increments and several case labels), so
/// the comments below describe only what the visible lines establish.
290 void Lexer::lexCommentText(Token &T) {
291 assert(CommentState == LCS_InsideBCPLComment ||
292 CommentState == LCS_InsideCComment);
// Dispatch on the lexer state: the non-normal states have dedicated lexers.
297 case LS_VerbatimBlockFirstLine:
298 lexVerbatimBlockFirstLine(T);
300 case LS_VerbatimBlockBody:
301 lexVerbatimBlockBody(T);
303 case LS_VerbatimLineText:
304 lexVerbatimLineText(T);
306 case LS_HTMLStartTag:
314 assert(State == LS_Normal);
// TokenPtr scans ahead of BufferPtr; the token spans [BufferPtr, TokenPtr).
316 const char *TokenPtr = BufferPtr;
317 assert(TokenPtr < CommentEnd);
318 while (TokenPtr != CommentEnd) {
// Comment ended right after the command marker: emit what we have as text.
323 if (TokenPtr == CommentEnd) {
324 formTextToken(T, TokenPtr);
332 case '\\': case '@': case '&': case '$':
333 case '#': case '<': case '>': case '%':
334 case '\"': case '.': case ':':
335 // This is one of \\ \@ \& \$ etc escape sequences.
337 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
338 // This is the \:: escape sequence.
// The token text is the escaped character(s) without the leading marker.
341 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
342 formTokenWithChars(T, TokenPtr, tok::text);
343 T.setText(UnescapedText);
347 // Don't make zero-length commands.
348 if (!isCommandNameStartCharacter(*TokenPtr)) {
349 formTextToken(T, TokenPtr);
// Length counts the name only; BufferPtr still points at the marker.
353 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
354 unsigned Length = TokenPtr - (BufferPtr + 1);
356 // Hardcoded support for lexing LaTeX formula commands
357 // \f$ \f[ \f] \f{ \f} as a single command.
358 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
360 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
366 const StringRef CommandName(BufferPtr + 1, Length);
// Look the name up; the unknown_command token below is for names that are
// not registered (the guarding condition is not visible in this view).
368 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
370 formTokenWithChars(T, TokenPtr, tok::unknown_command);
371 T.setUnknownCommandName(CommandName);
// Verbatim commands switch the lexer into a verbatim state; *BufferPtr is
// the marker character that introduced the command.
374 if (Info->IsVerbatimBlockCommand) {
375 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
378 if (Info->IsVerbatimLineCommand) {
379 setupAndLexVerbatimLine(T, TokenPtr, Info);
// Any other known command becomes a plain command token.
382 formTokenWithChars(T, TokenPtr, tok::command);
383 T.setCommandID(Info->getID());
388 lexHTMLCharacterReference(T);
393 if (TokenPtr == CommentEnd) {
394 formTextToken(T, TokenPtr);
// Decide between an HTML start tag, an end tag, and plain text.
397 const char C = *TokenPtr;
398 if (isHTMLIdentifierStartingCharacter(C))
399 setupAndLexHTMLStartTag(T);
401 setupAndLexHTMLEndTag(T);
403 formTextToken(T, TokenPtr);
// A newline is its own token; inside C comments the decoration at the start
// of the following line is skipped right away.
410 TokenPtr = skipNewline(TokenPtr, CommentEnd);
411 formTokenWithChars(T, TokenPtr, tok::newline);
413 if (CommentState == LCS_InsideCComment)
414 skipLineStartingDecorations();
// Plain text: the characters tested below are the ones that end a text run.
420 if (TokenPtr == CommentEnd)
422 const char C = *TokenPtr;
423 if(C == '\n' || C == '\r' ||
424 C == '\\' || C == '@' || C == '&' || C == '<')
427 formTextToken(T, TokenPtr);
/// Forms the tok::verbatim_block_begin token for a verbatim block command
/// and switches the lexer into a verbatim-block state.
///
/// \param T receives the begin token.
/// \param TextBegin passed to formTokenWithChars() as the end of the token.
/// \param Marker the character that introduced the command; the end command
///        is expected to use the same one ('\\', otherwise "@" is used).
/// \param Info describes the command; must be a verbatim block command
///        (asserted).
///
/// NOTE(review): a few lines are missing from this view, including the end of
/// the comment that explains the newline handling.
434 void Lexer::setupAndLexVerbatimBlock(Token &T,
435 const char *TextBegin,
436 char Marker, const CommandInfo *Info) {
437 assert(Info->IsVerbatimBlockCommand);
// Remember the full spelling of the end command: marker followed by name.
439 VerbatimBlockEndCommandName.clear();
440 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
441 VerbatimBlockEndCommandName.append(Info->EndCommandName);
443 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
444 T.setVerbatimBlockID(Info->getID());
446 // If there is a newline following the verbatim opening command, skip the
447 // newline so that we don't create a tok::verbatim_block_line with empty
449 if (BufferPtr != CommentEnd) {
450 const char C = *BufferPtr;
451 if (C == '\n' || C == '\r') {
452 BufferPtr = skipNewline(BufferPtr, CommentEnd);
453 State = LS_VerbatimBlockBody;
// Otherwise the remainder of the current line is lexed as the first line.
458 State = LS_VerbatimBlockFirstLine;
/// Lexes one line of a verbatim block starting at BufferPtr.
///
/// The current line is searched for VerbatimBlockEndCommandName.  Depending
/// on the result the function forms a tok::verbatim_block_line token with
/// the text that precedes the end command (or the whole line), or a
/// tok::verbatim_block_end token when the line starts with the end command.
///
/// NOTE(review): several lines are missing from this view (declarations of
/// TextEnd, the state change and return after the end token, and the body of
/// the whitespace-only case) -- confirm against the full source.
461 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
463 assert(BufferPtr < CommentEnd);
465 // FIXME: It would be better to scan the text once, finding either the block
466 // end command or newline.
468 // Extract current line.
469 const char *Newline = findNewline(BufferPtr, CommentEnd);
470 StringRef Line(BufferPtr, Newline - BufferPtr);
472 // Look for end command in current line.
473 size_t Pos = Line.find(VerbatimBlockEndCommandName);
475 const char *NextLine;
476 if (Pos == StringRef::npos) {
477 // Current line is completely verbatim.
479 NextLine = skipNewline(Newline, CommentEnd);
480 } else if (Pos == 0) {
481 // Current line starts with the end command.
// Name drops the leading marker character of the end command.
482 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
483 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
484 formTokenWithChars(T, End, tok::verbatim_block_end);
485 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
489 // There is some text, followed by end command. Extract text first.
490 TextEnd = BufferPtr + Pos;
492 // If there is only whitespace before end command, skip whitespace.
493 if (isWhitespace(BufferPtr, TextEnd)) {
// Emit the verbatim text; the token itself extends up to NextLine.
499 StringRef Text(BufferPtr, TextEnd - BufferPtr);
500 formTokenWithChars(T, NextLine, tok::verbatim_block_line);
501 T.setVerbatimBlockText(Text);
503 State = LS_VerbatimBlockBody;
506 void Lexer::lexVerbatimBlockBody(Token &T) {
507 assert(State == LS_VerbatimBlockBody);
509 if (CommentState == LCS_InsideCComment)
510 skipLineStartingDecorations();
512 lexVerbatimBlockFirstLine(T);
515 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
516 const CommandInfo *Info) {
517 assert(Info->IsVerbatimLineCommand);
518 formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
519 T.setVerbatimLineID(Info->getID());
521 State = LS_VerbatimLineText;
/// Lexes the text that follows a verbatim line command: everything from
/// BufferPtr up to (not including) the next newline, or up to CommentEnd,
/// becomes a single tok::verbatim_line_text token.
///
/// Only valid in the LS_VerbatimLineText state (asserted).
///
/// NOTE(review): the end of this function is not visible in this view; the
/// lexer state presumably has to be reset afterwards -- confirm against the
/// full source.
524 void Lexer::lexVerbatimLineText(Token &T) {
525 assert(State == LS_VerbatimLineText);
527 // Extract current line.
528 const char *Newline = findNewline(BufferPtr, CommentEnd);
529 const StringRef Text(BufferPtr, Newline - BufferPtr);
530 formTokenWithChars(T, Newline, tok::verbatim_line_text);
531 T.setVerbatimLineText(Text);
/// Lexes an HTML character reference starting at the '&' that BufferPtr
/// points to (asserted).
///
/// Three forms are recognised: named, decimal ('#' followed by digits) and
/// hexadecimal ('#' followed by 'x' or 'X' and hex digits).  If the reference
/// is malformed, or resolves to an empty string, the characters consumed so
/// far are emitted as an ordinary text token; otherwise a tok::text token is
/// formed for the whole reference.
///
/// NOTE(review): many lines are missing from this view (pointer increments,
/// the reads of C, returns, the declarations of NamePtr and Resolved, part of
/// the validity check and the final use of Resolved) -- confirm details
/// against the full source.
536 void Lexer::lexHTMLCharacterReference(Token &T) {
537 const char *TokenPtr = BufferPtr;
538 assert(*TokenPtr == '&');
// A lone '&' at the very end of the comment is plain text.
540 if (TokenPtr == CommentEnd) {
541 formTextToken(T, TokenPtr);
545 bool isNamed = false;
546 bool isDecimal = false;
548 if (isHTMLNamedCharacterReferenceCharacter(C)) {
550 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
552 } else if (C == '#') {
// Numeric reference: needs at least one more character after the '#'.
554 if (TokenPtr == CommentEnd) {
555 formTextToken(T, TokenPtr);
559 if (isHTMLDecimalCharacterReferenceCharacter(C)) {
561 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
563 } else if (C == 'x' || C == 'X') {
566 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
568 formTextToken(T, TokenPtr);
572 formTextToken(T, TokenPtr);
// Reject an empty name and a reference cut off by the end of the comment.
575 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
577 formTextToken(T, TokenPtr);
580 StringRef Name(NamePtr, TokenPtr - NamePtr);
581 TokenPtr++; // Skip semicolon.
// Resolve with the function matching the form detected above.
584 Resolved = resolveHTMLNamedCharacterReference(Name);
586 Resolved = resolveHTMLDecimalCharacterReference(Name);
588 Resolved = resolveHTMLHexCharacterReference(Name);
// An empty result means the reference could not be resolved: keep it as text.
590 if (Resolved.empty()) {
591 formTextToken(T, TokenPtr);
594 formTokenWithChars(T, TokenPtr, tok::text);
599 void Lexer::setupAndLexHTMLStartTag(Token &T) {
600 assert(BufferPtr[0] == '<' &&
601 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
602 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
603 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
604 if (!isHTMLTagName(Name)) {
605 formTextToken(T, TagNameEnd);
609 formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
610 T.setHTMLTagStartName(Name);
612 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
614 const char C = *BufferPtr;
615 if (BufferPtr != CommentEnd &&
616 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
617 State = LS_HTMLStartTag;
/// Lexes one token inside an HTML start tag: an identifier, '=', a quoted
/// string, '>' or "/>".  Afterwards it looks ahead past whitespace to decide
/// whether the lexer stays in this state.
///
/// Only valid in the LS_HTMLStartTag state (asserted).
///
/// NOTE(review): many lines are missing from this view (the switch on the
/// current character, pointer increments, the state changes and returns), so
/// the comments below cover only what the visible lines establish.
620 void Lexer::lexHTMLStartTag(Token &T) {
621 assert(State == LS_HTMLStartTag);
623 const char *TokenPtr = BufferPtr;
// Identifier, e.g. an attribute name.
625 if (isHTMLIdentifierCharacter(C)) {
626 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
627 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
628 formTokenWithChars(T, TokenPtr, tok::html_ident);
629 T.setHTMLIdent(Ident);
634 formTokenWithChars(T, TokenPtr, tok::html_equals);
// Quoted string: the token includes the quotes, the stored text does not.
638 const char *OpenQuote = TokenPtr;
639 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
640 const char *ClosingQuote = TokenPtr;
641 if (TokenPtr != CommentEnd) // Skip closing quote.
643 formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
644 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
645 ClosingQuote - (OpenQuote + 1)));
650 formTokenWithChars(T, TokenPtr, tok::html_greater);
// A '/' only forms a token together with a following '>'; otherwise the
// characters are emitted as text.
655 if (TokenPtr != CommentEnd && *TokenPtr == '>') {
657 formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
659 formTextToken(T, TokenPtr);
666 // Now look ahead and return to normal state if we don't see any HTML tokens
668 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
669 if (BufferPtr == CommentEnd) {
// These are the characters that can still belong to the start tag.
675 if (!isHTMLIdentifierStartingCharacter(C) &&
676 C != '=' && C != '\"' && C != '\'' && C != '>') {
682 void Lexer::setupAndLexHTMLEndTag(Token &T) {
683 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
685 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
686 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
687 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
688 if (!isHTMLTagName(Name)) {
689 formTextToken(T, TagNameEnd);
693 const char *End = skipWhitespace(TagNameEnd, CommentEnd);
695 formTokenWithChars(T, End, tok::html_end_tag);
696 T.setHTMLTagEndName(Name);
698 if (BufferPtr != CommentEnd && *BufferPtr == '>')
699 State = LS_HTMLEndTag;
/// Lexes the '>' that closes an HTML end tag as a tok::html_greater token.
/// BufferPtr must point at that '>' (asserted).
///
/// NOTE(review): the end of this function is not visible in this view; the
/// lexer state presumably has to be reset afterwards -- confirm against the
/// full source.
702 void Lexer::lexHTMLEndTag(Token &T) {
703 assert(BufferPtr != CommentEnd && *BufferPtr == '>');
705 formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
709 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
710 SourceLocation FileLoc,
711 const char *BufferStart, const char *BufferEnd):
712 Allocator(Allocator), Traits(Traits),
713 BufferStart(BufferStart), BufferEnd(BufferEnd),
714 FileLoc(FileLoc), BufferPtr(BufferStart),
715 CommentState(LCS_BeforeComment), State(LS_Normal) {
/// Lexes the next token into \p T, driving the comment-level state machine.
///
/// - LCS_BeforeComment: at the end of the buffer a tok::eof token is formed;
///   otherwise the comment opener and optional Doxygen markers are skipped
///   and CommentEnd is computed for the BCPL or C comment that starts here.
/// - LCS_BetweenComments: everything up to the next '/' is turned into a
///   tok::newline token.
/// - LCS_InsideBCPLComment / LCS_InsideCComment: while BufferPtr has not
///   reached CommentEnd the comment text is lexed; at the end of a C comment
///   the closing sequence is skipped and a newline token is synthesized.
///
/// NOTE(review): many lines are missing from this view (the enclosing loop,
/// inner switch header, pointer increments, returns and breaks), so the
/// comments below describe only what the visible lines establish.
718 void Lexer::lex(Token &T) {
720 switch (CommentState) {
721 case LCS_BeforeComment:
722 if (BufferPtr == BufferEnd) {
723 formTokenWithChars(T, BufferPtr, tok::eof);
727 assert(*BufferPtr == '/');
728 BufferPtr++; // Skip first slash.
730 case '/': { // BCPL comment.
731 BufferPtr++; // Skip second slash.
733 if (BufferPtr != BufferEnd) {
734 // Skip Doxygen magic marker, if it is present.
735 // It might be missing because of a typo //< or /*<, or because we
736 // merged this non-Doxygen comment into a bunch of Doxygen comments
737 // around it: /** ... */ /* ... */ /** ... */
738 const char C = *BufferPtr;
739 if (C == '/' || C == '!')
743 // Skip less-than symbol that marks trailing comments.
744 // Skip it even if the comment is not a Doxygen one, because //< and /*<
745 // are frequent typos.
746 if (BufferPtr != BufferEnd && *BufferPtr == '<')
749 CommentState = LCS_InsideBCPLComment;
// A verbatim block that is still open is not affected by the comment change.
750 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
752 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
755 case '*': { // C comment.
756 BufferPtr++; // Skip star.
758 // Skip Doxygen magic marker.
// A second '*' directly followed by '/' is the end of an empty comment, not
// a marker.
759 const char C = *BufferPtr;
760 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
763 // Skip less-than symbol that marks trailing comments.
764 if (BufferPtr != BufferEnd && *BufferPtr == '<')
767 CommentState = LCS_InsideCComment;
769 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
773 llvm_unreachable("second character of comment should be '/' or '*'");
776 case LCS_BetweenComments: {
777 // Consecutive comments are extracted only if there is only whitespace
778 // between them. So we can search for the start of the next comment.
779 const char *EndWhitespace = BufferPtr;
780 while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
783 // Turn any whitespace between comments (and there is only whitespace
784 // between them -- guaranteed by comment extraction) into a newline. We
785 // have two newlines between C comments in total (first one was synthesized
787 formTokenWithChars(T, EndWhitespace, tok::newline);
789 CommentState = LCS_BeforeComment;
793 case LCS_InsideBCPLComment:
794 case LCS_InsideCComment:
795 if (BufferPtr != CommentEnd) {
799 // Skip C comment closing sequence.
800 if (CommentState == LCS_InsideCComment) {
801 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
803 assert(BufferPtr <= BufferEnd);
805 // Synthesize a newline just after the C comment, regardless of whether
806 // there is actually a newline.
807 formTokenWithChars(T, BufferPtr, tok::newline);
809 CommentState = LCS_BetweenComments;
812 // Don't synthesize a newline after BCPL comment.
813 CommentState = LCS_BetweenComments;
/// Returns the spelling of \p Tok as it appears in the source buffer: the
/// token's location is decomposed into a file and an offset, and
/// Tok.getLength() characters starting at that offset are returned.
///
/// \param Tok the token to spell.
/// \param SourceMgr used to decompose the location and fetch the buffer.
/// \param Invalid NOTE(review): presumably an optional out-flag that reports
///        an invalid buffer -- its handling is not visible in this view of
///        the function; confirm against the full source.
820 StringRef Lexer::getSpelling(const Token &Tok,
821 const SourceManager &SourceMgr,
822 bool *Invalid) const {
823 SourceLocation Loc = Tok.getLocation();
824 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
// InvalidTemp receives the status reported by getBufferData().
826 bool InvalidTemp = false;
827 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
// LocInfo.second is the offset of the token within the file buffer.
833 const char *Begin = File.data() + LocInfo.second;
834 return StringRef(Begin, Tok.getLength());
837 } // end namespace comments
838 } // end namespace clang