1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
// This file defines the lexer for structured comments and the supporting
// token class.
12 //===----------------------------------------------------------------------===//
14 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
15 #define LLVM_CLANG_AST_COMMENTLEXER_H
17 #include "clang/Basic/Diagnostic.h"
18 #include "clang/Basic/SourceManager.h"
19 #include "llvm/ADT/SmallString.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/Support/Allocator.h"
22 #include "llvm/Support/raw_ostream.h"
28 class TextTokenRetokenizer;
37 unknown_command, // Command that does not have an ID.
38 backslash_command, // Command with an ID, that used backslash marker.
39 at_command, // Command with an ID, that used 'at' marker.
45 html_start_tag, // <tag
48 html_quoted_string, // "blah\"blah" or 'blah\'blah'
50 html_slash_greater, // />
53 } // end namespace tok
58 friend class TextTokenRetokenizer;
60 /// The location of the token.
63 /// The actual kind of the token.
/// Length of the token spelling in comment. Can be 0 for synthesized
/// tokens.
70 /// Contains text value associated with a token.
73 /// Integer value associated with a token.
75 /// If the token is a known command, contains command ID and TextPtr is
76 /// unused (command spelling can be found with CommandTraits). Otherwise,
77 /// contains the length of the string that starts at TextPtr.
81 SourceLocation getLocation() const LLVM_READONLY { return Loc; }
82 void setLocation(SourceLocation SL) { Loc = SL; }
84 SourceLocation getEndLocation() const LLVM_READONLY {
85 if (Length == 0 || Length == 1)
87 return Loc.getLocWithOffset(Length - 1);
90 tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
91 void setKind(tok::TokenKind K) { Kind = K; }
93 bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
94 bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
96 unsigned getLength() const LLVM_READONLY { return Length; }
97 void setLength(unsigned L) { Length = L; }
99 StringRef getText() const LLVM_READONLY {
100 assert(is(tok::text));
101 return StringRef(TextPtr, IntVal);
104 void setText(StringRef Text) {
105 assert(is(tok::text));
106 TextPtr = Text.data();
107 IntVal = Text.size();
110 StringRef getUnknownCommandName() const LLVM_READONLY {
111 assert(is(tok::unknown_command));
112 return StringRef(TextPtr, IntVal);
115 void setUnknownCommandName(StringRef Name) {
116 assert(is(tok::unknown_command));
117 TextPtr = Name.data();
118 IntVal = Name.size();
121 unsigned getCommandID() const LLVM_READONLY {
122 assert(is(tok::backslash_command) || is(tok::at_command));
126 void setCommandID(unsigned ID) {
127 assert(is(tok::backslash_command) || is(tok::at_command));
131 unsigned getVerbatimBlockID() const LLVM_READONLY {
132 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
136 void setVerbatimBlockID(unsigned ID) {
137 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
141 StringRef getVerbatimBlockText() const LLVM_READONLY {
142 assert(is(tok::verbatim_block_line));
143 return StringRef(TextPtr, IntVal);
146 void setVerbatimBlockText(StringRef Text) {
147 assert(is(tok::verbatim_block_line));
148 TextPtr = Text.data();
149 IntVal = Text.size();
152 unsigned getVerbatimLineID() const LLVM_READONLY {
153 assert(is(tok::verbatim_line_name));
157 void setVerbatimLineID(unsigned ID) {
158 assert(is(tok::verbatim_line_name));
162 StringRef getVerbatimLineText() const LLVM_READONLY {
163 assert(is(tok::verbatim_line_text));
164 return StringRef(TextPtr, IntVal);
167 void setVerbatimLineText(StringRef Text) {
168 assert(is(tok::verbatim_line_text));
169 TextPtr = Text.data();
170 IntVal = Text.size();
173 StringRef getHTMLTagStartName() const LLVM_READONLY {
174 assert(is(tok::html_start_tag));
175 return StringRef(TextPtr, IntVal);
178 void setHTMLTagStartName(StringRef Name) {
179 assert(is(tok::html_start_tag));
180 TextPtr = Name.data();
181 IntVal = Name.size();
184 StringRef getHTMLIdent() const LLVM_READONLY {
185 assert(is(tok::html_ident));
186 return StringRef(TextPtr, IntVal);
189 void setHTMLIdent(StringRef Name) {
190 assert(is(tok::html_ident));
191 TextPtr = Name.data();
192 IntVal = Name.size();
195 StringRef getHTMLQuotedString() const LLVM_READONLY {
196 assert(is(tok::html_quoted_string));
197 return StringRef(TextPtr, IntVal);
200 void setHTMLQuotedString(StringRef Str) {
201 assert(is(tok::html_quoted_string));
202 TextPtr = Str.data();
206 StringRef getHTMLTagEndName() const LLVM_READONLY {
207 assert(is(tok::html_end_tag));
208 return StringRef(TextPtr, IntVal);
211 void setHTMLTagEndName(StringRef Name) {
212 assert(is(tok::html_end_tag));
213 TextPtr = Name.data();
214 IntVal = Name.size();
217 void dump(const Lexer &L, const SourceManager &SM) const;
223 Lexer(const Lexer &) = delete;
224 void operator=(const Lexer &) = delete;
226 /// Allocator for strings that are semantic values of tokens and have to be
227 /// computed (for example, resolved decimal character references).
228 llvm::BumpPtrAllocator &Allocator;
230 DiagnosticsEngine &Diags;
232 const CommandTraits &Traits;
234 const char *const BufferStart;
235 const char *const BufferEnd;
236 SourceLocation FileLoc;
238 const char *BufferPtr;
240 /// One past end pointer for the current comment. For BCPL comments points
241 /// to newline or BufferEnd, for C comments points to star in '*/'.
242 const char *CommentEnd;
244 enum LexerCommentState {
246 LCS_InsideBCPLComment,
251 /// Low-level lexer state, track if we are inside or outside of comment.
252 LexerCommentState CommentState;
255 /// Lexing normal comment text
/// Finished lexing verbatim block beginning command, will lex first body
/// line.
260 LS_VerbatimBlockFirstLine,
/// Lexing verbatim block body line-by-line, skipping line-starting
/// decorations.
264 LS_VerbatimBlockBody,
/// Finished lexing verbatim line beginning command, will lex text (one
/// line).
270 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
273 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
277 /// Current lexing mode.
/// If State is LS_VerbatimBlockFirstLine or LS_VerbatimBlockBody, contains
/// the name of the verbatim end command, including the command marker.
282 SmallString<16> VerbatimBlockEndCommandName;
284 /// If true, the commands, html tags, etc will be parsed and reported as
285 /// separate tokens inside the comment body. If false, the comment text will
286 /// be parsed into text and newline tokens.
289 /// Given a character reference name (e.g., "lt"), return the character that
290 /// it stands for (e.g., "<").
291 StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
293 /// Given a Unicode codepoint as base-10 integer, return the character.
294 StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
296 /// Given a Unicode codepoint as base-16 integer, return the character.
297 StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
299 void formTokenWithChars(Token &Result, const char *TokEnd,
300 tok::TokenKind Kind);
302 void formTextToken(Token &Result, const char *TokEnd) {
303 StringRef Text(BufferPtr, TokEnd - BufferPtr);
304 formTokenWithChars(Result, TokEnd, tok::text);
305 Result.setText(Text);
308 SourceLocation getSourceLocation(const char *Loc) const {
309 assert(Loc >= BufferStart && Loc <= BufferEnd &&
310 "Location out of range for this buffer!");
312 const unsigned CharNo = Loc - BufferStart;
313 return FileLoc.getLocWithOffset(CharNo);
316 DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
317 return Diags.Report(Loc, DiagID);
/// Consumes a line-starting decoration, i.e. a string matching the regexp
/// \code \s*\* \endcode.
void skipLineStartingDecorations();
323 /// Lex comment text, including commands if ParseCommands is set to true.
324 void lexCommentText(Token &T);
326 void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
327 const CommandInfo *Info);
329 void lexVerbatimBlockFirstLine(Token &T);
331 void lexVerbatimBlockBody(Token &T);
333 void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
334 const CommandInfo *Info);
336 void lexVerbatimLineText(Token &T);
338 void lexHTMLCharacterReference(Token &T);
340 void setupAndLexHTMLStartTag(Token &T);
342 void lexHTMLStartTag(Token &T);
344 void setupAndLexHTMLEndTag(Token &T);
346 void lexHTMLEndTag(Token &T);
349 Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
350 const CommandTraits &Traits, SourceLocation FileLoc,
351 const char *BufferStart, const char *BufferEnd,
352 bool ParseCommands = true);
356 StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr,
357 bool *Invalid = nullptr) const;
360 } // end namespace comments
361 } // end namespace clang