1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file defines the lexer for structured comments and the supporting token class.
12 //===----------------------------------------------------------------------===//
14 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
15 #define LLVM_CLANG_AST_COMMENTLEXER_H
17 #include "clang/Basic/Diagnostic.h"
18 #include "clang/Basic/SourceManager.h"
19 #include "llvm/ADT/SmallString.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/StringRef.h"
22 #include "llvm/Support/Allocator.h"
23 #include "llvm/Support/raw_ostream.h"
29 class TextTokenRetokenizer;
38 unknown_command, // Command that does not have an ID.
39 backslash_command, // Command with an ID, that used backslash marker.
40 at_command, // Command with an ID, that used 'at' marker.
46 html_start_tag, // <tag
49 html_quoted_string, // "blah\"blah" or 'blah\'blah'
51 html_slash_greater, // />
54 } // end namespace tok
56 /// \brief Comment token.
59 friend class TextTokenRetokenizer;
61 /// The location of the token.
64 /// The actual kind of the token.
67 /// Length of the token spelling in comment. Can be 0 for synthesized
71 /// Contains text value associated with a token.
74 /// Integer value associated with a token.
76 /// If the token is a known command, contains command ID and TextPtr is
77 /// unused (command spelling can be found with CommandTraits). Otherwise,
78 /// contains the length of the string that starts at TextPtr.
/// Returns the stored location of this token (the Loc member).
82 SourceLocation getLocation() const LLVM_READONLY { return Loc; }
/// Overwrites the stored location of this token with \p SL.
83 void setLocation(SourceLocation SL) { Loc = SL; }
/// Computes the end location of the token from Loc and Length.
///
/// When Length is greater than one, the result is Loc advanced by
/// Length - 1 characters.
///
/// NOTE(review): the statement controlled by the
/// `Length == 0 || Length == 1` check is not visible in this excerpt;
/// presumably it returns Loc unchanged for empty and single-character
/// tokens -- confirm against the full file.
85 SourceLocation getEndLocation() const LLVM_READONLY {
86 if (Length == 0 || Length == 1)
88 return Loc.getLocWithOffset(Length - 1);
/// Returns the kind of this token (the Kind member).
91 tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
/// Sets the kind of this token to \p K.
///
/// The typed accessors of this class assert on the current kind, so the kind
/// has to be set before the matching typed setter is called.
92 void setKind(tok::TokenKind K) { Kind = K; }
/// Returns true if this token's kind equals \p K.
94 bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
/// Returns true if this token's kind differs from \p K (negation of is()).
95 bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
/// Returns the stored length of the token spelling (the Length member).
97 unsigned getLength() const LLVM_READONLY { return Length; }
/// Sets the stored length of the token spelling to \p L.
98 void setLength(unsigned L) { Length = L; }
/// Returns the text of a tok::text token.
///
/// Asserts that the token kind is tok::text. The result is a StringRef that
/// starts at TextPtr and is IntVal characters long; it does not copy the
/// characters.
100 StringRef getText() const LLVM_READONLY {
101 assert(is(tok::text));
102 return StringRef(TextPtr, IntVal);
/// Stores \p Text as the value of a tok::text token.
///
/// Asserts that the token kind is tok::text. Only the data pointer and the
/// size of \p Text are recorded (in TextPtr and IntVal); the characters are
/// not copied.
105 void setText(StringRef Text) {
106 assert(is(tok::text));
107 TextPtr = Text.data();
108 IntVal = Text.size();
/// Returns the name of a command that has no ID.
///
/// Asserts that the token kind is tok::unknown_command. The result is a
/// StringRef of IntVal characters starting at TextPtr.
111 StringRef getUnknownCommandName() const LLVM_READONLY {
112 assert(is(tok::unknown_command));
113 return StringRef(TextPtr, IntVal);
/// Stores \p Name as the name of a tok::unknown_command token.
///
/// Asserts that the token kind is tok::unknown_command. Records the data
/// pointer in TextPtr and the size in IntVal without copying the characters.
116 void setUnknownCommandName(StringRef Name) {
117 assert(is(tok::unknown_command));
118 TextPtr = Name.data();
119 IntVal = Name.size();
/// Accessor for the command ID of a command token.
///
/// Asserts that the token kind is tok::backslash_command or tok::at_command.
///
/// NOTE(review): the return statement is not visible in this excerpt;
/// presumably the ID is read from IntVal -- confirm against the full file.
122 unsigned getCommandID() const LLVM_READONLY {
123 assert(is(tok::backslash_command) || is(tok::at_command));
/// Mutator for the command ID of a command token.
///
/// Asserts that the token kind is tok::backslash_command or tok::at_command.
///
/// NOTE(review): the assignment is not visible in this excerpt; presumably
/// \p ID is stored in IntVal -- confirm against the full file.
127 void setCommandID(unsigned ID) {
128 assert(is(tok::backslash_command) || is(tok::at_command));
/// Accessor for the ID carried by a verbatim block begin/end token.
///
/// Asserts that the token kind is tok::verbatim_block_begin or
/// tok::verbatim_block_end.
///
/// NOTE(review): the return statement is not visible in this excerpt;
/// presumably the ID is read from IntVal -- confirm against the full file.
132 unsigned getVerbatimBlockID() const LLVM_READONLY {
133 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
/// Mutator for the ID carried by a verbatim block begin/end token.
///
/// Asserts that the token kind is tok::verbatim_block_begin or
/// tok::verbatim_block_end.
///
/// NOTE(review): the assignment is not visible in this excerpt; presumably
/// \p ID is stored in IntVal -- confirm against the full file.
137 void setVerbatimBlockID(unsigned ID) {
138 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
/// Returns the text of one line of a verbatim block.
///
/// Asserts that the token kind is tok::verbatim_block_line. The result is a
/// StringRef of IntVal characters starting at TextPtr.
142 StringRef getVerbatimBlockText() const LLVM_READONLY {
143 assert(is(tok::verbatim_block_line));
144 return StringRef(TextPtr, IntVal);
/// Stores \p Text as the value of a tok::verbatim_block_line token.
///
/// Asserts that the token kind is tok::verbatim_block_line. Records the data
/// pointer in TextPtr and the size in IntVal without copying the characters.
147 void setVerbatimBlockText(StringRef Text) {
148 assert(is(tok::verbatim_block_line));
149 TextPtr = Text.data();
150 IntVal = Text.size();
/// Accessor for the ID carried by a verbatim line name token.
///
/// Asserts that the token kind is tok::verbatim_line_name.
///
/// NOTE(review): the return statement is not visible in this excerpt;
/// presumably the ID is read from IntVal -- confirm against the full file.
153 unsigned getVerbatimLineID() const LLVM_READONLY {
154 assert(is(tok::verbatim_line_name));
/// Mutator for the ID carried by a verbatim line name token.
///
/// Asserts that the token kind is tok::verbatim_line_name.
///
/// NOTE(review): the assignment is not visible in this excerpt; presumably
/// \p ID is stored in IntVal -- confirm against the full file.
158 void setVerbatimLineID(unsigned ID) {
159 assert(is(tok::verbatim_line_name));
/// Returns the text that follows a verbatim line command.
///
/// Asserts that the token kind is tok::verbatim_line_text. The result is a
/// StringRef of IntVal characters starting at TextPtr.
163 StringRef getVerbatimLineText() const LLVM_READONLY {
164 assert(is(tok::verbatim_line_text));
165 return StringRef(TextPtr, IntVal);
/// Stores \p Text as the value of a tok::verbatim_line_text token.
///
/// Asserts that the token kind is tok::verbatim_line_text. Records the data
/// pointer in TextPtr and the size in IntVal without copying the characters.
168 void setVerbatimLineText(StringRef Text) {
169 assert(is(tok::verbatim_line_text));
170 TextPtr = Text.data();
171 IntVal = Text.size();
/// Returns the tag name of an HTML start tag token.
///
/// Asserts that the token kind is tok::html_start_tag. The result is a
/// StringRef of IntVal characters starting at TextPtr.
174 StringRef getHTMLTagStartName() const LLVM_READONLY {
175 assert(is(tok::html_start_tag));
176 return StringRef(TextPtr, IntVal);
/// Stores \p Name as the tag name of a tok::html_start_tag token.
///
/// Asserts that the token kind is tok::html_start_tag. Records the data
/// pointer in TextPtr and the size in IntVal without copying the characters.
179 void setHTMLTagStartName(StringRef Name) {
180 assert(is(tok::html_start_tag));
181 TextPtr = Name.data();
182 IntVal = Name.size();
/// Returns the identifier carried by a tok::html_ident token.
///
/// Asserts that the token kind is tok::html_ident. The result is a StringRef
/// of IntVal characters starting at TextPtr.
185 StringRef getHTMLIdent() const LLVM_READONLY {
186 assert(is(tok::html_ident));
187 return StringRef(TextPtr, IntVal);
/// Stores \p Name as the identifier of a tok::html_ident token.
///
/// Asserts that the token kind is tok::html_ident. Records the data pointer
/// in TextPtr and the size in IntVal without copying the characters.
190 void setHTMLIdent(StringRef Name) {
191 assert(is(tok::html_ident));
192 TextPtr = Name.data();
193 IntVal = Name.size();
/// Returns the string carried by a tok::html_quoted_string token.
///
/// Asserts that the token kind is tok::html_quoted_string. The result is a
/// StringRef of IntVal characters starting at TextPtr.
196 StringRef getHTMLQuotedString() const LLVM_READONLY {
197 assert(is(tok::html_quoted_string));
198 return StringRef(TextPtr, IntVal);
/// Stores \p Str as the value of a tok::html_quoted_string token.
///
/// Asserts that the token kind is tok::html_quoted_string and records the
/// data pointer of \p Str in TextPtr without copying the characters.
///
/// NOTE(review): unlike the sibling setters, no `IntVal = Str.size()`
/// assignment is visible in this excerpt. getHTMLQuotedString() reads IntVal
/// as the string length, so confirm against the full file that the length is
/// stored here as well.
201 void setHTMLQuotedString(StringRef Str) {
202 assert(is(tok::html_quoted_string));
203 TextPtr = Str.data();
/// Returns the tag name of an HTML end tag token.
///
/// Asserts that the token kind is tok::html_end_tag. The result is a
/// StringRef of IntVal characters starting at TextPtr.
207 StringRef getHTMLTagEndName() const LLVM_READONLY {
208 assert(is(tok::html_end_tag));
209 return StringRef(TextPtr, IntVal);
/// Stores \p Name as the tag name of a tok::html_end_tag token.
///
/// Asserts that the token kind is tok::html_end_tag. Records the data pointer
/// in TextPtr and the size in IntVal without copying the characters.
212 void setHTMLTagEndName(StringRef Name) {
213 assert(is(tok::html_end_tag));
214 TextPtr = Name.data();
215 IntVal = Name.size();
218 void dump(const Lexer &L, const SourceManager &SM) const;
221 /// \brief Comment lexer.
/// Lexer objects are not copyable: both the copy constructor and the copy
/// assignment operator are explicitly deleted.
224 Lexer(const Lexer &) = delete;
225 void operator=(const Lexer &) = delete;
227 /// Allocator for strings that are semantic values of tokens and have to be
228 /// computed (for example, resolved decimal character references).
229 llvm::BumpPtrAllocator &Allocator;
231 DiagnosticsEngine &Diags;
233 const CommandTraits &Traits;
235 const char *const BufferStart;
236 const char *const BufferEnd;
237 SourceLocation FileLoc;
239 const char *BufferPtr;
241 /// One past end pointer for the current comment. For BCPL comments points
242 /// to newline or BufferEnd, for C comments points to star in '*/'.
243 const char *CommentEnd;
245 enum LexerCommentState {
247 LCS_InsideBCPLComment,
252 /// Low-level lexer state, track if we are inside or outside of comment.
253 LexerCommentState CommentState;
256 /// Lexing normal comment text
259 /// Finished lexing verbatim block beginning command, will lex first body
261 LS_VerbatimBlockFirstLine,
263 /// Lexing verbatim block body line-by-line, skipping line-starting
265 LS_VerbatimBlockBody,
267 /// Finished lexing verbatim line beginning command, will lex text (one
271 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
274 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
278 /// Current lexing mode.
281 /// If State is LS_VerbatimBlockFirstLine or LS_VerbatimBlockBody, contains the
282 /// name of the verbatim end command, including the command marker.
283 SmallString<16> VerbatimBlockEndCommandName;
285 /// Given a character reference name (e.g., "lt"), return the character that
286 /// it stands for (e.g., "<").
287 StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
289 /// Given a Unicode codepoint as base-10 integer, return the character.
290 StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
292 /// Given a Unicode codepoint as base-16 integer, return the character.
293 StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
295 void formTokenWithChars(Token &Result, const char *TokEnd,
296 tok::TokenKind Kind);
/// Forms a tok::text token covering the characters from BufferPtr up to (but
/// not including) \p TokEnd and stores it in \p Result.
///
/// \param Result token to fill in.
/// \param TokEnd pointer one past the last character of the token text.
298 void formTextToken(Token &Result, const char *TokEnd) {
// Capture the text before forming the token -- presumably because
// formTokenWithChars updates BufferPtr; confirm against its definition.
299 StringRef Text(BufferPtr, TokEnd - BufferPtr);
300 formTokenWithChars(Result, TokEnd, tok::text);
// setText asserts the kind is tok::text, so it must follow the call above
// that is passed tok::text as the kind.
301 Result.setText(Text);
/// Translates a pointer into the lexed buffer into a SourceLocation.
///
/// Asserts that \p Loc lies within [BufferStart, BufferEnd] (BufferEnd itself
/// is accepted). The result is FileLoc advanced by the distance of \p Loc
/// from BufferStart.
///
/// \param Loc pointer into the buffer being lexed.
304 SourceLocation getSourceLocation(const char *Loc) const {
305 assert(Loc >= BufferStart && Loc <= BufferEnd &&
306 "Location out of range for this buffer!");
// Pointer difference narrowed to unsigned; non-negative given the assert.
308 const unsigned CharNo = Loc - BufferStart;
309 return FileLoc.getLocWithOffset(CharNo);
/// Convenience wrapper: reports diagnostic \p DiagID at \p Loc through the
/// Diags engine and returns the DiagnosticBuilder produced by that call.
312 DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
313 return Diags.Report(Loc, DiagID);
316 /// Eat string matching regexp \code \s*\* \endcode.
317 void skipLineStartingDecorations();
319 /// Lex stuff inside comments. CommentEnd should be set correctly.
320 void lexCommentText(Token &T);
322 void setupAndLexVerbatimBlock(Token &T,
323 const char *TextBegin,
324 char Marker, const CommandInfo *Info);
326 void lexVerbatimBlockFirstLine(Token &T);
328 void lexVerbatimBlockBody(Token &T);
330 void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
331 const CommandInfo *Info);
333 void lexVerbatimLineText(Token &T);
335 void lexHTMLCharacterReference(Token &T);
337 void setupAndLexHTMLStartTag(Token &T);
339 void lexHTMLStartTag(Token &T);
341 void setupAndLexHTMLEndTag(Token &T);
343 void lexHTMLEndTag(Token &T);
346 Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
347 const CommandTraits &Traits,
348 SourceLocation FileLoc,
349 const char *BufferStart, const char *BufferEnd);
353 StringRef getSpelling(const Token &Tok,
354 const SourceManager &SourceMgr,
355 bool *Invalid = nullptr) const;
358 } // end namespace comments
359 } // end namespace clang