1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file defines the lexer for structured comments and the supporting token class.
12 //===----------------------------------------------------------------------===//
14 #ifndef LLVM_CLANG_AST_COMMENT_LEXER_H
15 #define LLVM_CLANG_AST_COMMENT_LEXER_H
17 #include "clang/Basic/SourceManager.h"
18 #include "llvm/ADT/StringRef.h"
19 #include "llvm/ADT/SmallString.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/Support/Allocator.h"
22 #include "llvm/Support/raw_ostream.h"
28 class TextTokenRetokenizer;
37 unknown_command, // Command that does not have an ID.
38 command, // Command with an ID.
44 html_start_tag, // <tag
47 html_quoted_string, // "blah\"blah" or 'blah\'blah'
49 html_slash_greater, // />
52 } // end namespace tok
54 /// \brief Comment token.
57 friend class TextTokenRetokenizer;
59 /// The location of the token.
62 /// The actual kind of the token.
65 /// Length of the token spelling in comment. Can be 0 for synthesized
69 /// Contains text value associated with a token.
72 /// Integer value associated with a token.
74 /// If the token is a known command, contains command ID and TextPtr is
75 /// unused (command spelling can be found with CommandTraits). Otherwise,
76 /// contains the length of the string that starts at TextPtr.
80 SourceLocation getLocation() const LLVM_READONLY { return Loc; }
81 void setLocation(SourceLocation SL) { Loc = SL; }
83 SourceLocation getEndLocation() const LLVM_READONLY {
84 if (Length == 0 || Length == 1)
86 return Loc.getLocWithOffset(Length - 1);
89 tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
90 void setKind(tok::TokenKind K) { Kind = K; }
92 bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
93 bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
95 unsigned getLength() const LLVM_READONLY { return Length; }
96 void setLength(unsigned L) { Length = L; }
98 StringRef getText() const LLVM_READONLY {
99 assert(is(tok::text));
100 return StringRef(TextPtr, IntVal);
103 void setText(StringRef Text) {
104 assert(is(tok::text));
105 TextPtr = Text.data();
106 IntVal = Text.size();
109 StringRef getUnknownCommandName() const LLVM_READONLY {
110 assert(is(tok::unknown_command));
111 return StringRef(TextPtr, IntVal);
114 void setUnknownCommandName(StringRef Name) {
115 assert(is(tok::unknown_command));
116 TextPtr = Name.data();
117 IntVal = Name.size();
120 unsigned getCommandID() const LLVM_READONLY {
121 assert(is(tok::command));
125 void setCommandID(unsigned ID) {
126 assert(is(tok::command));
/// Returns the ID associated with a verbatim block begin/end token.
/// Only valid for tok::verbatim_block_begin and tok::verbatim_block_end
/// tokens (asserted).
/// NOTE(review): the return statement is not visible in this listing;
/// presumably the ID is read from IntVal -- confirm against the full file.
130 unsigned getVerbatimBlockID() const LLVM_READONLY {
131 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
/// Sets the ID associated with a verbatim block begin/end token.
/// Only valid for tok::verbatim_block_begin and tok::verbatim_block_end
/// tokens (asserted).
/// NOTE(review): the statement that stores ID is not visible in this listing;
/// presumably it is written to IntVal -- confirm against the full file.
135 void setVerbatimBlockID(unsigned ID) {
136 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
140 StringRef getVerbatimBlockText() const LLVM_READONLY {
141 assert(is(tok::verbatim_block_line));
142 return StringRef(TextPtr, IntVal);
145 void setVerbatimBlockText(StringRef Text) {
146 assert(is(tok::verbatim_block_line));
147 TextPtr = Text.data();
148 IntVal = Text.size();
/// Returns the ID associated with a tok::verbatim_line_name token.
/// Only valid for that token kind (asserted).
/// NOTE(review): the return statement is not visible in this listing;
/// presumably the ID is read from IntVal -- confirm against the full file.
151 unsigned getVerbatimLineID() const LLVM_READONLY {
152 assert(is(tok::verbatim_line_name));
/// Sets the ID associated with a tok::verbatim_line_name token.
/// Only valid for that token kind (asserted).
/// NOTE(review): the statement that stores ID is not visible in this listing;
/// presumably it is written to IntVal -- confirm against the full file.
156 void setVerbatimLineID(unsigned ID) {
157 assert(is(tok::verbatim_line_name));
161 StringRef getVerbatimLineText() const LLVM_READONLY {
162 assert(is(tok::verbatim_line_text));
163 return StringRef(TextPtr, IntVal);
166 void setVerbatimLineText(StringRef Text) {
167 assert(is(tok::verbatim_line_text));
168 TextPtr = Text.data();
169 IntVal = Text.size();
172 StringRef getHTMLTagStartName() const LLVM_READONLY {
173 assert(is(tok::html_start_tag));
174 return StringRef(TextPtr, IntVal);
177 void setHTMLTagStartName(StringRef Name) {
178 assert(is(tok::html_start_tag));
179 TextPtr = Name.data();
180 IntVal = Name.size();
183 StringRef getHTMLIdent() const LLVM_READONLY {
184 assert(is(tok::html_ident));
185 return StringRef(TextPtr, IntVal);
188 void setHTMLIdent(StringRef Name) {
189 assert(is(tok::html_ident));
190 TextPtr = Name.data();
191 IntVal = Name.size();
194 StringRef getHTMLQuotedString() const LLVM_READONLY {
195 assert(is(tok::html_quoted_string));
196 return StringRef(TextPtr, IntVal);
199 void setHTMLQuotedString(StringRef Str) {
200 assert(is(tok::html_quoted_string));
201 TextPtr = Str.data();
205 StringRef getHTMLTagEndName() const LLVM_READONLY {
206 assert(is(tok::html_end_tag));
207 return StringRef(TextPtr, IntVal);
210 void setHTMLTagEndName(StringRef Name) {
211 assert(is(tok::html_end_tag));
212 TextPtr = Name.data();
213 IntVal = Name.size();
/// Dumps this token; the Lexer and SourceManager are passed as read-only
/// context. NOTE(review): presumably a debugging aid -- only the declaration
/// is visible here, so the output format and destination are not known from
/// this file.
216 void dump(const Lexer &L, const SourceManager &SM) const;
219 /// \brief Comment lexer.
// Copy construction and copy assignment are declared with
// LLVM_DELETED_FUNCTION and are not defined in this file.
// NOTE(review): LLVM_DELETED_FUNCTION is defined elsewhere; presumably it
// marks the function as deleted where the compiler supports it, making Lexer
// non-copyable -- confirm against the macro's definition.
222 Lexer(const Lexer &) LLVM_DELETED_FUNCTION;
223 void operator=(const Lexer &) LLVM_DELETED_FUNCTION;
225 /// Allocator for strings that are semantic values of tokens and have to be
226 /// computed (for example, resolved decimal character references).
227 llvm::BumpPtrAllocator &Allocator;
229 const CommandTraits &Traits;
231 const char *const BufferStart;
232 const char *const BufferEnd;
233 SourceLocation FileLoc;
235 const char *BufferPtr;
237 /// One past end pointer for the current comment. For BCPL comments points
238 /// to newline or BufferEnd, for C comments points to star in '*/'.
239 const char *CommentEnd;
241 enum LexerCommentState {
243 LCS_InsideBCPLComment,
248 /// Low-level lexer state, track if we are inside or outside of comment.
249 LexerCommentState CommentState;
252 /// Lexing normal comment text
255 /// Finished lexing verbatim block beginning command, will lex first body
257 LS_VerbatimBlockFirstLine,
259 /// Lexing verbatim block body line-by-line, skipping line-starting
261 LS_VerbatimBlockBody,
263 /// Finished lexing verbatim line beginning command, will lex text (one
267 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
270 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
274 /// Current lexing mode.
277 /// If State is LS_VerbatimBlockFirstLine or LS_VerbatimBlockBody, contains
278 /// the name of the verbatim end command, including the command marker.
279 SmallString<16> VerbatimBlockEndCommandName;
281 /// Given a character reference name (e.g., "lt"), return the character that
282 /// it stands for (e.g., "<").
283 StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
285 /// Given a Unicode codepoint as base-10 integer, return the character.
286 StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
288 /// Given a Unicode codepoint as base-16 integer, return the character.
289 StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
/// Fills \p Result as a token of kind \p Kind whose spelling runs from the
/// current BufferPtr up to (but not including) \p TokEnd: sets its location,
/// kind and length.
/// NOTE(review): the embedded listing numbers skip 297 and 299-303, so this
/// view may be missing statements as well as the closing brace; do not treat
/// the lines below as the complete body.
291 void formTokenWithChars(Token &Result, const char *TokEnd,
292 tok::TokenKind Kind) {
// Spelling length is the distance from the current position to TokEnd.
293 const unsigned TokLen = TokEnd - BufferPtr;
294 Result.setLocation(getSourceLocation(BufferPtr));
295 Result.setKind(Kind);
296 Result.setLength(TokLen);
// Writes a placeholder directly into Token::TextPtr.
298 Result.TextPtr = "<UNSET>";
304 void formTextToken(Token &Result, const char *TokEnd) {
305 StringRef Text(BufferPtr, TokEnd - BufferPtr);
306 formTokenWithChars(Result, TokEnd, tok::text);
307 Result.setText(Text);
310 SourceLocation getSourceLocation(const char *Loc) const {
311 assert(Loc >= BufferStart && Loc <= BufferEnd &&
312 "Location out of range for this buffer!");
314 const unsigned CharNo = Loc - BufferStart;
315 return FileLoc.getLocWithOffset(CharNo);
318 /// Eat string matching regexp \code \s*\* \endcode.
319 void skipLineStartingDecorations();
321 /// Lex stuff inside comments. CommentEnd should be set correctly.
322 void lexCommentText(Token &T);
// Lexing helpers for verbatim blocks, verbatim lines, HTML character
// references and HTML start/end tags. Each takes the Token to fill by
// non-const reference.
// NOTE(review): only declarations are visible here. The names suggest that
// the lexXxx routines correspond to the lexer states of the same name (e.g.
// lexVerbatimBlockBody to LS_VerbatimBlockBody) and that the setupAndLexXxx
// routines enter such a state before lexing -- confirm against the
// implementation file.
324 void setupAndLexVerbatimBlock(Token &T,
325 const char *TextBegin,
326 char Marker, const CommandInfo *Info);
328 void lexVerbatimBlockFirstLine(Token &T);
330 void lexVerbatimBlockBody(Token &T);
332 void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
333 const CommandInfo *Info);
335 void lexVerbatimLineText(Token &T);
337 void lexHTMLCharacterReference(Token &T);
339 void setupAndLexHTMLStartTag(Token &T);
341 void lexHTMLStartTag(Token &T);
343 void setupAndLexHTMLEndTag(Token &T);
345 void lexHTMLEndTag(Token &T);
/// Constructs a lexer over the character range [BufferStart, BufferEnd).
/// NOTE(review): the parameters share their names with the Allocator, Traits,
/// FileLoc, BufferStart and BufferEnd members, so presumably each initializes
/// the member of the same name; the definition is not visible here -- confirm.
348 Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
349 SourceLocation FileLoc,
350 const char *BufferStart, const char *BufferEnd);
/// Returns the spelling of \p Tok, using \p SourceMgr to reach the source
/// text. \p Invalid is an optional out-parameter (defaults to NULL).
/// NOTE(review): presumably *Invalid is set when the spelling cannot be
/// retrieved; the definition is not visible here -- confirm.
354 StringRef getSpelling(const Token &Tok,
355 const SourceManager &SourceMgr,
356 bool *Invalid = NULL) const;
359 } // end namespace comments
360 } // end namespace clang