1 //===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
8 // Record tokens that a preprocessor emits and define operations to map between
9 // the tokens written in a file and tokens produced by the preprocessor.
11 // When running the compiler, there are two token streams we are interested in:
12 // - "spelled" tokens directly correspond to a substring written in some
14 // - "expanded" tokens represent the result of preprocessing; the parser consumes
15 // this token stream to produce the AST.
17 // Expanded tokens correspond directly to locations found in the AST, allowing
18 // to find subranges of the token stream covered by various AST nodes. Spelled
19 // tokens correspond directly to the source code written by the user.
21 // To allow composing these two use-cases, we also define operations that map
22 // between expanded and spelled tokens that produced them (macro calls,
25 //===----------------------------------------------------------------------===//
27 #ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
28 #define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
30 #include "clang/Basic/LangOptions.h"
31 #include "clang/Basic/SourceLocation.h"
32 #include "clang/Basic/SourceManager.h"
33 #include "clang/Basic/TokenKinds.h"
34 #include "clang/Lex/Token.h"
35 #include "llvm/ADT/ArrayRef.h"
36 #include "llvm/ADT/DenseMap.h"
37 #include "llvm/ADT/Optional.h"
38 #include "llvm/ADT/StringRef.h"
39 #include "llvm/Support/Compiler.h"
40 #include "llvm/Support/raw_ostream.h"
49 /// A half-open character range inside a particular file, the start offset is
50 /// included and the end offset is excluded from the range.
52 /// EXPECTS: File.isValid() && Begin <= End.
53 FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
54 /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
55 FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
56 /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files
58 FileRange(const SourceManager &SM, SourceLocation BeginLoc,
59 SourceLocation EndLoc);
61 FileID file() const { return File; }
62 /// Start is a start offset (inclusive) in the corresponding file.
63 unsigned beginOffset() const { return Begin; }
64 /// End offset (exclusive) in the corresponding file.
65 unsigned endOffset() const { return End; }
67 unsigned length() const { return End - Begin; }
69 /// Check if \p Offset is inside the range.
70 bool contains(unsigned Offset) const {
71 return Begin <= Offset && Offset < End;
73 /// Check \p Offset is inside the range or equal to its endpoint.
74 bool touches(unsigned Offset) const {
75 return Begin <= Offset && Offset <= End;
78 /// Gets the substring that this FileRange refers to.
79 llvm::StringRef text(const SourceManager &SM) const;
81 /// Convert to the clang range. The returned range is always a char range,
82 /// never a token range.
83 CharSourceRange toCharRange(const SourceManager &SM) const;
85 friend bool operator==(const FileRange &L, const FileRange &R) {
86 return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
88 friend bool operator!=(const FileRange &L, const FileRange &R) {
98 /// For debugging purposes.
99 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);
101 /// A token coming directly from a file or from a macro invocation. Has just
102 /// enough information to locate the token in the source code.
103 /// Can represent both expanded and spelled tokens.
106 Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
107 /// EXPECTS: clang::Token is not an annotation token.
108 explicit Token(const clang::Token &T);
110 tok::TokenKind kind() const { return Kind; }
111 /// Location of the first character of a token.
112 SourceLocation location() const { return Location; }
113 /// Location right after the last character of a token.
114 SourceLocation endLocation() const {
115 return Location.getLocWithOffset(Length);
117 unsigned length() const { return Length; }
119 /// Get the substring covered by the token. Note that will include all
120 /// digraphs, newline continuations, etc. E.g. tokens for 'int' and
123 /// both have the same kind tok::kw_int, but results of text() are different.
124 llvm::StringRef text(const SourceManager &SM) const;
126 /// Gets a range of this token.
127 /// EXPECTS: token comes from a file, not from a macro expansion.
128 FileRange range(const SourceManager &SM) const;
130 /// Given two tokens inside the same file, returns a file range that starts at
131 /// \p First and ends at \p Last.
132 /// EXPECTS: First and Last are file tokens from the same file, Last starts
134 static FileRange range(const SourceManager &SM, const syntax::Token &First,
135 const syntax::Token &Last);
137 std::string dumpForTests(const SourceManager &SM) const;
138 /// For debugging purposes.
139 std::string str() const;
142 SourceLocation Location;
146 /// For debugging purposes. Equivalent to a call to Token::str().
147 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);
149 /// A list of tokens obtained by preprocessing a text buffer and operations to
150 /// map between the expanded and spelled tokens, i.e. TokenBuffer has
151 /// information about two token streams:
152 /// 1. Expanded tokens: tokens produced by the preprocessor after all macro
154 /// 2. Spelled tokens: corresponding directly to the source code of a file
155 /// before any macro replacements occurred.
156 /// Here's an example to illustrate a difference between those two:
160 /// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
161 /// Expanded tokens are {'int','a','=','10',';','eof'}.
163 /// Note that the expanded token stream has a tok::eof token at the end, the
164 /// spelled tokens never store an 'eof' token.
166 /// The full list of expanded tokens can be obtained with expandedTokens(). Spelled
167 /// tokens for each of the files can be obtained via spelledTokens(FileID).
169 /// To map between the expanded and spelled tokens use findSpelledByExpanded().
171 /// To build a token buffer use the TokenCollector class. You can also compute
172 /// the spelled tokens of a file using the tokenize() helper.
174 /// FIXME: allow mappings into macro arguments.
177 TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}
179 TokenBuffer(TokenBuffer &&) = default;
180 TokenBuffer(const TokenBuffer &) = delete;
181 TokenBuffer &operator=(TokenBuffer &&) = default;
182 TokenBuffer &operator=(const TokenBuffer &) = delete;
184 /// All tokens produced by the preprocessor after all macro replacements,
185 /// directives, etc. Source locations found in the clang AST will always
186 /// point to one of these tokens.
187 /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()).
188 /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
189 /// into two '>' tokens by the parser. However, TokenBuffer currently
190 /// keeps it as a single '>>' token.
191 llvm::ArrayRef<syntax::Token> expandedTokens() const {
192 return ExpandedTokens;
195 /// Builds a cache to make future calls to expandedToken(SourceRange) faster.
196 /// Creates an index only once. Further calls to it will be no-op.
197 void indexExpandedTokens();
199 /// Returns the subrange of expandedTokens() corresponding to the closed
201 /// Consider calling indexExpandedTokens() before for faster lookups.
202 llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const;
204 /// Returns the subrange of spelled tokens corresponding to AST node spanning
205 /// \p Expanded. This is the text that should be replaced if a refactoring
206 /// were to rewrite the node. If \p Expanded is empty, the returned value is
209 /// Will fail if the expanded tokens do not correspond to a sequence of
210 /// spelled tokens. E.g. for the following example:
212 /// #define FIRST f1 f2 f3
213 /// #define SECOND s1 s2 s3
214 /// #define ID2(X, Y) X Y
216 /// a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
217 /// d ID2(e f g, h) i // expanded tokens are: d e f g h i
219 /// the results would be:
220 /// expanded => spelled
221 /// ------------------------
223 /// s1 s2 s3 => SECOND
224 /// a f1 f2 f3 => a FIRST
225 /// a f1 => can't map
226 /// s1 s2 => can't map
230 /// EXPECTS: \p Expanded is a subrange of expandedTokens().
231 /// Complexity is logarithmic.
232 llvm::Optional<llvm::ArrayRef<syntax::Token>>
233 spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;
235 /// Find the subranges of expanded tokens, corresponding to \p Spelled.
237 /// Some spelled tokens may not be present in the expanded token stream, so
238 /// this function can return an empty vector, e.g. for tokens of macro
239 /// directives or disabled preprocessor branches.
241 /// Some spelled tokens can be duplicated in the expanded token stream
242 /// multiple times and this function will return multiple results in those
243 /// cases. This happens when \p Spelled is inside a macro argument.
245 /// FIXME: return correct results on macro arguments. For now, we return an
248 /// (!) will return empty vector on tokens from #define body:
249 /// E.g. for the following example:
251 /// #define FIRST(A) f1 A = A f2
254 /// a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s
255 /// The results would be
256 /// spelled => expanded
257 /// ------------------------
258 /// #define FIRST => {}
259 /// a FIRST(arg) => {a f1 arg = arg f2}
260 /// arg => {arg, arg} // arg #1 is before `=` and arg #2 is
261 /// // after `=` in the expanded tokens.
262 llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1>
263 expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;
265 /// An expansion produced by the preprocessor, includes macro expansions and
266 /// preprocessor directives. Preprocessor always maps a non-empty range of
267 /// spelled tokens to a (possibly empty) range of expanded tokens. Here is a
268 /// few examples of expansions:
269 /// #pragma once // Expands to an empty range.
270 /// #define FOO 1 2 3 // Expands to an empty range.
271 /// FOO // Expands to "1 2 3".
272 /// FIXME(ibiryukov): implement this, currently #include expansions are empty.
273 /// #include <vector> // Expands to tokens produced by the include.
275 llvm::ArrayRef<syntax::Token> Spelled;
276 llvm::ArrayRef<syntax::Token> Expanded;
278 /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
279 /// a preprocessor directive) return the subrange of expanded tokens that the
280 /// macro expands to.
281 llvm::Optional<Expansion>
282 expansionStartingAt(const syntax::Token *Spelled) const;
283 /// Returns all expansions (partially) expanded from the specified tokens.
285 /// These are the expansions whose Spelled range intersects \p Spelled.
285 std::vector<Expansion>
286 expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const;
288 /// Lexed tokens of a file before preprocessing. E.g. for the following input
289 /// #define DECL(name) int name = 10
291 /// spelledTokens() returns
292 /// {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10",
293 /// "DECL", "(", "a", ")", ";"}
294 llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const;
296 /// Returns the spelled Token starting at Loc, if there are no such tokens
298 const syntax::Token *spelledTokenAt(SourceLocation Loc) const;
300 /// Get all tokens that expand a macro in \p FID. For the following input
302 /// #define FOO2(X) int X
306 /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5
308 std::vector<const syntax::Token *> macroExpansions(FileID FID) const;
310 const SourceManager &sourceManager() const { return *SourceMgr; }
312 std::string dumpForTests() const;
315 /// Describes a mapping between a continuous subrange of spelled tokens and
316 /// expanded tokens. Represents macro expansions, preprocessor directives,
317 /// conditionally disabled pp regions, etc.
319 /// #define BAR(a) a + 1
320 /// FOO // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
321 /// BAR(1) // invocation #2, tokens = {'a', '+', '1'},
322 /// macroTokens = {'BAR', '(', '1', ')'}.
324 // Positions in the corresponding spelled token stream. The corresponding
325 // range is never empty.
326 unsigned BeginSpelled = 0;
327 unsigned EndSpelled = 0;
328 // Positions in the expanded token stream. The corresponding range can be
330 unsigned BeginExpanded = 0;
331 unsigned EndExpanded = 0;
333 /// For debugging purposes.
334 std::string str() const;
336 /// Spelled tokens of the file with information about the subranges.
338 /// Lexed, but not preprocessed, tokens of the file. These map directly to
339 /// text in the corresponding files and include tokens of all preprocessor
341 /// FIXME: spelled tokens don't change across FileID that map to the same
342 /// FileEntry. We could consider deduplicating them to save memory.
343 std::vector<syntax::Token> SpelledTokens;
344 /// A sorted list to convert between the spelled and expanded token streams.
345 std::vector<Mapping> Mappings;
346 /// The first expanded token produced for this FileID.
347 unsigned BeginExpanded = 0;
348 unsigned EndExpanded = 0;
351 friend class TokenCollector;
353 /// Maps a single expanded token to its spelled counterpart or a mapping that
355 std::pair<const syntax::Token *, const Mapping *>
356 spelledForExpandedToken(const syntax::Token *Expanded) const;
358 /// Returns a mapping starting before \p Spelled token, or nullptr if no
359 /// such mapping exists.
360 static const Mapping *
361 mappingStartingBeforeSpelled(const MarkedFile &F,
362 const syntax::Token *Spelled);
364 /// Convert a private Mapping to a public Expansion.
365 Expansion makeExpansion(const MarkedFile &, const Mapping &) const;
366 /// Returns the file that the Spelled tokens are taken from.
367 /// Asserts that they are non-empty, from a tracked file, and in-bounds.
368 const MarkedFile &fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;
370 /// Token stream produced after preprocessing, conceptually this captures the
371 /// same stream as 'clang -E' (excluding the preprocessor directives like
373 std::vector<syntax::Token> ExpandedTokens;
374 // Index of ExpandedTokens for faster lookups by SourceLocation.
375 llvm::DenseMap<SourceLocation, unsigned> ExpandedTokIndex;
376 llvm::DenseMap<FileID, MarkedFile> Files;
377 // The value is never null, pointer instead of reference to avoid disabling
378 // implicit assignment operator.
379 const SourceManager *SourceMgr;
382 /// The spelled tokens that overlap or touch a spelling location Loc.
383 /// This always returns 0-2 tokens.
384 llvm::ArrayRef<syntax::Token>
385 spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens);
386 llvm::ArrayRef<syntax::Token>
387 spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens);
389 /// The identifier token that overlaps or touches a spelling location Loc.
390 /// If there is none, returns nullptr.
391 const syntax::Token *
392 spelledIdentifierTouching(SourceLocation Loc,
393 llvm::ArrayRef<syntax::Token> Tokens);
394 const syntax::Token *
395 spelledIdentifierTouching(SourceLocation Loc,
396 const syntax::TokenBuffer &Tokens);
398 /// Lex the text buffer, corresponding to \p FID, in raw mode and record the
399 /// resulting spelled tokens. Does minimal post-processing on raw identifiers,
400 /// setting the appropriate token kind (instead of the raw_identifier reported
401 /// by lexer in raw mode). This is a very low-level function, most users should
402 /// prefer to use TokenCollector. Lexing in raw mode produces wildly different
403 /// results from what one might expect when running a C++ frontend, e.g.
404 /// preprocessor does not run at all.
405 /// The result will *not* have an 'eof' token at the end.
406 std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
407 const LangOptions &LO);
408 /// Similar to the one above, but tokenizes only a part of the file. Note
409 /// that the first token might be incomplete if FR.startOffset is not at the
410 /// beginning of a token, and the last token returned will start before the
411 /// FR.endOffset but might end after it.
412 std::vector<syntax::Token>
413 tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO);
415 /// Collects tokens for the main file while running the frontend action. An
416 /// instance of this object should be created on
417 /// FrontendAction::BeginSourceFile() and the results should be consumed after
418 /// FrontendAction::Execute() finishes.
419 class TokenCollector {
421 /// Adds the hooks to collect the tokens. Should be called before the
422 /// preprocessing starts, i.e. as a part of BeginSourceFile() or
423 /// CreateASTConsumer().
424 TokenCollector(Preprocessor &P);
426 /// Finalizes token collection. Should be called after preprocessing is
427 /// finished, i.e. after running Execute().
428 LLVM_NODISCARD TokenBuffer consume() &&;
431 /// Maps from a start to an end spelling location of transformations
432 /// performed by the preprocessor. These include:
433 /// 1. range from '#' to the last token in the line for PP directives,
434 /// 2. macro name and arguments for macro expansions.
435 /// Note that we record only top-level macro expansions, intermediate
436 /// expansions (e.g. inside macro arguments) are ignored.
438 /// Used to find correct boundaries of macro calls and directives when
439 /// building mappings from spelled to expanded tokens.
441 /// Logically, at each point of the preprocessor execution there is a stack of
442 /// macro expansions being processed and we could use it to recover the
443 /// location information we need. However, the public preprocessor API only
444 /// exposes the points when macro expansions start (when we push a macro onto
445 /// the stack) and not when they end (when we pop a macro from the stack).
446 /// To workaround this limitation, we rely on source location information
447 /// stored in this map.
448 using PPExpansions = llvm::DenseMap<SourceLocation, SourceLocation>;
450 class CollectPPExpansions;
452 std::vector<syntax::Token> Expanded;
453 // FIXME: we only store macro expansions, also add directives(#pragma, etc.)
454 PPExpansions Expansions;
456 CollectPPExpansions *Collector;
459 } // namespace syntax