1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
13 //===----------------------------------------------------------------------===//
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
17 #include "clang/Basic/SourceLocation.h"
18 #include "clang/Basic/SourceManager.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
25 FormatTokenLexer::FormatTokenLexer(
26 const SourceManager &SourceMgr, FileID ID, unsigned Column,
27 const FormatStyle &Style, encoding::Encoding Encoding,
28 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29 IdentifierTable &IdentTable)
30 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
31 Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
32 Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
33 Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
34 FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
35 MacroBlockEndRegex(Style.MacroBlockEnd) {
36 Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr,
37 getFormattingLangOpts(Style)));
38 Lex->SetKeepWhitespaceMode(true);
40 for (const std::string &ForEachMacro : Style.ForEachMacros)
41 Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro});
42 for (const std::string &AttributeMacro : Style.AttributeMacros)
43 Macros.insert({&IdentTable.get(AttributeMacro), TT_AttributeMacro});
44 for (const std::string &StatementMacro : Style.StatementMacros)
45 Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro});
46 for (const std::string &TypenameMacro : Style.TypenameMacros)
47 Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro});
48 for (const std::string &NamespaceMacro : Style.NamespaceMacros)
49 Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro});
50 for (const std::string &WhitespaceSensitiveMacro :
51 Style.WhitespaceSensitiveMacros) {
53 {&IdentTable.get(WhitespaceSensitiveMacro), TT_UntouchableMacroFunc});
55 for (const std::string &StatementAttributeLikeMacro :
56 Style.StatementAttributeLikeMacros)
57 Macros.insert({&IdentTable.get(StatementAttributeLikeMacro),
58 TT_StatementAttributeLikeMacro});
61 ArrayRef<FormatToken *> FormatTokenLexer::lex() {
62 assert(Tokens.empty());
63 assert(FirstInLineIndex == 0);
65 Tokens.push_back(getNextToken());
66 if (Style.Language == FormatStyle::LK_JavaScript) {
67 tryParseJSRegexLiteral();
68 handleTemplateStrings();
70 if (Style.Language == FormatStyle::LK_TextProto)
71 tryParsePythonComment();
72 tryMergePreviousTokens();
74 // This needs to come after tokens have been merged so that C#
75 // string literals are correctly identified.
76 handleCSharpVerbatimAndInterpolatedStrings();
77 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
78 FirstInLineIndex = Tokens.size() - 1;
79 } while (Tokens.back()->Tok.isNot(tok::eof));
// Tries, in a fixed order, every merge/transform helper that can fold the
// most recently lexed tokens at the end of `Tokens` into a single token.
// The language-independent helpers come first, followed by the C++-only,
// C#-only, Objective-C string, JavaScript-only and Java-only ones.
// NOTE(review): the statement executed after each successful helper is not
// visible in this view; presumably the function returns so that at most one
// merge happens per call -- confirm against the full file.
83 void FormatTokenLexer::tryMergePreviousTokens() {
84 if (tryMerge_TMacro())
86 if (tryMergeConflictMarkers())
88 if (tryMergeLessLess())
90 if (tryMergeForEach())
92 if (Style.isCpp() && tryTransformTryUsageForC())
// C#-specific merges.
95 if (Style.isCSharp()) {
96 if (tryMergeCSharpKeywordVariables())
98 if (tryMergeCSharpStringLiteral())
100 if (tryMergeCSharpDoubleQuestion())
102 if (tryMergeCSharpNullConditional())
104 if (tryTransformCSharpForEach())
// '=' followed by '>' is merged into one token typed TT_JsFatArrow, the
// same type the JavaScript branch below uses for this token pair.
106 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
107 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
111 if (tryMergeNSStringLiteral())
// JavaScript-specific operators that the raw lexer delivers as several
// tokens. Each array lists the token kinds of one such sequence.
114 if (Style.Language == FormatStyle::LK_JavaScript) {
115 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
116 static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
118 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
120 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
121 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
122 static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
124 static const tok::TokenKind JSNullPropagatingOperator[] = {tok::question,
126 static const tok::TokenKind JSNullishOperator[] = {tok::question,
128 static const tok::TokenKind JSNullishEqual[] = {tok::question,
129 tok::question, tok::equal};
130 static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
131 static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
133 // FIXME: Investigate what token type gives the correct operator priority.
134 if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
136 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
138 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
140 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
142 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
// The merged exponentiation-assignment token is given the kind of '*='.
144 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
145 Tokens.back()->Tok.setKind(tok::starequal);
148 if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) {
149 // Treat like the "||" operator (as opposed to the ternary ?).
150 Tokens.back()->Tok.setKind(tok::pipepipe);
153 if (tryMergeTokens(JSNullPropagatingOperator,
154 TT_JsNullPropagatingOperator)) {
155 // Treat like a regular "." access.
156 Tokens.back()->Tok.setKind(tok::period);
// The three logical-assignment forms share one post-processing step.
159 if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
160 tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual) ||
161 tryMergeTokens(JSNullishEqual, TT_JsNullishCoalescingEqual)) {
162 // Treat like the "=" assignment operator.
163 Tokens.back()->Tok.setKind(tok::equal);
166 if (tryMergeJSPrivateIdentifier())
// Java: '>' '>' '>=' is merged into a single binary operator token.
170 if (Style.Language == FormatStyle::LK_Java) {
171 static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
172 tok::greater, tok::greater, tok::greaterequal};
173 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
178 bool FormatTokenLexer::tryMergeNSStringLiteral() {
179 if (Tokens.size() < 2)
181 auto &At = *(Tokens.end() - 2);
182 auto &String = *(Tokens.end() - 1);
183 if (!At->is(tok::at) || !String->is(tok::string_literal))
185 At->Tok.setKind(tok::string_literal);
186 At->TokenText = StringRef(At->TokenText.begin(),
187 String->TokenText.end() - At->TokenText.begin());
188 At->ColumnWidth += String->ColumnWidth;
189 At->setType(TT_ObjCStringLiteral);
190 Tokens.erase(Tokens.end() - 1);
194 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
195 // Merges #idenfier into a single identifier with the text #identifier
196 // but the token tok::identifier.
197 if (Tokens.size() < 2)
199 auto &Hash = *(Tokens.end() - 2);
200 auto &Identifier = *(Tokens.end() - 1);
201 if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
203 Hash->Tok.setKind(tok::identifier);
205 StringRef(Hash->TokenText.begin(),
206 Identifier->TokenText.end() - Hash->TokenText.begin());
207 Hash->ColumnWidth += Identifier->ColumnWidth;
208 Hash->setType(TT_JsPrivateIdentifier);
209 Tokens.erase(Tokens.end() - 1);
213 // Search for verbatim or interpolated string literals @"ABC" or
214 // $"aaaaa{abc}aaaaa" and mark the token as TT_CSharpStringLiteral, to
215 // prevent splitting of @, $ and ".
216 // Merging of multiline verbatim strings with embedded '"' is handled in
217 // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
//
// The function has two phases, both operating on the tail of `Tokens`:
//  1. If the second-to-last token is already an interpolated C# string that
//     still has an unmatched '{', the last token is appended to it.
//  2. Otherwise a '@' or '$' (or '$' '@') prefix directly before a string
//     literal is merged with that literal.
218 bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
219 if (Tokens.size() < 2)
222 // Interpolated strings could contain { } with " characters inside.
224 // should not be split into $"{x ?? ", null, "}" but should be treated as a
225 // single string-literal.
227 // We opt not to try and format expressions inside {} within a C#
228 // interpolated string. Formatting expressions within an interpolated string
229 // would require similar work as that done for JavaScript template strings
230 // in `handleTemplateStrings()`.
231 auto &CSharpInterpolatedString = *(Tokens.end() - 2);
232 if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral &&
233 (CSharpInterpolatedString->TokenText.startswith(R"($")") ||
234 CSharpInterpolatedString->TokenText.startswith(R"($@")"))) {
// Counts '{' that have not been closed by a '}' yet. Signed, because it is
// decremented for every unescaped '}' that is seen.
235 int UnmatchedOpeningBraceCount = 0;
237 auto TokenTextSize = CSharpInterpolatedString->TokenText.size();
238 for (size_t Index = 0; Index < TokenTextSize; ++Index) {
239 char C = CSharpInterpolatedString->TokenText[Index];
241 // "{{" inside an interpolated string is an escaped '{' so skip it.
242 if (Index + 1 < TokenTextSize &&
243 CSharpInterpolatedString->TokenText[Index + 1] == '{') {
247 ++UnmatchedOpeningBraceCount;
248 } else if (C == '}') {
249 // "}}" inside an interpolated string is an escaped '}' so skip it.
250 if (Index + 1 < TokenTextSize &&
251 CSharpInterpolatedString->TokenText[Index + 1] == '}') {
255 --UnmatchedOpeningBraceCount;
// An open '{' means the lexer split the string at a '"' inside the
// interpolation: extend the string token over the following token.
259 if (UnmatchedOpeningBraceCount > 0) {
260 auto &NextToken = *(Tokens.end() - 1);
261 CSharpInterpolatedString->TokenText =
262 StringRef(CSharpInterpolatedString->TokenText.begin(),
263 NextToken->TokenText.end() -
264 CSharpInterpolatedString->TokenText.begin());
265 CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth;
266 Tokens.erase(Tokens.end() - 1);
271 // Look for @"aaaaaa" or $"aaaaaa".
272 auto &String = *(Tokens.end() - 1);
273 if (!String->is(tok::string_literal))
276 auto &At = *(Tokens.end() - 2);
277 if (!(At->is(tok::at) || At->TokenText == "$"))
280 if (Tokens.size() > 2 && At->is(tok::at)) {
281 auto &Dollar = *(Tokens.end() - 3);
282 if (Dollar->TokenText == "$") {
283 // This looks like $@"aaaaa" so we need to combine all 3 tokens.
284 Dollar->Tok.setKind(tok::string_literal);
286 StringRef(Dollar->TokenText.begin(),
287 String->TokenText.end() - Dollar->TokenText.begin());
288 Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
289 Dollar->setType(TT_CSharpStringLiteral);
// Drop the '@' first, then the string; only the merged '$' token remains.
290 Tokens.erase(Tokens.end() - 2);
291 Tokens.erase(Tokens.end() - 1);
296 // Convert back into just a string_literal.
297 At->Tok.setKind(tok::string_literal);
298 At->TokenText = StringRef(At->TokenText.begin(),
299 String->TokenText.end() - At->TokenText.begin());
300 At->ColumnWidth += String->ColumnWidth;
301 At->setType(TT_CSharpStringLiteral);
302 Tokens.erase(Tokens.end() - 1);
// Definition of the class-level string set holding the keywords accepted as
// the target specifier of a C# attribute.
306 // Valid C# attribute targets:
307 // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
308 const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
309 "assembly", "module", "field", "event", "method",
310 "param", "property", "return", "type",
313 bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
314 if (Tokens.size() < 2)
316 auto &FirstQuestion = *(Tokens.end() - 2);
317 auto &SecondQuestion = *(Tokens.end() - 1);
318 if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question))
320 FirstQuestion->Tok.setKind(tok::question); // no '??' in clang tokens.
321 FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
322 SecondQuestion->TokenText.end() -
323 FirstQuestion->TokenText.begin());
324 FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
325 FirstQuestion->setType(TT_CSharpNullCoalescing);
326 Tokens.erase(Tokens.end() - 1);
330 // Merge '?[' and '?.' pairs into single tokens.
331 bool FormatTokenLexer::tryMergeCSharpNullConditional() {
332 if (Tokens.size() < 2)
334 auto &Question = *(Tokens.end() - 2);
335 auto &PeriodOrLSquare = *(Tokens.end() - 1);
336 if (!Question->is(tok::question) ||
337 !PeriodOrLSquare->isOneOf(tok::l_square, tok::period))
339 Question->TokenText =
340 StringRef(Question->TokenText.begin(),
341 PeriodOrLSquare->TokenText.end() - Question->TokenText.begin());
342 Question->ColumnWidth += PeriodOrLSquare->ColumnWidth;
344 if (PeriodOrLSquare->is(tok::l_square)) {
345 Question->Tok.setKind(tok::question); // no '?[' in clang tokens.
346 Question->setType(TT_CSharpNullConditionalLSquare);
348 Question->Tok.setKind(tok::question); // no '?.' in clang tokens.
349 Question->setType(TT_CSharpNullConditional);
352 Tokens.erase(Tokens.end() - 1);
356 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
357 if (Tokens.size() < 2)
359 auto &At = *(Tokens.end() - 2);
360 auto &Keyword = *(Tokens.end() - 1);
361 if (!At->is(tok::at))
363 if (!Keywords.isCSharpKeyword(*Keyword))
366 At->Tok.setKind(tok::identifier);
367 At->TokenText = StringRef(At->TokenText.begin(),
368 Keyword->TokenText.end() - At->TokenText.begin());
369 At->ColumnWidth += Keyword->ColumnWidth;
370 At->setType(Keyword->getType());
371 Tokens.erase(Tokens.end() - 1);
375 // In C# transform identifier foreach into kw_foreach
376 bool FormatTokenLexer::tryTransformCSharpForEach() {
377 if (Tokens.size() < 1)
379 auto &Identifier = *(Tokens.end() - 1);
380 if (!Identifier->is(tok::identifier))
382 if (Identifier->TokenText != "foreach")
385 Identifier->setType(TT_ForEachMacro);
386 Identifier->Tok.setKind(tok::kw_for);
390 bool FormatTokenLexer::tryMergeForEach() {
391 if (Tokens.size() < 2)
393 auto &For = *(Tokens.end() - 2);
394 auto &Each = *(Tokens.end() - 1);
395 if (!For->is(tok::kw_for))
397 if (!Each->is(tok::identifier))
399 if (Each->TokenText != "each")
402 For->setType(TT_ForEachMacro);
403 For->Tok.setKind(tok::kw_for);
405 For->TokenText = StringRef(For->TokenText.begin(),
406 Each->TokenText.end() - For->TokenText.begin());
407 For->ColumnWidth += Each->ColumnWidth;
408 Tokens.erase(Tokens.end() - 1);
// Called only when Style.isCpp(). Re-kinds a `try` keyword token as a plain
// identifier when the token after it is none of '{', ':', '#' or a comment.
// NOTE(review): the token before `try` is also inspected (bound to `At`), but
// the condition applied to it is not visible in this view -- confirm what it
// checks before relying on this summary.
412 bool FormatTokenLexer::tryTransformTryUsageForC() {
413 if (Tokens.size() < 2)
// `Try` is the second-to-last token, `Next` the token that follows it.
415 auto &Try = *(Tokens.end() - 2);
416 if (!Try->is(tok::kw_try))
418 auto &Next = *(Tokens.end() - 1);
419 if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
422 if (Tokens.size() > 2) {
423 auto &At = *(Tokens.end() - 3);
428 Try->Tok.setKind(tok::identifier);
432 bool FormatTokenLexer::tryMergeLessLess() {
433 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
434 if (Tokens.size() < 3)
437 bool FourthTokenIsLess = false;
438 if (Tokens.size() > 3)
439 FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
441 auto First = Tokens.end() - 3;
442 if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
443 First[0]->isNot(tok::less) || FourthTokenIsLess)
446 // Only merge if there currently is no whitespace between the two "<".
447 if (First[1]->WhitespaceRange.getBegin() !=
448 First[1]->WhitespaceRange.getEnd())
451 First[0]->Tok.setKind(tok::lessless);
452 First[0]->TokenText = "<<";
453 First[0]->ColumnWidth += 1;
454 Tokens.erase(Tokens.end() - 2);
458 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
460 if (Tokens.size() < Kinds.size())
463 SmallVectorImpl<FormatToken *>::const_iterator First =
464 Tokens.end() - Kinds.size();
465 if (!First[0]->is(Kinds[0]))
467 unsigned AddLength = 0;
468 for (unsigned i = 1; i < Kinds.size(); ++i) {
469 if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
470 First[i]->WhitespaceRange.getEnd())
472 AddLength += First[i]->TokenText.size();
474 Tokens.resize(Tokens.size() - Kinds.size() + 1);
475 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
476 First[0]->TokenText.size() + AddLength);
477 First[0]->ColumnWidth += AddLength;
478 First[0]->setType(NewType);
482 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
483 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
484 // NB: This is not entirely correct, as an r_paren can introduce an operand
485 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
486 // corner case to not matter in practice, though.
487 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
488 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
489 tok::colon, tok::question, tok::tilde) ||
490 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
491 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
492 tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
493 Tok->isBinaryOperator();
// Decides whether a JavaScript regex literal may start right after \p Prev,
// the closest preceding non-comment token.
// NOTE(review): the first statements of this function are not visible in this
// view. The caller can pass a null \p Prev when no preceding non-comment
// token exists, so presumably null is handled there -- confirm.
496 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
500 // Regex literals can only follow after prefix unary operators, not after
501 // postfix unary operators. If the '++' is followed by a non-operand
502 // introducing token, the slash here is the operand and not the start of a
504 // `!` is an unary prefix operator, but also a post-fix operator that casts
505 // away nullability, so the same check applies.
// Tokens.back() is the candidate '/' and \p Prev is expected to sit right
// before it, so Tokens[size - 3] is the token before \p Prev. With fewer than
// three tokens there is nothing before \p Prev and the operator is taken to
// be a prefix operator.
506 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
507 return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
509 // The previous token must introduce an operand location where regex
510 // literals can occur.
511 if (!precedesOperand(Prev))
517 // Tries to parse a JavaScript Regex literal starting at the current token,
518 // if that begins with a slash and is in a location where JavaScript allows
519 // regex literals. Changes the current token to a regex literal and updates
520 // its text if successful.
//
// On success the raw lexer is reset to continue right behind the literal.
521 void FormatTokenLexer::tryParseJSRegexLiteral() {
522 FormatToken *RegexToken = Tokens.back();
523 if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
// Walk backwards from the token before the slash, looking at non-comment
// tokens. Prev stays null if there is none.
526 FormatToken *Prev = nullptr;
527 for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
528 // NB: Because previous pointers are not initialized yet, this cannot use
529 // Token.getPreviousNonComment.
530 if ((*I)->isNot(tok::comment)) {
536 if (!canPrecedeRegexLiteral(Prev))
539 // 'Manually' lex ahead in the current file buffer.
// The lexer position is just past the '/' (or '/=') token, so subtracting the
// token length yields the first character of the literal.
540 const char *Offset = Lex->getBufferLocation();
541 const char *RegexBegin = Offset - RegexToken->TokenText.size();
542 StringRef Buffer = Lex->getBuffer();
543 bool InCharacterClass = false;
544 bool HaveClosingSlash = false;
// NOTE(review): the loop header advances Offset once more after the closing
// slash is found, so on exit Offset points one past it. The per-character
// dispatch is not visible in this view; if the escape case advances Offset
// without a bounds check, a trailing '\' could step past Buffer.end() --
// verify against the full file.
545 for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
546 // Regular expressions are terminated with a '/', which can only be
547 // escaped using '\' or a character class between '[' and ']'.
548 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
551 // Skip the escaped character.
555 InCharacterClass = true;
558 InCharacterClass = false;
// A '/' inside a character class does not terminate the literal.
561 if (!InCharacterClass)
562 HaveClosingSlash = true;
567 RegexToken->setType(TT_RegexLiteral);
568 // Treat regex literals like other string_literals.
569 RegexToken->Tok.setKind(tok::string_literal);
570 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
// The width is the byte length of the literal text.
571 RegexToken->ColumnWidth = RegexToken->TokenText.size();
573 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
// Re-lexes by hand a C# string literal that starts with @" or $@": scans the
// raw buffer for the real closing quote (treating "" as an escaped quote),
// replaces the token text with the full literal, recomputes the width fields
// and restarts the raw lexer behind the literal. Tokens that are not typed
// TT_CSharpStringLiteral, or that do not have one of those two prefixes, are
// not processed by the visible code.
576 void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
577 FormatToken *CSharpStringLiteral = Tokens.back();
579 if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral)
582 // Deal with multiline strings.
583 if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") ||
584 CSharpStringLiteral->TokenText.startswith(R"($@")")))
// The lexer position is just past the token, so subtracting the token length
// gives the first character of the literal.
587 const char *StrBegin =
588 Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size();
589 const char *Offset = StrBegin;
// NOTE(review): the statements of the two branches below are not visible
// here; presumably they advance Offset past the @" / $@" prefix -- confirm.
590 if (CSharpStringLiteral->TokenText.startswith(R"(@")"))
592 else // CSharpStringLiteral->TokenText.startswith(R"($@")")
595 // Look for a terminating '"' in the current file buffer.
596 // Make no effort to format code within an interpolated or verbatim string.
597 for (; Offset != Lex->getBuffer().end(); ++Offset) {
598 if (Offset[0] == '"') {
599 // "" within a verbatim string is an escaped double quote: skip it.
600 if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"')
607 // Make no attempt to format code properly if a verbatim string is
609 if (Offset == Lex->getBuffer().end())
// The + 1 makes the literal text include the character at Offset.
612 StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
613 CSharpStringLiteral->TokenText = LiteralText;
615 // Adjust width for potentially multiline string literals.
// ColumnWidth covers only the text up to the first line break.
616 size_t FirstBreak = LiteralText.find('\n');
617 StringRef FirstLineText = FirstBreak == StringRef::npos
619 : LiteralText.substr(0, FirstBreak);
620 CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
621 FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
// LastLineColumnWidth covers the text after the last line break, measured
// from column 0.
623 size_t LastBreak = LiteralText.rfind('\n');
624 if (LastBreak != StringRef::npos) {
625 CSharpStringLiteral->IsMultiline = true;
626 unsigned StartColumn = 0;
627 CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs(
628 LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
629 Style.TabWidth, Encoding);
// Continue lexing right behind the character at Offset, or at the end of the
// file if Offset is already at the end of the buffer.
632 SourceLocation loc = Offset < Lex->getBuffer().end()
633 ? Lex->getSourceLocation(Offset + 1)
634 : SourceMgr.getLocForEndOfFile(ID);
635 resetLexer(SourceMgr.getFileOffset(loc));
638 void FormatTokenLexer::handleTemplateStrings() {
639 FormatToken *BacktickToken = Tokens.back();
641 if (BacktickToken->is(tok::l_brace)) {
642 StateStack.push(LexerState::NORMAL);
645 if (BacktickToken->is(tok::r_brace)) {
646 if (StateStack.size() == 1)
649 if (StateStack.top() != LexerState::TEMPLATE_STRING)
651 // If back in TEMPLATE_STRING, fallthrough and continue parsing the
652 } else if (BacktickToken->is(tok::unknown) &&
653 BacktickToken->TokenText == "`") {
654 StateStack.push(LexerState::TEMPLATE_STRING);
656 return; // Not actually a template
659 // 'Manually' lex ahead in the current file buffer.
660 const char *Offset = Lex->getBufferLocation();
661 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
662 for (; Offset != Lex->getBuffer().end(); ++Offset) {
663 if (Offset[0] == '`') {
667 if (Offset[0] == '\\') {
668 ++Offset; // Skip the escaped character.
669 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
671 // '${' introduces an expression interpolation in the template string.
672 StateStack.push(LexerState::NORMAL);
678 StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
679 BacktickToken->setType(TT_TemplateString);
680 BacktickToken->Tok.setKind(tok::string_literal);
681 BacktickToken->TokenText = LiteralText;
683 // Adjust width for potentially multiline string literals.
684 size_t FirstBreak = LiteralText.find('\n');
685 StringRef FirstLineText = FirstBreak == StringRef::npos
687 : LiteralText.substr(0, FirstBreak);
688 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
689 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
690 size_t LastBreak = LiteralText.rfind('\n');
691 if (LastBreak != StringRef::npos) {
692 BacktickToken->IsMultiline = true;
693 unsigned StartColumn = 0; // The template tail spans the entire line.
694 BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
695 LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
696 Style.TabWidth, Encoding);
699 SourceLocation loc = Offset < Lex->getBuffer().end()
700 ? Lex->getSourceLocation(Offset + 1)
701 : SourceMgr.getLocForEndOfFile(ID);
702 resetLexer(SourceMgr.getFileOffset(loc));
705 void FormatTokenLexer::tryParsePythonComment() {
706 FormatToken *HashToken = Tokens.back();
707 if (!HashToken->isOneOf(tok::hash, tok::hashhash))
709 // Turn the remainder of this line into a comment.
710 const char *CommentBegin =
711 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
712 size_t From = CommentBegin - Lex->getBuffer().begin();
713 size_t To = Lex->getBuffer().find_first_of('\n', From);
714 if (To == StringRef::npos)
715 To = Lex->getBuffer().size();
716 size_t Len = To - From;
717 HashToken->setType(TT_LineComment);
718 HashToken->Tok.setKind(tok::comment);
719 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
720 SourceLocation Loc = To < Lex->getBuffer().size()
721 ? Lex->getSourceLocation(CommentBegin + Len)
722 : SourceMgr.getLocForEndOfFile(ID);
723 resetLexer(SourceMgr.getFileOffset(Loc));
// Recognizes the four-token sequence  _T ( "string" )  at the end of `Tokens`
// and folds it into the string-literal token, which takes over the text span
// and all leading-whitespace/position properties of the `_T` token.
// Multiline string literals are not merged.
726 bool FormatTokenLexer::tryMerge_TMacro() {
727 if (Tokens.size() < 4)
// Match the sequence back to front: ')' , string literal , '(' , `_T`.
729 FormatToken *Last = Tokens.back();
730 if (!Last->is(tok::r_paren))
733 FormatToken *String = Tokens[Tokens.size() - 2];
734 if (!String->is(tok::string_literal) || String->IsMultiline)
737 if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
740 FormatToken *Macro = Tokens[Tokens.size() - 4];
741 if (Macro->TokenText != "_T")
// The merged text spans from the start of `_T` to the end of ')'.
744 const char *Start = Macro->TokenText.data();
745 const char *End = Last->TokenText.data() + Last->TokenText.size();
746 String->TokenText = StringRef(Start, End - Start);
// The string token now starts where the macro started, so it inherits the
// macro's position and leading-whitespace information.
747 String->IsFirst = Macro->IsFirst;
748 String->LastNewlineOffset = Macro->LastNewlineOffset;
749 String->WhitespaceRange = Macro->WhitespaceRange;
750 String->OriginalColumn = Macro->OriginalColumn;
// Recompute the width for the longer text at its new start column.
751 String->ColumnWidth = encoding::columnWidthWithTabs(
752 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
753 String->NewlinesBefore = Macro->NewlinesBefore;
754 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
// NOTE(review): the statements that shrink `Tokens` before this assignment
// are not visible in this view; presumably three tokens are removed so that
// the slot written here is the one that held `_T` -- confirm.
759 Tokens.back() = String;
// Detects version-control conflict marker lines. It is only evaluated when the
// newest token starts a new line or is eof, i.e. once the previous line is
// complete. If that previous line begins with a marker, all of its tokens are
// collapsed into its first token, which is typed as conflict start,
// alternative or end.
763 bool FormatTokenLexer::tryMergeConflictMarkers() {
764 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
767 // Conflict lines look like:
768 // <marker> <text from the vcs>
770 // >>>>>>> /file/in/file/system at revision 1234
772 // We merge all tokens in a line that starts with a conflict marker
773 // into a single token with a special token type that the unwrapped line
774 // parser will use to correctly rebuild the underlying code.
777 // Get the position of the first token in the line.
// NOTE(review): the declaration of the `ID` assigned below is not visible in
// this view, so it cannot be told here whether it is a local or the member.
778 unsigned FirstInLineOffset;
779 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
780 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
781 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
782 // Calculate the offset of the start of the current line.
// rfind searches backwards from the first token for the preceding newline;
// npos means the token is on the first line of the buffer.
783 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
784 if (LineOffset == StringRef::npos) {
// LineStart is the text from the line start up to the first space or newline
// (or to the end of the buffer if neither follows).
790 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
792 if (FirstSpace == StringRef::npos) {
793 LineStart = Buffer.substr(LineOffset);
795 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
// Both seven-character and four-character marker spellings are recognized.
// In the four-character spelling the roles of '<' and '>' are swapped
// relative to the seven-character one.
798 TokenType Type = TT_Unknown;
799 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
800 Type = TT_ConflictStart;
801 } else if (LineStart == "|||||||" || LineStart == "=======" ||
802 LineStart == "====") {
803 Type = TT_ConflictAlternative;
804 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
805 Type = TT_ConflictEnd;
808 if (Type != TT_Unknown) {
// Keep the newest token (first token of the following line, or eof), drop
// everything after the first token of the marker line, retag that first
// token, and then put the newest token back.
809 FormatToken *Next = Tokens.back();
811 Tokens.resize(FirstInLineIndex + 1);
812 // We do not need to build a complete token here, as we will skip it
813 // during parsing anyway (as we must not touch whitespace around conflict
815 Tokens.back()->setType(Type);
816 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
818 Tokens.push_back(Next);
825 FormatToken *FormatTokenLexer::getStashedToken() {
826 // Create a synthesized second '>' or '<' token.
827 Token Tok = FormatTok->Tok;
828 StringRef TokenText = FormatTok->TokenText;
830 unsigned OriginalColumn = FormatTok->OriginalColumn;
831 FormatTok = new (Allocator.Allocate()) FormatToken;
832 FormatTok->Tok = Tok;
833 SourceLocation TokLocation =
834 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
835 FormatTok->Tok.setLocation(TokLocation);
836 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
837 FormatTok->TokenText = TokenText;
838 FormatTok->ColumnWidth = 1;
839 FormatTok->OriginalColumn = OriginalColumn + 1;
844 FormatToken *FormatTokenLexer::getNextToken() {
845 if (StateStack.top() == LexerState::TOKEN_STASHED) {
847 return getStashedToken();
850 FormatTok = new (Allocator.Allocate()) FormatToken;
851 readRawToken(*FormatTok);
852 SourceLocation WhitespaceStart =
853 FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
854 FormatTok->IsFirst = IsFirstToken;
855 IsFirstToken = false;
857 // Consume and record whitespace until we find a significant token.
858 unsigned WhitespaceLength = TrailingWhitespace;
859 while (FormatTok->Tok.is(tok::unknown)) {
860 StringRef Text = FormatTok->TokenText;
861 auto EscapesNewline = [&](int pos) {
862 // A '\r' here is just part of '\r\n'. Skip it.
863 if (pos >= 0 && Text[pos] == '\r')
865 // See whether there is an odd number of '\' before this.
866 // FIXME: This is wrong. A '\' followed by a newline is always removed,
867 // regardless of whether there is another '\' before it.
868 // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
870 for (; pos >= 0; --pos, ++count)
871 if (Text[pos] != '\\')
875 // FIXME: This miscounts tok:unknown tokens that are not just
876 // whitespace, e.g. a '`' character.
877 for (int i = 0, e = Text.size(); i != e; ++i) {
880 ++FormatTok->NewlinesBefore;
881 FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
882 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
886 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
898 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
901 if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
902 FormatTok->setType(TT_ImplicitStringLiteral);
905 FormatTok->setType(TT_ImplicitStringLiteral);
908 if (FormatTok->getType() == TT_ImplicitStringLiteral)
912 if (FormatTok->is(TT_ImplicitStringLiteral))
914 WhitespaceLength += FormatTok->Tok.getLength();
916 readRawToken(*FormatTok);
919 // JavaScript and Java do not allow to escape the end of the line with a
920 // backslash. Backslashes are syntax errors in plain source, but can occur in
921 // comments. When a single line comment ends with a \, it'll cause the next
922 // line of code to be lexed as a comment, breaking formatting. The code below
923 // finds comments that contain a backslash followed by a line break, truncates
924 // the comment token at the backslash, and resets the lexer to restart behind
926 if ((Style.Language == FormatStyle::LK_JavaScript ||
927 Style.Language == FormatStyle::LK_Java) &&
928 FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
929 size_t BackslashPos = FormatTok->TokenText.find('\\');
930 while (BackslashPos != StringRef::npos) {
931 if (BackslashPos + 1 < FormatTok->TokenText.size() &&
932 FormatTok->TokenText[BackslashPos + 1] == '\n') {
933 const char *Offset = Lex->getBufferLocation();
934 Offset -= FormatTok->TokenText.size();
935 Offset += BackslashPos + 1;
936 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
937 FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
938 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
939 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
943 BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
947 // In case the token starts with escaped newlines, we want to
948 // take them into account as whitespace - this pattern is quite frequent
949 // in macro definitions.
950 // FIXME: Add a more explicit test.
951 while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
952 unsigned SkippedWhitespace = 0;
953 if (FormatTok->TokenText.size() > 2 &&
954 (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
955 SkippedWhitespace = 3;
956 else if (FormatTok->TokenText[1] == '\n')
957 SkippedWhitespace = 2;
961 ++FormatTok->NewlinesBefore;
962 WhitespaceLength += SkippedWhitespace;
963 FormatTok->LastNewlineOffset = SkippedWhitespace;
965 FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
968 FormatTok->WhitespaceRange = SourceRange(
969 WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
971 FormatTok->OriginalColumn = Column;
973 TrailingWhitespace = 0;
974 if (FormatTok->Tok.is(tok::comment)) {
975 // FIXME: Add the trimmed whitespace to Column.
976 StringRef UntrimmedText = FormatTok->TokenText;
977 FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
978 TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
979 } else if (FormatTok->Tok.is(tok::raw_identifier)) {
980 IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
981 FormatTok->Tok.setIdentifierInfo(&Info);
982 FormatTok->Tok.setKind(Info.getTokenID());
983 if (Style.Language == FormatStyle::LK_Java &&
984 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
986 FormatTok->Tok.setKind(tok::identifier);
987 FormatTok->Tok.setIdentifierInfo(nullptr);
988 } else if (Style.Language == FormatStyle::LK_JavaScript &&
989 FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
991 FormatTok->Tok.setKind(tok::identifier);
992 FormatTok->Tok.setIdentifierInfo(nullptr);
994 } else if (FormatTok->Tok.is(tok::greatergreater)) {
995 FormatTok->Tok.setKind(tok::greater);
996 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
998 StateStack.push(LexerState::TOKEN_STASHED);
999 } else if (FormatTok->Tok.is(tok::lessless)) {
1000 FormatTok->Tok.setKind(tok::less);
1001 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1003 StateStack.push(LexerState::TOKEN_STASHED);
1006 // Now FormatTok is the next non-whitespace token.
1008 StringRef Text = FormatTok->TokenText;
1009 size_t FirstNewlinePos = Text.find('\n');
1010 if (FirstNewlinePos == StringRef::npos) {
1011 // FIXME: ColumnWidth actually depends on the start column, we need to
1012 // take this into account when the token is moved.
1013 FormatTok->ColumnWidth =
1014 encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1015 Column += FormatTok->ColumnWidth;
1017 FormatTok->IsMultiline = true;
1018 // FIXME: ColumnWidth actually depends on the start column, we need to
1019 // take this into account when the token is moved.
1020 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1021 Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1023 // The last line of the token always starts in column 0.
1024 // Thus, the length can be precomputed even in the presence of tabs.
1025 FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
1026 Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1027 Column = FormatTok->LastLineColumnWidth;
1030 if (Style.isCpp()) {
1031 auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
1032 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
1033 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1035 it != Macros.end()) {
1036 FormatTok->setType(it->second);
1037 } else if (FormatTok->is(tok::identifier)) {
1038 if (MacroBlockBeginRegex.match(Text)) {
1039 FormatTok->setType(TT_MacroBlockBegin);
1040 } else if (MacroBlockEndRegex.match(Text)) {
1041 FormatTok->setType(TT_MacroBlockEnd);
1049 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1050 Lex->LexFromRawLexer(Tok.Tok);
1051 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1052 Tok.Tok.getLength());
1053 // For formatting, treat unterminated string literals like normal string
1055 if (Tok.is(tok::unknown)) {
1056 if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
1057 Tok.Tok.setKind(tok::string_literal);
1058 Tok.IsUnterminatedLiteral = true;
1059 } else if (Style.Language == FormatStyle::LK_JavaScript &&
1060 Tok.TokenText == "''") {
1061 Tok.Tok.setKind(tok::string_literal);
1065 if ((Style.Language == FormatStyle::LK_JavaScript ||
1066 Style.Language == FormatStyle::LK_Proto ||
1067 Style.Language == FormatStyle::LK_TextProto) &&
1068 Tok.is(tok::char_constant)) {
1069 Tok.Tok.setKind(tok::string_literal);
1072 if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
1073 Tok.TokenText == "/* clang-format on */")) {
1074 FormattingDisabled = false;
1077 Tok.Finalized = FormattingDisabled;
1079 if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
1080 Tok.TokenText == "/* clang-format off */")) {
1081 FormattingDisabled = true;
1085 void FormatTokenLexer::resetLexer(unsigned Offset) {
1086 StringRef Buffer = SourceMgr.getBufferData(ID);
1087 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
1088 getFormattingLangOpts(Style), Buffer.begin(),
1089 Buffer.begin() + Offset, Buffer.end()));
1090 Lex->SetKeepWhitespaceMode(true);
1091 TrailingWhitespace = 0;
1094 } // namespace format
1095 } // namespace clang