contrib/llvm/tools/clang/lib/ASTMatchers/Dynamic/Parser.cpp

   1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 ///
  10 /// \file
  11 /// \brief Recursive parser implementation for the matcher expression grammar.
  12 ///
  13 //===----------------------------------------------------------------------===//
  14
#include "clang/ASTMatchers/Dynamic/Parser.h"
#include "clang/ASTMatchers/Dynamic/Registry.h"
#include "clang/Basic/CharInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/Support/ManagedStatic.h"
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
  22
  23 namespace clang {
  24 namespace ast_matchers {
  25 namespace dynamic {
  26
/// \brief Simple structure to hold information for one token from the parser.
///
/// Tokens are produced by the tokenizer in this file and consumed by the
/// Parser methods. A default-constructed token has kind TK_Eof.
struct Parser::TokenInfo {
  /// \brief Different possible tokens.
  enum TokenKind {
    TK_Eof,             ///< No code left to tokenize.
    TK_OpenParen,       ///< The '(' character.
    TK_CloseParen,      ///< The ')' character.
    TK_Comma,           ///< The ',' character.
    TK_Period,          ///< The '.' character.
    TK_Literal,         ///< A string or unsigned literal; see \c Value.
    TK_Ident,           ///< A run of alphanumeric characters.
    TK_InvalidChar,     ///< A single character that starts no other token.
    TK_Error,           ///< A malformed literal; the tokenizer reported it.
    TK_CodeCompletion   ///< The code completion point was reached.
  };

  /// \brief Some known identifiers.
  static const char* const ID_Bind;

  /// \brief Creates a TK_Eof token; the other fields are value-initialized.
  TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}

  /// The characters of the token, as a view into the tokenized code.
  StringRef Text;
  /// What kind of token this is.
  TokenKind Kind;
  /// Source range of the token.
  SourceRange Range;
  /// The parsed value; assigned by the tokenizer for TK_Literal tokens.
  VariantValue Value;
};

// The identifier that must follow the '.' in a bind expression.
const char* const Parser::TokenInfo::ID_Bind = "bind";
  55
  56 /// \brief Simple tokenizer for the parser.
  57 class Parser::CodeTokenizer {
  58 public:
  59   explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
  60       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
  61         CodeCompletionLocation(nullptr) {
  62     NextToken = getNextToken();
  63   }
  64
  65   CodeTokenizer(StringRef MatcherCode, Diagnostics *Error,
  66                 unsigned CodeCompletionOffset)
  67       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
  68         CodeCompletionLocation(MatcherCode.data() + CodeCompletionOffset) {
  69     NextToken = getNextToken();
  70   }
  71
  72   /// \brief Returns but doesn't consume the next token.
  73   const TokenInfo &peekNextToken() const { return NextToken; }
  74
  75   /// \brief Consumes and returns the next token.
  76   TokenInfo consumeNextToken() {
  77     TokenInfo ThisToken = NextToken;
  78     NextToken = getNextToken();
  79     return ThisToken;
  80   }
  81
  82   TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
  83
  84 private:
  85   TokenInfo getNextToken() {
  86     consumeWhitespace();
  87     TokenInfo Result;
  88     Result.Range.Start = currentLocation();
  89
  90     if (CodeCompletionLocation && CodeCompletionLocation <= Code.data()) {
  91       Result.Kind = TokenInfo::TK_CodeCompletion;
  92       Result.Text = StringRef(CodeCompletionLocation, 0);
  93       CodeCompletionLocation = nullptr;
  94       return Result;
  95     }
  96
  97     if (Code.empty()) {
  98       Result.Kind = TokenInfo::TK_Eof;
  99       Result.Text = "";
 100       return Result;
 101     }
 102
 103     switch (Code[0]) {
 104     case ',':
 105       Result.Kind = TokenInfo::TK_Comma;
 106       Result.Text = Code.substr(0, 1);
 107       Code = Code.drop_front();
 108       break;
 109     case '.':
 110       Result.Kind = TokenInfo::TK_Period;
 111       Result.Text = Code.substr(0, 1);
 112       Code = Code.drop_front();
 113       break;
 114     case '(':
 115       Result.Kind = TokenInfo::TK_OpenParen;
 116       Result.Text = Code.substr(0, 1);
 117       Code = Code.drop_front();
 118       break;
 119     case ')':
 120       Result.Kind = TokenInfo::TK_CloseParen;
 121       Result.Text = Code.substr(0, 1);
 122       Code = Code.drop_front();
 123       break;
 124
 125     case '"':
 126     case '\'':
 127       // Parse a string literal.
 128       consumeStringLiteral(&Result);
 129       break;
 130
 131     case '0': case '1': case '2': case '3': case '4':
 132     case '5': case '6': case '7': case '8': case '9':
 133       // Parse an unsigned literal.
 134       consumeUnsignedLiteral(&Result);
 135       break;
 136
 137     default:
 138       if (isAlphanumeric(Code[0])) {
 139         // Parse an identifier
 140         size_t TokenLength = 1;
 141         while (1) {
 142           // A code completion location in/immediately after an identifier will
 143           // cause the portion of the identifier before the code completion
 144           // location to become a code completion token.
 145           if (CodeCompletionLocation == Code.data() + TokenLength) {
 146             CodeCompletionLocation = nullptr;
 147             Result.Kind = TokenInfo::TK_CodeCompletion;
 148             Result.Text = Code.substr(0, TokenLength);
 149             Code = Code.drop_front(TokenLength);
 150             return Result;
 151           }
 152           if (TokenLength == Code.size() || !isAlphanumeric(Code[TokenLength]))
 153             break;
 154           ++TokenLength;
 155         }
 156         Result.Kind = TokenInfo::TK_Ident;
 157         Result.Text = Code.substr(0, TokenLength);
 158         Code = Code.drop_front(TokenLength);
 159       } else {
 160         Result.Kind = TokenInfo::TK_InvalidChar;
 161         Result.Text = Code.substr(0, 1);
 162         Code = Code.drop_front(1);
 163       }
 164       break;
 165     }
 166
 167     Result.Range.End = currentLocation();
 168     return Result;
 169   }
 170
 171   /// \brief Consume an unsigned literal.
 172   void consumeUnsignedLiteral(TokenInfo *Result) {
 173     unsigned Length = 1;
 174     if (Code.size() > 1) {
 175       // Consume the 'x' or 'b' radix modifier, if present.
 176       switch (toLowercase(Code[1])) {
 177       case 'x': case 'b': Length = 2;
 178       }
 179     }
 180     while (Length < Code.size() && isHexDigit(Code[Length]))
 181       ++Length;
 182
 183     Result->Text = Code.substr(0, Length);
 184     Code = Code.drop_front(Length);
 185
 186     unsigned Value;
 187     if (!Result->Text.getAsInteger(0, Value)) {
 188       Result->Kind = TokenInfo::TK_Literal;
 189       Result->Value = Value;
 190     } else {
 191       SourceRange Range;
 192       Range.Start = Result->Range.Start;
 193       Range.End = currentLocation();
 194       Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
 195       Result->Kind = TokenInfo::TK_Error;
 196     }
 197   }
 198
 199   /// \brief Consume a string literal.
 200   ///
 201   /// \c Code must be positioned at the start of the literal (the opening
 202   /// quote). Consumed until it finds the same closing quote character.
 203   void consumeStringLiteral(TokenInfo *Result) {
 204     bool InEscape = false;
 205     const char Marker = Code[0];
 206     for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
 207       if (InEscape) {
 208         InEscape = false;
 209         continue;
 210       }
 211       if (Code[Length] == '\\') {
 212         InEscape = true;
 213         continue;
 214       }
 215       if (Code[Length] == Marker) {
 216         Result->Kind = TokenInfo::TK_Literal;
 217         Result->Text = Code.substr(0, Length + 1);
 218         Result->Value = Code.substr(1, Length - 1);
 219         Code = Code.drop_front(Length + 1);
 220         return;
 221       }
 222     }
 223
 224     StringRef ErrorText = Code;
 225     Code = Code.drop_front(Code.size());
 226     SourceRange Range;
 227     Range.Start = Result->Range.Start;
 228     Range.End = currentLocation();
 229     Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
 230     Result->Kind = TokenInfo::TK_Error;
 231   }
 232
 233   /// \brief Consume all leading whitespace from \c Code.
 234   void consumeWhitespace() {
 235     while (!Code.empty() && isWhitespace(Code[0])) {
 236       if (Code[0] == '\n') {
 237         ++Line;
 238         StartOfLine = Code.drop_front();
 239       }
 240       Code = Code.drop_front();
 241     }
 242   }
 243
 244   SourceLocation currentLocation() {
 245     SourceLocation Location;
 246     Location.Line = Line;
 247     Location.Column = Code.data() - StartOfLine.data() + 1;
 248     return Location;
 249   }
 250
 251   StringRef Code;
 252   StringRef StartOfLine;
 253   unsigned Line;
 254   Diagnostics *Error;
 255   TokenInfo NextToken;
 256   const char *CodeCompletionLocation;
 257 };
 258
 259 Parser::Sema::~Sema() {}
 260
 261 std::vector<ArgKind> Parser::Sema::getAcceptedCompletionTypes(
 262     llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> Context) {
 263   return std::vector<ArgKind>();
 264 }
 265
 266 std::vector<MatcherCompletion>
 267 Parser::Sema::getMatcherCompletions(llvm::ArrayRef<ArgKind> AcceptedTypes) {
 268   return std::vector<MatcherCompletion>();
 269 }
 270
 271 struct Parser::ScopedContextEntry {
 272   Parser *P;
 273
 274   ScopedContextEntry(Parser *P, MatcherCtor C) : P(P) {
 275     P->ContextStack.push_back(std::make_pair(C, 0u));
 276   }
 277
 278   ~ScopedContextEntry() {
 279     P->ContextStack.pop_back();
 280   }
 281
 282   void nextArg() {
 283     ++P->ContextStack.back().second;
 284   }
 285 };
 286
 287 /// \brief Parse expressions that start with an identifier.
 288 ///
 289 /// This function can parse named values and matchers.
 290 /// In case of failure it will try to determine the user's intent to give
 291 /// an appropriate error message.
 292 bool Parser::parseIdentifierPrefixImpl(VariantValue *Value) {
 293   const TokenInfo NameToken = Tokenizer->consumeNextToken();
 294
 295   if (Tokenizer->nextTokenKind() != TokenInfo::TK_OpenParen) {
 296     // Parse as a named value.
 297     if (const VariantValue NamedValue =
 298             NamedValues ? NamedValues->lookup(NameToken.Text)
 299                         : VariantValue()) {
 300       *Value = NamedValue;
 301       return true;
 302     }
 303     // If the syntax is correct and the name is not a matcher either, report
 304     // unknown named value.
 305     if ((Tokenizer->nextTokenKind() == TokenInfo::TK_Comma ||
 306          Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen ||
 307          Tokenizer->nextTokenKind() == TokenInfo::TK_Eof) &&
 308         !S->lookupMatcherCtor(NameToken.Text)) {
 309       Error->addError(NameToken.Range, Error->ET_RegistryValueNotFound)
 310           << NameToken.Text;
 311       return false;
 312     }
 313     // Otherwise, fallback to the matcher parser.
 314   }
 315
 316   // Parse as a matcher expression.
 317   return parseMatcherExpressionImpl(NameToken, Value);
 318 }
 319
 320 /// \brief Parse and validate a matcher expression.
 321 /// \return \c true on success, in which case \c Value has the matcher parsed.
 322 ///   If the input is malformed, or some argument has an error, it
 323 ///   returns \c false.
 324 bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken,
 325                                         VariantValue *Value) {
 326   assert(NameToken.Kind == TokenInfo::TK_Ident);
 327   const TokenInfo OpenToken = Tokenizer->consumeNextToken();
 328   if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
 329     Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
 330         << OpenToken.Text;
 331     return false;
 332   }
 333
 334   llvm::Optional<MatcherCtor> Ctor = S->lookupMatcherCtor(NameToken.Text);
 335
 336   if (!Ctor) {
 337     Error->addError(NameToken.Range, Error->ET_RegistryMatcherNotFound)
 338         << NameToken.Text;
 339     // Do not return here. We need to continue to give completion suggestions.
 340   }
 341
 342   std::vector<ParserValue> Args;
 343   TokenInfo EndToken;
 344
 345   {
 346     ScopedContextEntry SCE(this, Ctor ? *Ctor : nullptr);
 347
 348     while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
 349       if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
 350         // End of args.
 351         EndToken = Tokenizer->consumeNextToken();
 352         break;
 353       }
 354       if (Args.size() > 0) {
 355         // We must find a , token to continue.
 356         const TokenInfo CommaToken = Tokenizer->consumeNextToken();
 357         if (CommaToken.Kind != TokenInfo::TK_Comma) {
 358           Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
 359               << CommaToken.Text;
 360           return false;
 361         }
 362       }
 363
 364       Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
 365                                NameToken.Text, NameToken.Range,
 366                                Args.size() + 1);
 367       ParserValue ArgValue;
 368       ArgValue.Text = Tokenizer->peekNextToken().Text;
 369       ArgValue.Range = Tokenizer->peekNextToken().Range;
 370       if (!parseExpressionImpl(&ArgValue.Value)) {
 371         return false;
 372       }
 373
 374       Args.push_back(ArgValue);
 375       SCE.nextArg();
 376     }
 377   }
 378
 379   if (EndToken.Kind == TokenInfo::TK_Eof) {
 380     Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
 381     return false;
 382   }
 383
 384   std::string BindID;
 385   if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
 386     // Parse .bind("foo")
 387     Tokenizer->consumeNextToken();  // consume the period.
 388     const TokenInfo BindToken = Tokenizer->consumeNextToken();
 389     if (BindToken.Kind == TokenInfo::TK_CodeCompletion) {
 390       addCompletion(BindToken, MatcherCompletion("bind(\"", "bind", 1));
 391       return false;
 392     }
 393
 394     const TokenInfo OpenToken = Tokenizer->consumeNextToken();
 395     const TokenInfo IDToken = Tokenizer->consumeNextToken();
 396     const TokenInfo CloseToken = Tokenizer->consumeNextToken();
 397
 398     // TODO: We could use different error codes for each/some to be more
 399     //       explicit about the syntax error.
 400     if (BindToken.Kind != TokenInfo::TK_Ident ||
 401         BindToken.Text != TokenInfo::ID_Bind) {
 402       Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
 403       return false;
 404     }
 405     if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
 406       Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
 407       return false;
 408     }
 409     if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
 410       Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
 411       return false;
 412     }
 413     if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
 414       Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
 415       return false;
 416     }
 417     BindID = IDToken.Value.getString();
 418   }
 419
 420   if (!Ctor)
 421     return false;
 422
 423   // Merge the start and end infos.
 424   Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
 425                            NameToken.Text, NameToken.Range);
 426   SourceRange MatcherRange = NameToken.Range;
 427   MatcherRange.End = EndToken.Range.End;
 428   VariantMatcher Result = S->actOnMatcherExpression(
 429       *Ctor, MatcherRange, BindID, Args, Error);
 430   if (Result.isNull()) return false;
 431
 432   *Value = Result;
 433   return true;
 434 }
 435
 436 // If the prefix of this completion matches the completion token, add it to
 437 // Completions minus the prefix.
 438 void Parser::addCompletion(const TokenInfo &CompToken,
 439                            const MatcherCompletion& Completion) {
 440   if (StringRef(Completion.TypedText).startswith(CompToken.Text) &&
 441       Completion.Specificity > 0) {
 442     Completions.emplace_back(Completion.TypedText.substr(CompToken.Text.size()),
 443                              Completion.MatcherDecl, Completion.Specificity);
 444   }
 445 }
 446
 447 std::vector<MatcherCompletion> Parser::getNamedValueCompletions(
 448     ArrayRef<ArgKind> AcceptedTypes) {
 449   if (!NamedValues) return std::vector<MatcherCompletion>();
 450   std::vector<MatcherCompletion> Result;
 451   for (const auto &Entry : *NamedValues) {
 452     unsigned Specificity;
 453     if (Entry.getValue().isConvertibleTo(AcceptedTypes, &Specificity)) {
 454       std::string Decl =
 455           (Entry.getValue().getTypeAsString() + " " + Entry.getKey()).str();
 456       Result.emplace_back(Entry.getKey(), Decl, Specificity);
 457     }
 458   }
 459   return Result;
 460 }
 461
 462 void Parser::addExpressionCompletions() {
 463   const TokenInfo CompToken = Tokenizer->consumeNextToken();
 464   assert(CompToken.Kind == TokenInfo::TK_CodeCompletion);
 465
 466   // We cannot complete code if there is an invalid element on the context
 467   // stack.
 468   for (ContextStackTy::iterator I = ContextStack.begin(),
 469                                 E = ContextStack.end();
 470        I != E; ++I) {
 471     if (!I->first)
 472       return;
 473   }
 474
 475   auto AcceptedTypes = S->getAcceptedCompletionTypes(ContextStack);
 476   for (const auto &Completion : S->getMatcherCompletions(AcceptedTypes)) {
 477     addCompletion(CompToken, Completion);
 478   }
 479
 480   for (const auto &Completion : getNamedValueCompletions(AcceptedTypes)) {
 481     addCompletion(CompToken, Completion);
 482   }
 483 }
 484
 485 /// \brief Parse an <Expresssion>
 486 bool Parser::parseExpressionImpl(VariantValue *Value) {
 487   switch (Tokenizer->nextTokenKind()) {
 488   case TokenInfo::TK_Literal:
 489     *Value = Tokenizer->consumeNextToken().Value;
 490     return true;
 491
 492   case TokenInfo::TK_Ident:
 493     return parseIdentifierPrefixImpl(Value);
 494
 495   case TokenInfo::TK_CodeCompletion:
 496     addExpressionCompletions();
 497     return false;
 498
 499   case TokenInfo::TK_Eof:
 500     Error->addError(Tokenizer->consumeNextToken().Range,
 501                     Error->ET_ParserNoCode);
 502     return false;
 503
 504   case TokenInfo::TK_Error:
 505     // This error was already reported by the tokenizer.
 506     return false;
 507
 508   case TokenInfo::TK_OpenParen:
 509   case TokenInfo::TK_CloseParen:
 510   case TokenInfo::TK_Comma:
 511   case TokenInfo::TK_Period:
 512   case TokenInfo::TK_InvalidChar:
 513     const TokenInfo Token = Tokenizer->consumeNextToken();
 514     Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
 515     return false;
 516   }
 517
 518   llvm_unreachable("Unknown token kind.");
 519 }
 520
 521 static llvm::ManagedStatic<Parser::RegistrySema> DefaultRegistrySema;
 522
 523 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
 524                const NamedValueMap *NamedValues, Diagnostics *Error)
 525     : Tokenizer(Tokenizer), S(S ? S : &*DefaultRegistrySema),
 526       NamedValues(NamedValues), Error(Error) {}
 527
 528 Parser::RegistrySema::~RegistrySema() {}
 529
 530 llvm::Optional<MatcherCtor>
 531 Parser::RegistrySema::lookupMatcherCtor(StringRef MatcherName) {
 532   return Registry::lookupMatcherCtor(MatcherName);
 533 }
 534
 535 VariantMatcher Parser::RegistrySema::actOnMatcherExpression(
 536     MatcherCtor Ctor, SourceRange NameRange, StringRef BindID,
 537     ArrayRef<ParserValue> Args, Diagnostics *Error) {
 538   if (BindID.empty()) {
 539     return Registry::constructMatcher(Ctor, NameRange, Args, Error);
 540   } else {
 541     return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args,
 542                                            Error);
 543   }
 544 }
 545
 546 std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes(
 547     ArrayRef<std::pair<MatcherCtor, unsigned>> Context) {
 548   return Registry::getAcceptedCompletionTypes(Context);
 549 }
 550
 551 std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions(
 552     ArrayRef<ArgKind> AcceptedTypes) {
 553   return Registry::getMatcherCompletions(AcceptedTypes);
 554 }
 555
 556 bool Parser::parseExpression(StringRef Code, Sema *S,
 557                              const NamedValueMap *NamedValues,
 558                              VariantValue *Value, Diagnostics *Error) {
 559   CodeTokenizer Tokenizer(Code, Error);
 560   if (!Parser(&Tokenizer, S, NamedValues, Error).parseExpressionImpl(Value))
 561     return false;
 562   if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
 563     Error->addError(Tokenizer.peekNextToken().Range,
 564                     Error->ET_ParserTrailingCode);
 565     return false;
 566   }
 567   return true;
 568 }
 569
 570 std::vector<MatcherCompletion>
 571 Parser::completeExpression(StringRef Code, unsigned CompletionOffset, Sema *S,
 572                            const NamedValueMap *NamedValues) {
 573   Diagnostics Error;
 574   CodeTokenizer Tokenizer(Code, &Error, CompletionOffset);
 575   Parser P(&Tokenizer, S, NamedValues, &Error);
 576   VariantValue Dummy;
 577   P.parseExpressionImpl(&Dummy);
 578
 579   // Sort by specificity, then by name.
 580   std::sort(P.Completions.begin(), P.Completions.end(),
 581             [](const MatcherCompletion &A, const MatcherCompletion &B) {
 582     if (A.Specificity != B.Specificity)
 583       return A.Specificity > B.Specificity;
 584     return A.TypedText < B.TypedText;
 585   });
 586
 587   return P.Completions;
 588 }
 589
 590 llvm::Optional<DynTypedMatcher>
 591 Parser::parseMatcherExpression(StringRef Code, Sema *S,
 592                                const NamedValueMap *NamedValues,
 593                                Diagnostics *Error) {
 594   VariantValue Value;
 595   if (!parseExpression(Code, S, NamedValues, &Value, Error))
 596     return llvm::Optional<DynTypedMatcher>();
 597   if (!Value.isMatcher()) {
 598     Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
 599     return llvm::Optional<DynTypedMatcher>();
 600   }
 601   llvm::Optional<DynTypedMatcher> Result =
 602       Value.getMatcher().getSingleMatcher();
 603   if (!Result.hasValue()) {
 604     Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
 605         << Value.getTypeAsString();
 606   }
 607   return Result;
 608 }
 609
 610 }  // namespace dynamic
 611 }  // namespace ast_matchers
 612 }  // namespace clang