contrib/llvm/tools/clang/lib/ASTMatchers/Dynamic/Parser.cpp

   1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 ///
  10 /// \file
  11 /// \brief Recursive parser implementation for the matcher expression grammar.
  12 ///
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "clang/ASTMatchers/Dynamic/Parser.h"
  16 #include "clang/ASTMatchers/Dynamic/Registry.h"
  17 #include "clang/Basic/CharInfo.h"
  18 #include "llvm/ADT/Optional.h"
  19 #include "llvm/ADT/Twine.h"
  20 #include <string>
  21 #include <vector>
  22
  23 namespace clang {
  24 namespace ast_matchers {
  25 namespace dynamic {
  26
  27 /// \brief Simple structure to hold information for one token from the parser.
  28 struct Parser::TokenInfo {
  29   /// \brief Different possible tokens.
  30   enum TokenKind {
  31     TK_Eof,
  32     TK_OpenParen,
  33     TK_CloseParen,
  34     TK_Comma,
  35     TK_Period,
  36     TK_Literal,
  37     TK_Ident,
  38     TK_InvalidChar,
  39     TK_Error,
  40     TK_CodeCompletion
  41   };
  42
  43   /// \brief Some known identifiers.
  44   static const char* const ID_Bind;
  45
  46   TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
  47
  48   StringRef Text;
  49   TokenKind Kind;
  50   SourceRange Range;
  51   VariantValue Value;
  52 };
  53
  54 const char* const Parser::TokenInfo::ID_Bind = "bind";
  55
  56 /// \brief Simple tokenizer for the parser.
  57 class Parser::CodeTokenizer {
  58 public:
  59   explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
  60       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
  61         CodeCompletionLocation(nullptr) {
  62     NextToken = getNextToken();
  63   }
  64
  65   CodeTokenizer(StringRef MatcherCode, Diagnostics *Error,
  66                 unsigned CodeCompletionOffset)
  67       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
  68         CodeCompletionLocation(MatcherCode.data() + CodeCompletionOffset) {
  69     NextToken = getNextToken();
  70   }
  71
  72   /// \brief Returns but doesn't consume the next token.
  73   const TokenInfo &peekNextToken() const { return NextToken; }
  74
  75   /// \brief Consumes and returns the next token.
  76   TokenInfo consumeNextToken() {
  77     TokenInfo ThisToken = NextToken;
  78     NextToken = getNextToken();
  79     return ThisToken;
  80   }
  81
  82   TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
  83
  84 private:
  85   TokenInfo getNextToken() {
  86     consumeWhitespace();
  87     TokenInfo Result;
  88     Result.Range.Start = currentLocation();
  89
  90     if (CodeCompletionLocation && CodeCompletionLocation <= Code.data()) {
  91       Result.Kind = TokenInfo::TK_CodeCompletion;
  92       Result.Text = StringRef(CodeCompletionLocation, 0);
  93       CodeCompletionLocation = nullptr;
  94       return Result;
  95     }
  96
  97     if (Code.empty()) {
  98       Result.Kind = TokenInfo::TK_Eof;
  99       Result.Text = "";
 100       return Result;
 101     }
 102
 103     switch (Code[0]) {
 104     case ',':
 105       Result.Kind = TokenInfo::TK_Comma;
 106       Result.Text = Code.substr(0, 1);
 107       Code = Code.drop_front();
 108       break;
 109     case '.':
 110       Result.Kind = TokenInfo::TK_Period;
 111       Result.Text = Code.substr(0, 1);
 112       Code = Code.drop_front();
 113       break;
 114     case '(':
 115       Result.Kind = TokenInfo::TK_OpenParen;
 116       Result.Text = Code.substr(0, 1);
 117       Code = Code.drop_front();
 118       break;
 119     case ')':
 120       Result.Kind = TokenInfo::TK_CloseParen;
 121       Result.Text = Code.substr(0, 1);
 122       Code = Code.drop_front();
 123       break;
 124
 125     case '"':
 126     case '\'':
 127       // Parse a string literal.
 128       consumeStringLiteral(&Result);
 129       break;
 130
 131     case '0': case '1': case '2': case '3': case '4':
 132     case '5': case '6': case '7': case '8': case '9':
 133       // Parse an unsigned literal.
 134       consumeUnsignedLiteral(&Result);
 135       break;
 136
 137     default:
 138       if (isAlphanumeric(Code[0])) {
 139         // Parse an identifier
 140         size_t TokenLength = 1;
 141         while (1) {
 142           // A code completion location in/immediately after an identifier will
 143           // cause the portion of the identifier before the code completion
 144           // location to become a code completion token.
 145           if (CodeCompletionLocation == Code.data() + TokenLength) {
 146             CodeCompletionLocation = nullptr;
 147             Result.Kind = TokenInfo::TK_CodeCompletion;
 148             Result.Text = Code.substr(0, TokenLength);
 149             Code = Code.drop_front(TokenLength);
 150             return Result;
 151           }
 152           if (TokenLength == Code.size() || !isAlphanumeric(Code[TokenLength]))
 153             break;
 154           ++TokenLength;
 155         }
 156         Result.Kind = TokenInfo::TK_Ident;
 157         Result.Text = Code.substr(0, TokenLength);
 158         Code = Code.drop_front(TokenLength);
 159       } else {
 160         Result.Kind = TokenInfo::TK_InvalidChar;
 161         Result.Text = Code.substr(0, 1);
 162         Code = Code.drop_front(1);
 163       }
 164       break;
 165     }
 166
 167     Result.Range.End = currentLocation();
 168     return Result;
 169   }
 170
 171   /// \brief Consume an unsigned literal.
 172   void consumeUnsignedLiteral(TokenInfo *Result) {
 173     unsigned Length = 1;
 174     if (Code.size() > 1) {
 175       // Consume the 'x' or 'b' radix modifier, if present.
 176       switch (toLowercase(Code[1])) {
 177       case 'x': case 'b': Length = 2;
 178       }
 179     }
 180     while (Length < Code.size() && isHexDigit(Code[Length]))
 181       ++Length;
 182
 183     Result->Text = Code.substr(0, Length);
 184     Code = Code.drop_front(Length);
 185
 186     unsigned Value;
 187     if (!Result->Text.getAsInteger(0, Value)) {
 188       Result->Kind = TokenInfo::TK_Literal;
 189       Result->Value = Value;
 190     } else {
 191       SourceRange Range;
 192       Range.Start = Result->Range.Start;
 193       Range.End = currentLocation();
 194       Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
 195       Result->Kind = TokenInfo::TK_Error;
 196     }
 197   }
 198
 199   /// \brief Consume a string literal.
 200   ///
 201   /// \c Code must be positioned at the start of the literal (the opening
 202   /// quote). Consumed until it finds the same closing quote character.
 203   void consumeStringLiteral(TokenInfo *Result) {
 204     bool InEscape = false;
 205     const char Marker = Code[0];
 206     for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
 207       if (InEscape) {
 208         InEscape = false;
 209         continue;
 210       }
 211       if (Code[Length] == '\\') {
 212         InEscape = true;
 213         continue;
 214       }
 215       if (Code[Length] == Marker) {
 216         Result->Kind = TokenInfo::TK_Literal;
 217         Result->Text = Code.substr(0, Length + 1);
 218         Result->Value = Code.substr(1, Length - 1).str();
 219         Code = Code.drop_front(Length + 1);
 220         return;
 221       }
 222     }
 223
 224     StringRef ErrorText = Code;
 225     Code = Code.drop_front(Code.size());
 226     SourceRange Range;
 227     Range.Start = Result->Range.Start;
 228     Range.End = currentLocation();
 229     Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
 230     Result->Kind = TokenInfo::TK_Error;
 231   }
 232
 233   /// \brief Consume all leading whitespace from \c Code.
 234   void consumeWhitespace() {
 235     while (!Code.empty() && isWhitespace(Code[0])) {
 236       if (Code[0] == '\n') {
 237         ++Line;
 238         StartOfLine = Code.drop_front();
 239       }
 240       Code = Code.drop_front();
 241     }
 242   }
 243
 244   SourceLocation currentLocation() {
 245     SourceLocation Location;
 246     Location.Line = Line;
 247     Location.Column = Code.data() - StartOfLine.data() + 1;
 248     return Location;
 249   }
 250
 251   StringRef Code;
 252   StringRef StartOfLine;
 253   unsigned Line;
 254   Diagnostics *Error;
 255   TokenInfo NextToken;
 256   const char *CodeCompletionLocation;
 257 };
 258
 259 Parser::Sema::~Sema() {}
 260
 261 VariantValue Parser::Sema::getNamedValue(StringRef Name) {
 262   return VariantValue();
 263 }
 264
 265 struct Parser::ScopedContextEntry {
 266   Parser *P;
 267
 268   ScopedContextEntry(Parser *P, MatcherCtor C) : P(P) {
 269     P->ContextStack.push_back(std::make_pair(C, 0u));
 270   }
 271
 272   ~ScopedContextEntry() {
 273     P->ContextStack.pop_back();
 274   }
 275
 276   void nextArg() {
 277     ++P->ContextStack.back().second;
 278   }
 279 };
 280
 281 /// \brief Parse expressions that start with an identifier.
 282 ///
 283 /// This function can parse named values and matchers.
 284 /// In case of failure it will try to determine the user's intent to give
 285 /// an appropriate error message.
 286 bool Parser::parseIdentifierPrefixImpl(VariantValue *Value) {
 287   const TokenInfo NameToken = Tokenizer->consumeNextToken();
 288
 289   if (Tokenizer->nextTokenKind() != TokenInfo::TK_OpenParen) {
 290     // Parse as a named value.
 291     if (const VariantValue NamedValue = S->getNamedValue(NameToken.Text)) {
 292       *Value = NamedValue;
 293       return true;
 294     }
 295     // If the syntax is correct and the name is not a matcher either, report
 296     // unknown named value.
 297     if ((Tokenizer->nextTokenKind() == TokenInfo::TK_Comma ||
 298          Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen ||
 299          Tokenizer->nextTokenKind() == TokenInfo::TK_Eof) &&
 300         !S->lookupMatcherCtor(NameToken.Text)) {
 301       Error->addError(NameToken.Range, Error->ET_RegistryValueNotFound)
 302           << NameToken.Text;
 303       return false;
 304     }
 305     // Otherwise, fallback to the matcher parser.
 306   }
 307
 308   // Parse as a matcher expression.
 309   return parseMatcherExpressionImpl(NameToken, Value);
 310 }
 311
 312 /// \brief Parse and validate a matcher expression.
 313 /// \return \c true on success, in which case \c Value has the matcher parsed.
 314 ///   If the input is malformed, or some argument has an error, it
 315 ///   returns \c false.
 316 bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken,
 317                                         VariantValue *Value) {
 318   assert(NameToken.Kind == TokenInfo::TK_Ident);
 319   const TokenInfo OpenToken = Tokenizer->consumeNextToken();
 320   if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
 321     Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
 322         << OpenToken.Text;
 323     return false;
 324   }
 325
 326   llvm::Optional<MatcherCtor> Ctor = S->lookupMatcherCtor(NameToken.Text);
 327
 328   if (!Ctor) {
 329     Error->addError(NameToken.Range, Error->ET_RegistryMatcherNotFound)
 330         << NameToken.Text;
 331     // Do not return here. We need to continue to give completion suggestions.
 332   }
 333
 334   std::vector<ParserValue> Args;
 335   TokenInfo EndToken;
 336
 337   {
 338     ScopedContextEntry SCE(this, Ctor ? *Ctor : nullptr);
 339
 340     while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
 341       if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
 342         // End of args.
 343         EndToken = Tokenizer->consumeNextToken();
 344         break;
 345       }
 346       if (Args.size() > 0) {
 347         // We must find a , token to continue.
 348         const TokenInfo CommaToken = Tokenizer->consumeNextToken();
 349         if (CommaToken.Kind != TokenInfo::TK_Comma) {
 350           Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
 351               << CommaToken.Text;
 352           return false;
 353         }
 354       }
 355
 356       Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
 357                                NameToken.Text, NameToken.Range,
 358                                Args.size() + 1);
 359       ParserValue ArgValue;
 360       ArgValue.Text = Tokenizer->peekNextToken().Text;
 361       ArgValue.Range = Tokenizer->peekNextToken().Range;
 362       if (!parseExpressionImpl(&ArgValue.Value)) {
 363         return false;
 364       }
 365
 366       Args.push_back(ArgValue);
 367       SCE.nextArg();
 368     }
 369   }
 370
 371   if (EndToken.Kind == TokenInfo::TK_Eof) {
 372     Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
 373     return false;
 374   }
 375
 376   std::string BindID;
 377   if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
 378     // Parse .bind("foo")
 379     Tokenizer->consumeNextToken();  // consume the period.
 380     const TokenInfo BindToken = Tokenizer->consumeNextToken();
 381     if (BindToken.Kind == TokenInfo::TK_CodeCompletion) {
 382       addCompletion(BindToken, "bind(\"", "bind");
 383       return false;
 384     }
 385
 386     const TokenInfo OpenToken = Tokenizer->consumeNextToken();
 387     const TokenInfo IDToken = Tokenizer->consumeNextToken();
 388     const TokenInfo CloseToken = Tokenizer->consumeNextToken();
 389
 390     // TODO: We could use different error codes for each/some to be more
 391     //       explicit about the syntax error.
 392     if (BindToken.Kind != TokenInfo::TK_Ident ||
 393         BindToken.Text != TokenInfo::ID_Bind) {
 394       Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
 395       return false;
 396     }
 397     if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
 398       Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
 399       return false;
 400     }
 401     if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
 402       Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
 403       return false;
 404     }
 405     if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
 406       Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
 407       return false;
 408     }
 409     BindID = IDToken.Value.getString();
 410   }
 411
 412   if (!Ctor)
 413     return false;
 414
 415   // Merge the start and end infos.
 416   Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
 417                            NameToken.Text, NameToken.Range);
 418   SourceRange MatcherRange = NameToken.Range;
 419   MatcherRange.End = EndToken.Range.End;
 420   VariantMatcher Result = S->actOnMatcherExpression(
 421       *Ctor, MatcherRange, BindID, Args, Error);
 422   if (Result.isNull()) return false;
 423
 424   *Value = Result;
 425   return true;
 426 }
 427
 428 // If the prefix of this completion matches the completion token, add it to
 429 // Completions minus the prefix.
 430 void Parser::addCompletion(const TokenInfo &CompToken, StringRef TypedText,
 431                            StringRef Decl) {
 432   if (TypedText.size() >= CompToken.Text.size() &&
 433       TypedText.substr(0, CompToken.Text.size()) == CompToken.Text) {
 434     Completions.push_back(
 435         MatcherCompletion(TypedText.substr(CompToken.Text.size()), Decl));
 436   }
 437 }
 438
 439 void Parser::addExpressionCompletions() {
 440   const TokenInfo CompToken = Tokenizer->consumeNextToken();
 441   assert(CompToken.Kind == TokenInfo::TK_CodeCompletion);
 442
 443   // We cannot complete code if there is an invalid element on the context
 444   // stack.
 445   for (ContextStackTy::iterator I = ContextStack.begin(),
 446                                 E = ContextStack.end();
 447        I != E; ++I) {
 448     if (!I->first)
 449       return;
 450   }
 451
 452   std::vector<MatcherCompletion> RegCompletions =
 453       Registry::getCompletions(ContextStack);
 454   for (std::vector<MatcherCompletion>::iterator I = RegCompletions.begin(),
 455                                                 E = RegCompletions.end();
 456        I != E; ++I) {
 457     addCompletion(CompToken, I->TypedText, I->MatcherDecl);
 458   }
 459 }
 460
 461 /// \brief Parse an <Expresssion>
 462 bool Parser::parseExpressionImpl(VariantValue *Value) {
 463   switch (Tokenizer->nextTokenKind()) {
 464   case TokenInfo::TK_Literal:
 465     *Value = Tokenizer->consumeNextToken().Value;
 466     return true;
 467
 468   case TokenInfo::TK_Ident:
 469     return parseIdentifierPrefixImpl(Value);
 470
 471   case TokenInfo::TK_CodeCompletion:
 472     addExpressionCompletions();
 473     return false;
 474
 475   case TokenInfo::TK_Eof:
 476     Error->addError(Tokenizer->consumeNextToken().Range,
 477                     Error->ET_ParserNoCode);
 478     return false;
 479
 480   case TokenInfo::TK_Error:
 481     // This error was already reported by the tokenizer.
 482     return false;
 483
 484   case TokenInfo::TK_OpenParen:
 485   case TokenInfo::TK_CloseParen:
 486   case TokenInfo::TK_Comma:
 487   case TokenInfo::TK_Period:
 488   case TokenInfo::TK_InvalidChar:
 489     const TokenInfo Token = Tokenizer->consumeNextToken();
 490     Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
 491     return false;
 492   }
 493
 494   llvm_unreachable("Unknown token kind.");
 495 }
 496
 497 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
 498                Diagnostics *Error)
 499     : Tokenizer(Tokenizer), S(S), Error(Error) {}
 500
 501 Parser::RegistrySema::~RegistrySema() {}
 502
 503 llvm::Optional<MatcherCtor>
 504 Parser::RegistrySema::lookupMatcherCtor(StringRef MatcherName) {
 505   return Registry::lookupMatcherCtor(MatcherName);
 506 }
 507
 508 VariantMatcher Parser::RegistrySema::actOnMatcherExpression(
 509     MatcherCtor Ctor, const SourceRange &NameRange, StringRef BindID,
 510     ArrayRef<ParserValue> Args, Diagnostics *Error) {
 511   if (BindID.empty()) {
 512     return Registry::constructMatcher(Ctor, NameRange, Args, Error);
 513   } else {
 514     return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args,
 515                                            Error);
 516   }
 517 }
 518
 519 bool Parser::parseExpression(StringRef Code, VariantValue *Value,
 520                              Diagnostics *Error) {
 521   RegistrySema S;
 522   return parseExpression(Code, &S, Value, Error);
 523 }
 524
 525 bool Parser::parseExpression(StringRef Code, Sema *S,
 526                              VariantValue *Value, Diagnostics *Error) {
 527   CodeTokenizer Tokenizer(Code, Error);
 528   if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false;
 529   if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
 530     Error->addError(Tokenizer.peekNextToken().Range,
 531                     Error->ET_ParserTrailingCode);
 532     return false;
 533   }
 534   return true;
 535 }
 536
 537 std::vector<MatcherCompletion>
 538 Parser::completeExpression(StringRef Code, unsigned CompletionOffset) {
 539   Diagnostics Error;
 540   CodeTokenizer Tokenizer(Code, &Error, CompletionOffset);
 541   RegistrySema S;
 542   Parser P(&Tokenizer, &S, &Error);
 543   VariantValue Dummy;
 544   P.parseExpressionImpl(&Dummy);
 545
 546   return P.Completions;
 547 }
 548
 549 llvm::Optional<DynTypedMatcher>
 550 Parser::parseMatcherExpression(StringRef Code, Diagnostics *Error) {
 551   RegistrySema S;
 552   return parseMatcherExpression(Code, &S, Error);
 553 }
 554
 555 llvm::Optional<DynTypedMatcher>
 556 Parser::parseMatcherExpression(StringRef Code, Parser::Sema *S,
 557                                Diagnostics *Error) {
 558   VariantValue Value;
 559   if (!parseExpression(Code, S, &Value, Error))
 560     return llvm::Optional<DynTypedMatcher>();
 561   if (!Value.isMatcher()) {
 562     Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
 563     return llvm::Optional<DynTypedMatcher>();
 564   }
 565   llvm::Optional<DynTypedMatcher> Result =
 566       Value.getMatcher().getSingleMatcher();
 567   if (!Result.hasValue()) {
 568     Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
 569         << Value.getTypeAsString();
 570   }
 571   return Result;
 572 }
 573
 574 }  // namespace dynamic
 575 }  // namespace ast_matchers
 576 }  // namespace clang