contrib/llvm/tools/clang/lib/ASTMatchers/Dynamic/Parser.cpp

   1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 ///
  10 /// \file
  11 /// \brief Recursive parser implementation for the matcher expression grammar.
  12 ///
  13 //===----------------------------------------------------------------------===//
  14
  15 #include <string>
  16 #include <vector>
  17
  18 #include "clang/ASTMatchers/Dynamic/Parser.h"
  19 #include "clang/ASTMatchers/Dynamic/Registry.h"
  20 #include "clang/Basic/CharInfo.h"
  21 #include "llvm/ADT/Twine.h"
  22
  23 namespace clang {
  24 namespace ast_matchers {
  25 namespace dynamic {
  26
  27 /// \brief Simple structure to hold information for one token from the parser.
  28 struct Parser::TokenInfo {
  29   /// \brief Different possible tokens.
  30   enum TokenKind {
  31     TK_Eof = 0,
  32     TK_OpenParen = 1,
  33     TK_CloseParen = 2,
  34     TK_Comma = 3,
  35     TK_Period = 4,
  36     TK_Literal = 5,
  37     TK_Ident = 6,
  38     TK_InvalidChar = 7,
  39     TK_Error = 8
  40   };
  41
  42   /// \brief Some known identifiers.
  43   static const char* const ID_Bind;
  44
  45   TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
  46
  47   StringRef Text;
  48   TokenKind Kind;
  49   SourceRange Range;
  50   VariantValue Value;
  51 };
  52
  53 const char* const Parser::TokenInfo::ID_Bind = "bind";
  54
  55 /// \brief Simple tokenizer for the parser.
  56 class Parser::CodeTokenizer {
  57 public:
  58   explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
  59       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) {
  60     NextToken = getNextToken();
  61   }
  62
  63   /// \brief Returns but doesn't consume the next token.
  64   const TokenInfo &peekNextToken() const { return NextToken; }
  65
  66   /// \brief Consumes and returns the next token.
  67   TokenInfo consumeNextToken() {
  68     TokenInfo ThisToken = NextToken;
  69     NextToken = getNextToken();
  70     return ThisToken;
  71   }
  72
  73   TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
  74
  75 private:
  76   TokenInfo getNextToken() {
  77     consumeWhitespace();
  78     TokenInfo Result;
  79     Result.Range.Start = currentLocation();
  80
  81     if (Code.empty()) {
  82       Result.Kind = TokenInfo::TK_Eof;
  83       Result.Text = "";
  84       return Result;
  85     }
  86
  87     switch (Code[0]) {
  88     case ',':
  89       Result.Kind = TokenInfo::TK_Comma;
  90       Result.Text = Code.substr(0, 1);
  91       Code = Code.drop_front();
  92       break;
  93     case '.':
  94       Result.Kind = TokenInfo::TK_Period;
  95       Result.Text = Code.substr(0, 1);
  96       Code = Code.drop_front();
  97       break;
  98     case '(':
  99       Result.Kind = TokenInfo::TK_OpenParen;
 100       Result.Text = Code.substr(0, 1);
 101       Code = Code.drop_front();
 102       break;
 103     case ')':
 104       Result.Kind = TokenInfo::TK_CloseParen;
 105       Result.Text = Code.substr(0, 1);
 106       Code = Code.drop_front();
 107       break;
 108
 109     case '"':
 110     case '\'':
 111       // Parse a string literal.
 112       consumeStringLiteral(&Result);
 113       break;
 114
 115     case '0': case '1': case '2': case '3': case '4':
 116     case '5': case '6': case '7': case '8': case '9':
 117       // Parse an unsigned literal.
 118       consumeUnsignedLiteral(&Result);
 119       break;
 120
 121     default:
 122       if (isAlphanumeric(Code[0])) {
 123         // Parse an identifier
 124         size_t TokenLength = 1;
 125         while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength]))
 126           ++TokenLength;
 127         Result.Kind = TokenInfo::TK_Ident;
 128         Result.Text = Code.substr(0, TokenLength);
 129         Code = Code.drop_front(TokenLength);
 130       } else {
 131         Result.Kind = TokenInfo::TK_InvalidChar;
 132         Result.Text = Code.substr(0, 1);
 133         Code = Code.drop_front(1);
 134       }
 135       break;
 136     }
 137
 138     Result.Range.End = currentLocation();
 139     return Result;
 140   }
 141
 142   /// \brief Consume an unsigned literal.
 143   void consumeUnsignedLiteral(TokenInfo *Result) {
 144     unsigned Length = 1;
 145     if (Code.size() > 1) {
 146       // Consume the 'x' or 'b' radix modifier, if present.
 147       switch (toLowercase(Code[1])) {
 148       case 'x': case 'b': Length = 2;
 149       }
 150     }
 151     while (Length < Code.size() && isHexDigit(Code[Length]))
 152       ++Length;
 153
 154     Result->Text = Code.substr(0, Length);
 155     Code = Code.drop_front(Length);
 156
 157     unsigned Value;
 158     if (!Result->Text.getAsInteger(0, Value)) {
 159       Result->Kind = TokenInfo::TK_Literal;
 160       Result->Value = Value;
 161     } else {
 162       SourceRange Range;
 163       Range.Start = Result->Range.Start;
 164       Range.End = currentLocation();
 165       Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
 166       Result->Kind = TokenInfo::TK_Error;
 167     }
 168   }
 169
 170   /// \brief Consume a string literal.
 171   ///
 172   /// \c Code must be positioned at the start of the literal (the opening
 173   /// quote). Consumed until it finds the same closing quote character.
 174   void consumeStringLiteral(TokenInfo *Result) {
 175     bool InEscape = false;
 176     const char Marker = Code[0];
 177     for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
 178       if (InEscape) {
 179         InEscape = false;
 180         continue;
 181       }
 182       if (Code[Length] == '\\') {
 183         InEscape = true;
 184         continue;
 185       }
 186       if (Code[Length] == Marker) {
 187         Result->Kind = TokenInfo::TK_Literal;
 188         Result->Text = Code.substr(0, Length + 1);
 189         Result->Value = Code.substr(1, Length - 1).str();
 190         Code = Code.drop_front(Length + 1);
 191         return;
 192       }
 193     }
 194
 195     StringRef ErrorText = Code;
 196     Code = Code.drop_front(Code.size());
 197     SourceRange Range;
 198     Range.Start = Result->Range.Start;
 199     Range.End = currentLocation();
 200     Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
 201     Result->Kind = TokenInfo::TK_Error;
 202   }
 203
 204   /// \brief Consume all leading whitespace from \c Code.
 205   void consumeWhitespace() {
 206     while (!Code.empty() && isWhitespace(Code[0])) {
 207       if (Code[0] == '\n') {
 208         ++Line;
 209         StartOfLine = Code.drop_front();
 210       }
 211       Code = Code.drop_front();
 212     }
 213   }
 214
 215   SourceLocation currentLocation() {
 216     SourceLocation Location;
 217     Location.Line = Line;
 218     Location.Column = Code.data() - StartOfLine.data() + 1;
 219     return Location;
 220   }
 221
 222   StringRef Code;
 223   StringRef StartOfLine;
 224   unsigned Line;
 225   Diagnostics *Error;
 226   TokenInfo NextToken;
 227 };
 228
/// \brief Out-of-line definition of the (empty) \c Parser::Sema destructor.
Parser::Sema::~Sema() {}
 230
 231 /// \brief Parse and validate a matcher expression.
 232 /// \return \c true on success, in which case \c Value has the matcher parsed.
 233 ///   If the input is malformed, or some argument has an error, it
 234 ///   returns \c false.
 235 bool Parser::parseMatcherExpressionImpl(VariantValue *Value) {
 236   const TokenInfo NameToken = Tokenizer->consumeNextToken();
 237   assert(NameToken.Kind == TokenInfo::TK_Ident);
 238   const TokenInfo OpenToken = Tokenizer->consumeNextToken();
 239   if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
 240     Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
 241         << OpenToken.Text;
 242     return false;
 243   }
 244
 245   std::vector<ParserValue> Args;
 246   TokenInfo EndToken;
 247   while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
 248     if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
 249       // End of args.
 250       EndToken = Tokenizer->consumeNextToken();
 251       break;
 252     }
 253     if (Args.size() > 0) {
 254       // We must find a , token to continue.
 255       const TokenInfo CommaToken = Tokenizer->consumeNextToken();
 256       if (CommaToken.Kind != TokenInfo::TK_Comma) {
 257         Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
 258             << CommaToken.Text;
 259         return false;
 260       }
 261     }
 262
 263     Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
 264                              NameToken.Text, NameToken.Range, Args.size() + 1);
 265     ParserValue ArgValue;
 266     ArgValue.Text = Tokenizer->peekNextToken().Text;
 267     ArgValue.Range = Tokenizer->peekNextToken().Range;
 268     if (!parseExpressionImpl(&ArgValue.Value)) return false;
 269
 270     Args.push_back(ArgValue);
 271   }
 272
 273   if (EndToken.Kind == TokenInfo::TK_Eof) {
 274     Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
 275     return false;
 276   }
 277
 278   std::string BindID;
 279   if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
 280     // Parse .bind("foo")
 281     Tokenizer->consumeNextToken();  // consume the period.
 282     const TokenInfo BindToken = Tokenizer->consumeNextToken();
 283     const TokenInfo OpenToken = Tokenizer->consumeNextToken();
 284     const TokenInfo IDToken = Tokenizer->consumeNextToken();
 285     const TokenInfo CloseToken = Tokenizer->consumeNextToken();
 286
 287     // TODO: We could use different error codes for each/some to be more
 288     //       explicit about the syntax error.
 289     if (BindToken.Kind != TokenInfo::TK_Ident ||
 290         BindToken.Text != TokenInfo::ID_Bind) {
 291       Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
 292       return false;
 293     }
 294     if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
 295       Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
 296       return false;
 297     }
 298     if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
 299       Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
 300       return false;
 301     }
 302     if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
 303       Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
 304       return false;
 305     }
 306     BindID = IDToken.Value.getString();
 307   }
 308
 309   // Merge the start and end infos.
 310   Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
 311                            NameToken.Text, NameToken.Range);
 312   SourceRange MatcherRange = NameToken.Range;
 313   MatcherRange.End = EndToken.Range.End;
 314   VariantMatcher Result = S->actOnMatcherExpression(
 315       NameToken.Text, MatcherRange, BindID, Args, Error);
 316   if (Result.isNull()) return false;
 317
 318   *Value = Result;
 319   return true;
 320 }
 321
 322 /// \brief Parse an <Expresssion>
 323 bool Parser::parseExpressionImpl(VariantValue *Value) {
 324   switch (Tokenizer->nextTokenKind()) {
 325   case TokenInfo::TK_Literal:
 326     *Value = Tokenizer->consumeNextToken().Value;
 327     return true;
 328
 329   case TokenInfo::TK_Ident:
 330     return parseMatcherExpressionImpl(Value);
 331
 332   case TokenInfo::TK_Eof:
 333     Error->addError(Tokenizer->consumeNextToken().Range,
 334                     Error->ET_ParserNoCode);
 335     return false;
 336
 337   case TokenInfo::TK_Error:
 338     // This error was already reported by the tokenizer.
 339     return false;
 340
 341   case TokenInfo::TK_OpenParen:
 342   case TokenInfo::TK_CloseParen:
 343   case TokenInfo::TK_Comma:
 344   case TokenInfo::TK_Period:
 345   case TokenInfo::TK_InvalidChar:
 346     const TokenInfo Token = Tokenizer->consumeNextToken();
 347     Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
 348     return false;
 349   }
 350
 351   llvm_unreachable("Unknown token kind.");
 352 }
 353
 354 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
 355                Diagnostics *Error)
 356     : Tokenizer(Tokenizer), S(S), Error(Error) {}
 357
 358 class RegistrySema : public Parser::Sema {
 359 public:
 360   virtual ~RegistrySema() {}
 361   VariantMatcher actOnMatcherExpression(StringRef MatcherName,
 362                                         const SourceRange &NameRange,
 363                                         StringRef BindID,
 364                                         ArrayRef<ParserValue> Args,
 365                                         Diagnostics *Error) {
 366     if (BindID.empty()) {
 367       return Registry::constructMatcher(MatcherName, NameRange, Args, Error);
 368     } else {
 369       return Registry::constructBoundMatcher(MatcherName, NameRange, BindID,
 370                                              Args, Error);
 371     }
 372   }
 373 };
 374
 375 bool Parser::parseExpression(StringRef Code, VariantValue *Value,
 376                              Diagnostics *Error) {
 377   RegistrySema S;
 378   return parseExpression(Code, &S, Value, Error);
 379 }
 380
 381 bool Parser::parseExpression(StringRef Code, Sema *S,
 382                              VariantValue *Value, Diagnostics *Error) {
 383   CodeTokenizer Tokenizer(Code, Error);
 384   if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false;
 385   if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
 386     Error->addError(Tokenizer.peekNextToken().Range,
 387                     Error->ET_ParserTrailingCode);
 388     return false;
 389   }
 390   return true;
 391 }
 392
 393 llvm::Optional<DynTypedMatcher>
 394 Parser::parseMatcherExpression(StringRef Code, Diagnostics *Error) {
 395   RegistrySema S;
 396   return parseMatcherExpression(Code, &S, Error);
 397 }
 398
 399 llvm::Optional<DynTypedMatcher>
 400 Parser::parseMatcherExpression(StringRef Code, Parser::Sema *S,
 401                                Diagnostics *Error) {
 402   VariantValue Value;
 403   if (!parseExpression(Code, S, &Value, Error))
 404     return llvm::Optional<DynTypedMatcher>();
 405   if (!Value.isMatcher()) {
 406     Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
 407     return llvm::Optional<DynTypedMatcher>();
 408   }
 409   llvm::Optional<DynTypedMatcher> Result =
 410       Value.getMatcher().getSingleMatcher();
 411   if (!Result.hasValue()) {
 412     Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
 413         << Value.getTypeAsString();
 414   }
 415   return Result;
 416 }
 417
 418 }  // namespace dynamic
 419 }  // namespace ast_matchers
 420 }  // namespace clang