//===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Recursive parser implementation for the matcher expression grammar.
///
//===----------------------------------------------------------------------===//

#include <string>
#include <vector>

#include "clang/ASTMatchers/Dynamic/Parser.h"
#include "clang/ASTMatchers/Dynamic/Registry.h"
#include "clang/Basic/CharInfo.h"
#include "llvm/ADT/Twine.h"

namespace clang {
namespace ast_matchers {
namespace dynamic {

/// \brief Simple structure to hold information for one token from the parser.
struct Parser::TokenInfo {
  /// \brief Different possible tokens.
  enum TokenKind {
    TK_Eof = 0,
    TK_OpenParen = 1,
    TK_CloseParen = 2,
    TK_Comma = 3,
    TK_Period = 4,
    TK_Literal = 5,
    TK_Ident = 6,
    TK_InvalidChar = 7,
    TK_Error = 8
  };

  /// \brief Some known identifiers.
  static const char* const ID_Bind;

  TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}

  StringRef Text;
  TokenKind Kind;
  SourceRange Range;
  VariantValue Value;
};

const char* const Parser::TokenInfo::ID_Bind = "bind";

/// \brief Simple tokenizer for the parser.
class Parser::CodeTokenizer {
public:
  explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
      : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) {
    NextToken = getNextToken();
  }

  /// \brief Returns but doesn't consume the next token.
  const TokenInfo &peekNextToken() const { return NextToken; }

  /// \brief Consumes and returns the next token.
  TokenInfo consumeNextToken() {
    TokenInfo ThisToken = NextToken;
    NextToken = getNextToken();
    return ThisToken;
  }

  TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }

private:
  TokenInfo getNextToken() {
    consumeWhitespace();
    TokenInfo Result;
    Result.Range.Start = currentLocation();

    if (Code.empty()) {
      Result.Kind = TokenInfo::TK_Eof;
      Result.Text = "";
      return Result;
    }

    switch (Code[0]) {
    case ',':
      Result.Kind = TokenInfo::TK_Comma;
      Result.Text = Code.substr(0, 1);
      Code = Code.drop_front();
      break;
    case '.':
      Result.Kind = TokenInfo::TK_Period;
      Result.Text = Code.substr(0, 1);
      Code = Code.drop_front();
      break;
    case '(':
      Result.Kind = TokenInfo::TK_OpenParen;
      Result.Text = Code.substr(0, 1);
      Code = Code.drop_front();
      break;
    case ')':
      Result.Kind = TokenInfo::TK_CloseParen;
      Result.Text = Code.substr(0, 1);
      Code = Code.drop_front();
      break;

    case '"':
    case '\'':
      // Parse a string literal.
      consumeStringLiteral(&Result);
      break;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      // Parse an unsigned literal.
      consumeUnsignedLiteral(&Result);
      break;

    default:
      if (isAlphanumeric(Code[0])) {
        // Parse an identifier.
        size_t TokenLength = 1;
        while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength]))
          ++TokenLength;
        Result.Kind = TokenInfo::TK_Ident;
        Result.Text = Code.substr(0, TokenLength);
        Code = Code.drop_front(TokenLength);
      } else {
        Result.Kind = TokenInfo::TK_InvalidChar;
        Result.Text = Code.substr(0, 1);
        Code = Code.drop_front(1);
      }
      break;
    }

    Result.Range.End = currentLocation();
    return Result;
  }

  /// \brief Consume an unsigned literal.
  void consumeUnsignedLiteral(TokenInfo *Result) {
    unsigned Length = 1;
    if (Code.size() > 1) {
      // Consume the 'x' or 'b' radix modifier, if present.
      switch (toLowercase(Code[1])) {
      case 'x':
      case 'b':
        Length = 2;
      }
    }
    while (Length < Code.size() && isHexDigit(Code[Length]))
      ++Length;

    Result->Text = Code.substr(0, Length);
    Code = Code.drop_front(Length);

    unsigned Value;
    if (!Result->Text.getAsInteger(0, Value)) {
      Result->Kind = TokenInfo::TK_Literal;
      Result->Value = Value;
    } else {
      SourceRange Range;
      Range.Start = Result->Range.Start;
      Range.End = currentLocation();
      Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
      Result->Kind = TokenInfo::TK_Error;
    }
  }

  /// \brief Consume a string literal.
  ///
  /// \c Code must be positioned at the start of the literal (the opening
  /// quote). Consumes it until it finds the same closing quote character.
  void consumeStringLiteral(TokenInfo *Result) {
    bool InEscape = false;
    const char Marker = Code[0];
    for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
      if (InEscape) {
        InEscape = false;
        continue;
      }
      if (Code[Length] == '\\') {
        InEscape = true;
        continue;
      }
      if (Code[Length] == Marker) {
        Result->Kind = TokenInfo::TK_Literal;
        Result->Text = Code.substr(0, Length + 1);
        Result->Value = Code.substr(1, Length - 1).str();
        Code = Code.drop_front(Length + 1);
        return;
      }
    }

    StringRef ErrorText = Code;
    Code = Code.drop_front(Code.size());
    SourceRange Range;
    Range.Start = Result->Range.Start;
    Range.End = currentLocation();
    Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
    Result->Kind = TokenInfo::TK_Error;
  }

  /// \brief Consume all leading whitespace from \c Code.
  void consumeWhitespace() {
    while (!Code.empty() && isWhitespace(Code[0])) {
      if (Code[0] == '\n') {
        ++Line;
        StartOfLine = Code.drop_front();
      }
      Code = Code.drop_front();
    }
  }

  SourceLocation currentLocation() {
    SourceLocation Location;
    Location.Line = Line;
    Location.Column = Code.data() - StartOfLine.data() + 1;
    return Location;
  }

  StringRef Code;
  StringRef StartOfLine;
  unsigned Line;
  Diagnostics *Error;
  TokenInfo NextToken;
};

Parser::Sema::~Sema() {}

/// \brief Parse and validate a matcher expression.
/// \return \c true on success, in which case \c Value has the matcher parsed.
///   If the input is malformed, or some argument has an error, it
///   returns \c false.
bool Parser::parseMatcherExpressionImpl(VariantValue *Value) {
  const TokenInfo NameToken = Tokenizer->consumeNextToken();
  assert(NameToken.Kind == TokenInfo::TK_Ident);
  const TokenInfo OpenToken = Tokenizer->consumeNextToken();
  if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
    Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
        << OpenToken.Text;
    return false;
  }

  std::vector<ParserValue> Args;
  TokenInfo EndToken;
  while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
    if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
      // End of args.
      EndToken = Tokenizer->consumeNextToken();
      break;
    }
    if (Args.size() > 0) {
      // We must find a , token to continue.
      const TokenInfo CommaToken = Tokenizer->consumeNextToken();
      if (CommaToken.Kind != TokenInfo::TK_Comma) {
        Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
            << CommaToken.Text;
        return false;
      }
    }

    Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
                             NameToken.Text, NameToken.Range, Args.size() + 1);
    ParserValue ArgValue;
    ArgValue.Text = Tokenizer->peekNextToken().Text;
    ArgValue.Range = Tokenizer->peekNextToken().Range;
    if (!parseExpressionImpl(&ArgValue.Value))
      return false;

    Args.push_back(ArgValue);
  }

  if (EndToken.Kind == TokenInfo::TK_Eof) {
    Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
    return false;
  }

  std::string BindID;
  if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
    // Parse .bind("foo")
    Tokenizer->consumeNextToken();  // consume the period.
    const TokenInfo BindToken = Tokenizer->consumeNextToken();
    const TokenInfo OpenToken = Tokenizer->consumeNextToken();
    const TokenInfo IDToken = Tokenizer->consumeNextToken();
    const TokenInfo CloseToken = Tokenizer->consumeNextToken();

    // TODO: We could use different error codes for each/some to be more
    //       explicit about the syntax error.
    if (BindToken.Kind != TokenInfo::TK_Ident ||
        BindToken.Text != TokenInfo::ID_Bind) {
      Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
      return false;
    }
    if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
      Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
      return false;
    }
    if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
      Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
      return false;
    }
    if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
      Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
      return false;
    }
    BindID = IDToken.Value.getString();
  }

  // Merge the start and end infos.
  Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
                           NameToken.Text, NameToken.Range);
  SourceRange MatcherRange = NameToken.Range;
  MatcherRange.End = EndToken.Range.End;
  VariantMatcher Result = S->actOnMatcherExpression(
      NameToken.Text, MatcherRange, BindID, Args, Error);
  if (Result.isNull()) return false;

  *Value = Result;
  return true;
}

/// \brief Parse an expression.
bool Parser::parseExpressionImpl(VariantValue *Value) {
  switch (Tokenizer->nextTokenKind()) {
  case TokenInfo::TK_Literal:
    *Value = Tokenizer->consumeNextToken().Value;
    return true;

  case TokenInfo::TK_Ident:
    return parseMatcherExpressionImpl(Value);

  case TokenInfo::TK_Eof:
    Error->addError(Tokenizer->consumeNextToken().Range,
                    Error->ET_ParserNoCode);
    return false;

  case TokenInfo::TK_Error:
    // This error was already reported by the tokenizer.
    return false;

  case TokenInfo::TK_OpenParen:
  case TokenInfo::TK_CloseParen:
  case TokenInfo::TK_Comma:
  case TokenInfo::TK_Period:
  case TokenInfo::TK_InvalidChar:
    const TokenInfo Token = Tokenizer->consumeNextToken();
    Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
    return false;
  }

  llvm_unreachable("Unknown token kind.");
}

Parser::Parser(CodeTokenizer *Tokenizer, Sema *S, Diagnostics *Error)
    : Tokenizer(Tokenizer), S(S), Error(Error) {}

class RegistrySema : public Parser::Sema {
public:
  virtual ~RegistrySema() {}
  VariantMatcher actOnMatcherExpression(StringRef MatcherName,
                                        const SourceRange &NameRange,
                                        StringRef BindID,
                                        ArrayRef<ParserValue> Args,
                                        Diagnostics *Error) {
    if (BindID.empty()) {
      return Registry::constructMatcher(MatcherName, NameRange, Args, Error);
    } else {
      return Registry::constructBoundMatcher(MatcherName, NameRange, BindID,
                                             Args, Error);
    }
  }
};

bool Parser::parseExpression(StringRef Code, VariantValue *Value,
                             Diagnostics *Error) {
  RegistrySema S;
  return parseExpression(Code, &S, Value, Error);
}

bool Parser::parseExpression(StringRef Code, Sema *S,
                             VariantValue *Value, Diagnostics *Error) {
  CodeTokenizer Tokenizer(Code, Error);
  if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false;
  if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
    Error->addError(Tokenizer.peekNextToken().Range,
                    Error->ET_ParserTrailingCode);
    return false;
  }
  return true;
}

llvm::Optional<DynTypedMatcher>
Parser::parseMatcherExpression(StringRef Code, Diagnostics *Error) {
  RegistrySema S;
  return parseMatcherExpression(Code, &S, Error);
}

llvm::Optional<DynTypedMatcher>
Parser::parseMatcherExpression(StringRef Code, Parser::Sema *S,
                               Diagnostics *Error) {
  VariantValue Value;
  if (!parseExpression(Code, S, &Value, Error))
    return llvm::Optional<DynTypedMatcher>();
  if (!Value.isMatcher()) {
    Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
    return llvm::Optional<DynTypedMatcher>();
  }
  llvm::Optional<DynTypedMatcher> Result =
      Value.getMatcher().getSingleMatcher();
  if (!Result.hasValue()) {
    Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
        << Value.getTypeAsString();
  }
  return Result;
}

}  // namespace dynamic
}  // namespace ast_matchers
}  // namespace clang
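// Usage sketch: driving the parser through its public entry point. The
// matcher string "functionDecl().bind(\"f\")" and the use of
// Diagnostics::toString() for reporting are illustrative assumptions; any
// parse or registry error is recorded in the Diagnostics object passed to
// parseMatcherExpression().
//
//   using namespace clang::ast_matchers::dynamic;
//
//   Diagnostics Err;
//   llvm::Optional<DynTypedMatcher> Matcher =
//       Parser::parseMatcherExpression("functionDecl().bind(\"f\")", &Err);
//   if (!Matcher.hasValue())
//     llvm::errs() << Err.toString() << "\n";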