contrib/llvm-project/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp

   1 //===- DependencyDirectivesSourceMinimizer.cpp -  -------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 ///
   9 /// \file
  10 /// This is the implementation for minimizing header and source files to the
  11 /// minimum necessary preprocessor directives for evaluating includes. It
  12 /// reduces the source down to #define, #include, #import, @import, and any
  13 /// conditional preprocessor logic that contains one of those.
  14 ///
  15 //===----------------------------------------------------------------------===//
  16
  17 #include "clang/Lex/DependencyDirectivesSourceMinimizer.h"
  18 #include "clang/Basic/CharInfo.h"
  19 #include "clang/Basic/Diagnostic.h"
  20 #include "clang/Lex/LexDiagnostic.h"
  21 #include "llvm/ADT/StringMap.h"
  22 #include "llvm/ADT/StringSwitch.h"
  23 #include "llvm/Support/MemoryBuffer.h"
  24
  25 using namespace llvm;
  26 using namespace clang;
  27 using namespace clang::minimize_source_to_dependency_directives;
  28
  29 namespace {
  30
/// Scanner state for reducing one source buffer to its dependency directives.
///
/// Minimized text is appended to \c Out; each directive that is kept also
/// records a \c Token holding its kind and the offset in \c Out where its text
/// starts.
struct Minimizer {
  /// Minimized output.
  SmallVectorImpl<char> &Out;
  /// The known tokens encountered during the minimization.
  SmallVectorImpl<Token> &Tokens;

  /// Binds the minimizer to its output buffers and input text.
  ///
  /// \param Diags May be null, in which case errors are not reported (see
  /// reportError).
  /// \param InputSourceLoc Location of the start of \p Input, used as the base
  /// for diagnostic locations.
  Minimizer(SmallVectorImpl<char> &Out, SmallVectorImpl<Token> &Tokens,
            StringRef Input, DiagnosticsEngine *Diags,
            SourceLocation InputSourceLoc)
      : Out(Out), Tokens(Tokens), Input(Input), Diags(Diags),
        InputSourceLoc(InputSourceLoc) {}

  /// Lex the provided source and emit the minimized output.
  ///
  /// \returns True on error.
  bool minimize();

private:
  /// Result of lexing an identifier.
  struct IdInfo {
    /// One past the last character of the identifier in the input.
    const char *Last;
    /// The identifier's spelling (with any line continuations removed).
    StringRef Name;
  };

  /// Lex an identifier.
  ///
  /// \pre First points at a valid identifier head.
  LLVM_NODISCARD IdInfo lexIdentifier(const char *First, const char *const End);
  LLVM_NODISCARD bool isNextIdentifier(StringRef Id, const char *&First,
                                       const char *const End);
  // The lex* members below all return true on error and advance First.
  LLVM_NODISCARD bool minimizeImpl(const char *First, const char *const End);
  LLVM_NODISCARD bool lexPPLine(const char *&First, const char *const End);
  LLVM_NODISCARD bool lexAt(const char *&First, const char *const End);
  LLVM_NODISCARD bool lexModule(const char *&First, const char *const End);
  LLVM_NODISCARD bool lexDefine(const char *&First, const char *const End);
  LLVM_NODISCARD bool lexPragma(const char *&First, const char *const End);
  LLVM_NODISCARD bool lexEndif(const char *&First, const char *const End);
  LLVM_NODISCARD bool lexDefault(TokenKind Kind, StringRef Directive,
                                 const char *&First, const char *const End);
  /// Records a token of kind \p K that starts at the current end of \c Out.
  Token &makeToken(TokenKind K) {
    Tokens.emplace_back(K, Out.size());
    return Tokens.back();
  }
  /// Drops the most recent token together with all output emitted since it
  /// was created. Requires at least one token to be present.
  void popToken() {
    Out.resize(Tokens.back().Offset);
    Tokens.pop_back();
  }
  /// Kind of the most recent token, or pp_none when no token exists yet.
  TokenKind top() const { return Tokens.empty() ? pp_none : Tokens.back().K; }

  /// Appends a single byte to the output; returns *this for chaining.
  Minimizer &put(char Byte) {
    Out.push_back(Byte);
    return *this;
  }
  /// Appends a string to the output; returns *this for chaining.
  Minimizer &append(StringRef S) { return append(S.begin(), S.end()); }
  /// Appends the range [First, Last) to the output; returns *this.
  Minimizer &append(const char *First, const char *Last) {
    Out.append(First, Last);
    return *this;
  }

  void printToNewline(const char *&First, const char *const End);
  void printAdjacentModuleNameParts(const char *&First, const char *const End);
  LLVM_NODISCARD bool printAtImportBody(const char *&First,
                                        const char *const End);
  void printDirectiveBody(const char *&First, const char *const End);
  void printAdjacentMacroArgs(const char *&First, const char *const End);
  LLVM_NODISCARD bool printMacroArgs(const char *&First, const char *const End);

  /// Reports a diagnostic if the diagnostic engine is provided. Always returns
  /// true at the end.
  bool reportError(const char *CurPtr, unsigned Err);

  /// Owns the spelling of identifiers that were split across lines, so the
  /// StringRef returned by lexIdentifier for them remains valid.
  StringMap<char> SplitIds;
  /// The text being minimized.
  StringRef Input;
  /// Optional diagnostics sink; may be null.
  DiagnosticsEngine *Diags;
  /// Source location corresponding to the first byte of Input.
  SourceLocation InputSourceLoc;
};
 106
 107 } // end anonymous namespace
 108
 109 bool Minimizer::reportError(const char *CurPtr, unsigned Err) {
 110   if (!Diags)
 111     return true;
 112   assert(CurPtr >= Input.data() && "invalid buffer ptr");
 113   Diags->Report(InputSourceLoc.getLocWithOffset(CurPtr - Input.data()), Err);
 114   return true;
 115 }
 116
 117 static void skipOverSpaces(const char *&First, const char *const End) {
 118   while (First != End && isHorizontalWhitespace(*First))
 119     ++First;
 120 }
 121
 122 LLVM_NODISCARD static bool isRawStringLiteral(const char *First,
 123                                               const char *Current) {
 124   assert(First <= Current);
 125
 126   // Check if we can even back up.
 127   if (*Current != '"' || First == Current)
 128     return false;
 129
 130   // Check for an "R".
 131   --Current;
 132   if (*Current != 'R')
 133     return false;
 134   if (First == Current || !isIdentifierBody(*--Current))
 135     return true;
 136
 137   // Check for a prefix of "u", "U", or "L".
 138   if (*Current == 'u' || *Current == 'U' || *Current == 'L')
 139     return First == Current || !isIdentifierBody(*--Current);
 140
 141   // Check for a prefix of "u8".
 142   if (*Current != '8' || First == Current || *Current-- != 'u')
 143     return false;
 144   return First == Current || !isIdentifierBody(*--Current);
 145 }
 146
 147 static void skipRawString(const char *&First, const char *const End) {
 148   assert(First[0] == '"');
 149   assert(First[-1] == 'R');
 150
 151   const char *Last = ++First;
 152   while (Last != End && *Last != '(')
 153     ++Last;
 154   if (Last == End) {
 155     First = Last; // Hit the end... just give up.
 156     return;
 157   }
 158
 159   StringRef Terminator(First, Last - First);
 160   for (;;) {
 161     // Move First to just past the next ")".
 162     First = Last;
 163     while (First != End && *First != ')')
 164       ++First;
 165     if (First == End)
 166       return;
 167     ++First;
 168
 169     // Look ahead for the terminator sequence.
 170     Last = First;
 171     while (Last != End && size_t(Last - First) < Terminator.size() &&
 172            Terminator[Last - First] == *Last)
 173       ++Last;
 174
 175     // Check if we hit it (or the end of the file).
 176     if (Last == End) {
 177       First = Last;
 178       return;
 179     }
 180     if (size_t(Last - First) < Terminator.size())
 181       continue;
 182     if (*Last != '"')
 183       continue;
 184     First = Last + 1;
 185     return;
 186   }
 187 }
 188
 189 // Returns the length of EOL, either 0 (no end-of-line), 1 (\n) or 2 (\r\n)
 190 static unsigned isEOL(const char *First, const char *const End) {
 191   if (First == End)
 192     return 0;
 193   if (End - First > 1 && isVerticalWhitespace(First[0]) &&
 194       isVerticalWhitespace(First[1]) && First[0] != First[1])
 195     return 2;
 196   return !!isVerticalWhitespace(First[0]);
 197 }
 198
/// Advances \p First past a quoted literal: a string ("..."), a character
/// literal ('...'), or an angle-bracketed name (<...>).
///
/// \pre \p First points at the opening delimiter.
/// On return \p First is just past the closing delimiter, or on the
/// vertical-whitespace character that cut the literal short, or at \p End.
static void skipString(const char *&First, const char *const End) {
  assert(*First == '\'' || *First == '"' || *First == '<');
  // '<' is closed by '>'; a quote is closed by the same quote character.
  const char Terminator = *First == '<' ? '>' : *First;
  for (++First; First != End && *First != Terminator; ++First) {
    // String and character literals don't extend past the end of the line.
    if (isVerticalWhitespace(*First))
      return;
    if (*First != '\\')
      continue;
    // Skip past backslash to the next character. This ensures that the
    // character right after it is skipped as well, which matters if it's
    // the terminator.
    if (++First == End)
      return;
    if (!isWhitespace(*First))
      continue;
    // Whitespace after the backslash might indicate a line continuation.
    const char *FirstAfterBackslashPastSpace = First;
    skipOverSpaces(FirstAfterBackslashPastSpace, End);
    if (unsigned NLSize = isEOL(FirstAfterBackslashPastSpace, End)) {
      // Advance the character pointer to the next line for the next
      // iteration.
      // The "- 1" compensates for the ++First in the loop header.
      First = FirstAfterBackslashPastSpace + NLSize - 1;
    }
  }
  if (First != End)
    ++First; // Finish off the string.
}
 227
 228 // Returns the length of the skipped newline
 229 static unsigned skipNewline(const char *&First, const char *End) {
 230   if (First == End)
 231     return 0;
 232   assert(isVerticalWhitespace(*First));
 233   unsigned Len = isEOL(First, End);
 234   assert(Len && "expected newline");
 235   First += Len;
 236   return Len;
 237 }
 238
 239 static bool wasLineContinuation(const char *First, unsigned EOLLen) {
 240   return *(First - (int)EOLLen - 1) == '\\';
 241 }
 242
 243 static void skipToNewlineRaw(const char *&First, const char *const End) {
 244   for (;;) {
 245     if (First == End)
 246       return;
 247
 248     unsigned Len = isEOL(First, End);
 249     if (Len)
 250       return;
 251
 252     do {
 253       if (++First == End)
 254         return;
 255       Len = isEOL(First, End);
 256     } while (!Len);
 257
 258     if (First[-1] != '\\')
 259       return;
 260
 261     First += Len;
 262     // Keep skipping lines...
 263   }
 264 }
 265
 266 static const char *findLastNonSpace(const char *First, const char *Last) {
 267   assert(First <= Last);
 268   while (First != Last && isHorizontalWhitespace(Last[-1]))
 269     --Last;
 270   return Last;
 271 }
 272
 273 static const char *findFirstTrailingSpace(const char *First,
 274                                           const char *Last) {
 275   const char *LastNonSpace = findLastNonSpace(First, Last);
 276   if (Last == LastNonSpace)
 277     return Last;
 278   assert(isHorizontalWhitespace(LastNonSpace[0]));
 279   return LastNonSpace + 1;
 280 }
 281
 282 static void skipLineComment(const char *&First, const char *const End) {
 283   assert(First[0] == '/' && First[1] == '/');
 284   First += 2;
 285   skipToNewlineRaw(First, End);
 286 }
 287
 288 static void skipBlockComment(const char *&First, const char *const End) {
 289   assert(First[0] == '/' && First[1] == '*');
 290   if (End - First < 4) {
 291     First = End;
 292     return;
 293   }
 294   for (First += 3; First != End; ++First)
 295     if (First[-1] == '*' && First[0] == '/') {
 296       ++First;
 297       return;
 298     }
 299 }
 300
 301 /// \returns True if the current single quotation mark character is a C++ 14
 302 /// digit separator.
 303 static bool isQuoteCppDigitSeparator(const char *const Start,
 304                                      const char *const Cur,
 305                                      const char *const End) {
 306   assert(*Cur == '\'' && "expected quotation character");
 307   // skipLine called in places where we don't expect a valid number
 308   // body before `start` on the same line, so always return false at the start.
 309   if (Start == Cur)
 310     return false;
 311   // The previous character must be a valid PP number character.
 312   // Make sure that the L, u, U, u8 prefixes don't get marked as a
 313   // separator though.
 314   char Prev = *(Cur - 1);
 315   if (Prev == 'L' || Prev == 'U' || Prev == 'u')
 316     return false;
 317   if (Prev == '8' && (Cur - 1 != Start) && *(Cur - 2) == 'u')
 318     return false;
 319   if (!isPreprocessingNumberBody(Prev))
 320     return false;
 321   // The next character should be a valid identifier body character.
 322   return (Cur + 1) < End && isIdentifierBody(*(Cur + 1));
 323 }
 324
/// Advances \p First past the rest of the current logical line, including its
/// terminating newline.
///
/// String and character literals, raw strings, and comments are stepped over
/// as units so that their contents are not mistaken for comment openers. A
/// newline preceded by a backslash continues the logical line. If \p First
/// already points at a newline, only that newline is consumed.
static void skipLine(const char *&First, const char *const End) {
  for (;;) {
    assert(First <= End);
    if (First == End)
      return;

    // An empty (remaining) line: consume the newline and stop.
    if (isVerticalWhitespace(*First)) {
      skipNewline(First, End);
      return;
    }
    // Start of this physical line's scan; bounds the backwards looks done by
    // isQuoteCppDigitSeparator and isRawStringLiteral.
    const char *Start = First;
    while (First != End && !isVerticalWhitespace(*First)) {
      // Iterate over strings correctly to avoid comments and newlines.
      if (*First == '"' ||
          (*First == '\'' && !isQuoteCppDigitSeparator(Start, First, End))) {
        if (isRawStringLiteral(Start, First))
          skipRawString(First, End);
        else
          skipString(First, End);
        continue;
      }

      // Iterate over comments correctly.
      if (*First != '/' || End - First < 2) {
        ++First;
        continue;
      }

      if (First[1] == '/') {
        // "//...".
        skipLineComment(First, End);
        continue;
      }

      // A lone '/' that does not open a comment.
      if (First[1] != '*') {
        ++First;
        continue;
      }

      // "/*...*/".
      skipBlockComment(First, End);
    }
    if (First == End)
      return;

    // Skip over the newline.
    unsigned Len = skipNewline(First, End);
    if (!wasLineContinuation(First, Len)) // Continue past line-continuations.
      break;
  }
}
 376
 377 static void skipDirective(StringRef Name, const char *&First,
 378                           const char *const End) {
 379   if (llvm::StringSwitch<bool>(Name)
 380           .Case("warning", true)
 381           .Case("error", true)
 382           .Default(false))
 383     // Do not process quotes or comments.
 384     skipToNewlineRaw(First, End);
 385   else
 386     skipLine(First, End);
 387 }
 388
/// Copies the rest of the current logical line to the output.
///
/// Comments are dropped (a block comment is replaced by a single space),
/// trailing horizontal whitespace is trimmed, and physical lines joined by a
/// backslash-newline are emitted as one line. String and character literals
/// (and `<...>` when the current token is an #include) are stepped over as
/// units so that their contents are copied verbatim. No newline character is
/// written by this function.
void Minimizer::printToNewline(const char *&First, const char *const End) {
  // Each iteration of the outer loop handles one physical line.
  while (First != End && !isVerticalWhitespace(*First)) {
    // [First, Last) is the pending text that has not been emitted yet.
    const char *Last = First;
    do {
      // Note: every `continue` below jumps to the do-while condition.
      // Iterate over strings correctly to avoid comments and newlines.
      if (*Last == '"' || *Last == '\'' ||
          (*Last == '<' && top() == pp_include)) {
        if (LLVM_UNLIKELY(isRawStringLiteral(First, Last)))
          skipRawString(Last, End);
        else
          skipString(Last, End);
        continue;
      }
      if (*Last != '/' || End - Last < 2) {
        ++Last;
        continue; // Gather the rest up to print verbatim.
      }

      // A '/' that does not start a comment is ordinary text.
      if (Last[1] != '/' && Last[1] != '*') {
        ++Last;
        continue;
      }

      // Deal with "//..." and "/*...*/".
      // Flush the text before the comment, keeping at most one trailing space.
      append(First, findFirstTrailingSpace(First, Last));
      First = Last;

      if (Last[1] == '/') {
        // A line comment ends the line; First is left on the end-of-line.
        skipLineComment(First, End);
        return;
      }

      // A block comment separates tokens, so stand in a single space for it.
      put(' ');
      skipBlockComment(First, End);
      skipOverSpaces(First, End);
      Last = First;
    } while (Last != End && !isVerticalWhitespace(*Last));

    // Print out the string.
    const char *LastBeforeTrailingSpace = findLastNonSpace(First, Last);
    // Not a continuation line: emit the trimmed text, consume the newline,
    // and finish.
    if (Last == End || LastBeforeTrailingSpace == First ||
        LastBeforeTrailingSpace[-1] != '\\') {
      append(First, LastBeforeTrailingSpace);
      First = Last;
      skipNewline(First, End);
      return;
    }

    // Print up to the backslash, backing up over spaces. Preserve at least one
    // space, as the space matters when tokens are separated by a line
    // continuation.
    append(First, findFirstTrailingSpace(
                      First, LastBeforeTrailingSpace - 1));

    // Move on to the next physical line, ignoring its leading spaces.
    First = Last;
    skipNewline(First, End);
    skipOverSpaces(First, End);
  }
}
 448
 449 static void skipWhitespace(const char *&First, const char *const End) {
 450   for (;;) {
 451     assert(First <= End);
 452     skipOverSpaces(First, End);
 453
 454     if (End - First < 2)
 455       return;
 456
 457     if (First[0] == '\\' && isVerticalWhitespace(First[1])) {
 458       skipNewline(++First, End);
 459       continue;
 460     }
 461
 462     // Check for a non-comment character.
 463     if (First[0] != '/')
 464       return;
 465
 466     // "// ...".
 467     if (First[1] == '/') {
 468       skipLineComment(First, End);
 469       return;
 470     }
 471
 472     // Cannot be a comment.
 473     if (First[1] != '*')
 474       return;
 475
 476     // "/*...*/".
 477     skipBlockComment(First, End);
 478   }
 479 }
 480
 481 void Minimizer::printAdjacentModuleNameParts(const char *&First,
 482                                              const char *const End) {
 483   // Skip over parts of the body.
 484   const char *Last = First;
 485   do
 486     ++Last;
 487   while (Last != End && (isIdentifierBody(*Last) || *Last == '.'));
 488   append(First, Last);
 489   First = Last;
 490 }
 491
 492 bool Minimizer::printAtImportBody(const char *&First, const char *const End) {
 493   for (;;) {
 494     skipWhitespace(First, End);
 495     if (First == End)
 496       return true;
 497
 498     if (isVerticalWhitespace(*First)) {
 499       skipNewline(First, End);
 500       continue;
 501     }
 502
 503     // Found a semicolon.
 504     if (*First == ';') {
 505       put(*First++).put('\n');
 506       return false;
 507     }
 508
 509     // Don't handle macro expansions inside @import for now.
 510     if (!isIdentifierBody(*First) && *First != '.')
 511       return true;
 512
 513     printAdjacentModuleNameParts(First, End);
 514   }
 515 }
 516
 517 void Minimizer::printDirectiveBody(const char *&First, const char *const End) {
 518   skipWhitespace(First, End); // Skip initial whitespace.
 519   printToNewline(First, End);
 520   while (Out.back() == ' ')
 521     Out.pop_back();
 522   put('\n');
 523 }
 524
 525 LLVM_NODISCARD static const char *lexRawIdentifier(const char *First,
 526                                                    const char *const End) {
 527   assert(isIdentifierBody(*First) && "invalid identifer");
 528   const char *Last = First + 1;
 529   while (Last != End && isIdentifierBody(*Last))
 530     ++Last;
 531   return Last;
 532 }
 533
 534 LLVM_NODISCARD static const char *
 535 getIdentifierContinuation(const char *First, const char *const End) {
 536   if (End - First < 3 || First[0] != '\\' || !isVerticalWhitespace(First[1]))
 537     return nullptr;
 538
 539   ++First;
 540   skipNewline(First, End);
 541   if (First == End)
 542     return nullptr;
 543   return isIdentifierBody(First[0]) ? First : nullptr;
 544 }
 545
 546 Minimizer::IdInfo Minimizer::lexIdentifier(const char *First,
 547                                            const char *const End) {
 548   const char *Last = lexRawIdentifier(First, End);
 549   const char *Next = getIdentifierContinuation(Last, End);
 550   if (LLVM_LIKELY(!Next))
 551     return IdInfo{Last, StringRef(First, Last - First)};
 552
 553   // Slow path, where identifiers are split over lines.
 554   SmallVector<char, 64> Id(First, Last);
 555   while (Next) {
 556     Last = lexRawIdentifier(Next, End);
 557     Id.append(Next, Last);
 558     Next = getIdentifierContinuation(Last, End);
 559   }
 560   return IdInfo{
 561       Last,
 562       SplitIds.try_emplace(StringRef(Id.begin(), Id.size()), 0).first->first()};
 563 }
 564
 565 void Minimizer::printAdjacentMacroArgs(const char *&First,
 566                                        const char *const End) {
 567   // Skip over parts of the body.
 568   const char *Last = First;
 569   do
 570     ++Last;
 571   while (Last != End &&
 572          (isIdentifierBody(*Last) || *Last == '.' || *Last == ','));
 573   append(First, Last);
 574   First = Last;
 575 }
 576
 577 bool Minimizer::printMacroArgs(const char *&First, const char *const End) {
 578   assert(*First == '(');
 579   put(*First++);
 580   for (;;) {
 581     skipWhitespace(First, End);
 582     if (First == End)
 583       return true;
 584
 585     if (*First == ')') {
 586       put(*First++);
 587       return false;
 588     }
 589
 590     // This is intentionally fairly liberal.
 591     if (!(isIdentifierBody(*First) || *First == '.' || *First == ','))
 592       return true;
 593
 594     printAdjacentMacroArgs(First, End);
 595   }
 596 }
 597
 598 /// Looks for an identifier starting from Last.
 599 ///
 600 /// Updates "First" to just past the next identifier, if any.  Returns true iff
 601 /// the identifier matches "Id".
 602 bool Minimizer::isNextIdentifier(StringRef Id, const char *&First,
 603                                  const char *const End) {
 604   skipWhitespace(First, End);
 605   if (First == End || !isIdentifierHead(*First))
 606     return false;
 607
 608   IdInfo FoundId = lexIdentifier(First, End);
 609   First = FoundId.Last;
 610   return FoundId.Name == Id;
 611 }
 612
 613 bool Minimizer::lexAt(const char *&First, const char *const End) {
 614   // Handle "@import".
 615   const char *ImportLoc = First++;
 616   if (!isNextIdentifier("import", First, End)) {
 617     skipLine(First, End);
 618     return false;
 619   }
 620   makeToken(decl_at_import);
 621   append("@import ");
 622   if (printAtImportBody(First, End))
 623     return reportError(
 624         ImportLoc, diag::err_dep_source_minimizer_missing_sema_after_at_import);
 625   skipWhitespace(First, End);
 626   if (First == End)
 627     return false;
 628   if (!isVerticalWhitespace(*First))
 629     return reportError(
 630         ImportLoc, diag::err_dep_source_minimizer_unexpected_tokens_at_import);
 631   skipNewline(First, End);
 632   return false;
 633 }
 634
 635 bool Minimizer::lexModule(const char *&First, const char *const End) {
 636   IdInfo Id = lexIdentifier(First, End);
 637   First = Id.Last;
 638   bool Export = false;
 639   if (Id.Name == "export") {
 640     Export = true;
 641     skipWhitespace(First, End);
 642     if (!isIdentifierBody(*First)) {
 643       skipLine(First, End);
 644       return false;
 645     }
 646     Id = lexIdentifier(First, End);
 647     First = Id.Last;
 648   }
 649
 650   if (Id.Name != "module" && Id.Name != "import") {
 651     skipLine(First, End);
 652     return false;
 653   }
 654
 655   skipWhitespace(First, End);
 656
 657   // Ignore this as a module directive if the next character can't be part of
 658   // an import.
 659
 660   switch (*First) {
 661   case ':':
 662   case '<':
 663   case '"':
 664     break;
 665   default:
 666     if (!isIdentifierBody(*First)) {
 667       skipLine(First, End);
 668       return false;
 669     }
 670   }
 671
 672   if (Export) {
 673     makeToken(cxx_export_decl);
 674     append("export ");
 675   }
 676
 677   if (Id.Name == "module")
 678     makeToken(cxx_module_decl);
 679   else
 680     makeToken(cxx_import_decl);
 681   append(Id.Name);
 682   append(" ");
 683   printToNewline(First, End);
 684   append("\n");
 685   return false;
 686 }
 687
 688 bool Minimizer::lexDefine(const char *&First, const char *const End) {
 689   makeToken(pp_define);
 690   append("#define ");
 691   skipWhitespace(First, End);
 692
 693   if (!isIdentifierHead(*First))
 694     return reportError(First, diag::err_pp_macro_not_identifier);
 695
 696   IdInfo Id = lexIdentifier(First, End);
 697   const char *Last = Id.Last;
 698   append(Id.Name);
 699   if (Last == End)
 700     return false;
 701   if (*Last == '(') {
 702     size_t Size = Out.size();
 703     if (printMacroArgs(Last, End)) {
 704       // Be robust to bad macro arguments, since they can show up in disabled
 705       // code.
 706       Out.resize(Size);
 707       append("(/* invalid */\n");
 708       skipLine(Last, End);
 709       return false;
 710     }
 711   }
 712   skipWhitespace(Last, End);
 713   if (Last == End)
 714     return false;
 715   if (!isVerticalWhitespace(*Last))
 716     put(' ');
 717   printDirectiveBody(Last, End);
 718   First = Last;
 719   return false;
 720 }
 721
 722 bool Minimizer::lexPragma(const char *&First, const char *const End) {
 723   // #pragma.
 724   skipWhitespace(First, End);
 725   if (First == End || !isIdentifierHead(*First))
 726     return false;
 727
 728   IdInfo FoundId = lexIdentifier(First, End);
 729   First = FoundId.Last;
 730   if (FoundId.Name == "once") {
 731     // #pragma once
 732     skipLine(First, End);
 733     makeToken(pp_pragma_once);
 734     append("#pragma once\n");
 735     return false;
 736   }
 737
 738   if (FoundId.Name != "clang") {
 739     skipLine(First, End);
 740     return false;
 741   }
 742
 743   // #pragma clang.
 744   if (!isNextIdentifier("module", First, End)) {
 745     skipLine(First, End);
 746     return false;
 747   }
 748
 749   // #pragma clang module.
 750   if (!isNextIdentifier("import", First, End)) {
 751     skipLine(First, End);
 752     return false;
 753   }
 754
 755   // #pragma clang module import.
 756   makeToken(pp_pragma_import);
 757   append("#pragma clang module import ");
 758   printDirectiveBody(First, End);
 759   return false;
 760 }
 761
 762 bool Minimizer::lexEndif(const char *&First, const char *const End) {
 763   // Strip out "#else" if it's empty.
 764   if (top() == pp_else)
 765     popToken();
 766
 767   // If "#ifdef" is empty, strip it and skip the "#endif".
 768   //
 769   // FIXME: Once/if Clang starts disallowing __has_include in macro expansions,
 770   // we can skip empty `#if` and `#elif` blocks as well after scanning for a
 771   // literal __has_include in the condition.  Even without that rule we could
 772   // drop the tokens if we scan for identifiers in the condition and find none.
 773   if (top() == pp_ifdef || top() == pp_ifndef) {
 774     popToken();
 775     skipLine(First, End);
 776     return false;
 777   }
 778
 779   return lexDefault(pp_endif, "endif", First, End);
 780 }
 781
 782 bool Minimizer::lexDefault(TokenKind Kind, StringRef Directive,
 783                            const char *&First, const char *const End) {
 784   makeToken(Kind);
 785   put('#').append(Directive).put(' ');
 786   printDirectiveBody(First, End);
 787   return false;
 788 }
 789
 790 static bool isStartOfRelevantLine(char First) {
 791   switch (First) {
 792   case '#':
 793   case '@':
 794   case 'i':
 795   case 'e':
 796   case 'm':
 797     return true;
 798   }
 799   return false;
 800 }
 801
 802 bool Minimizer::lexPPLine(const char *&First, const char *const End) {
 803   assert(First != End);
 804
 805   skipWhitespace(First, End);
 806   assert(First <= End);
 807   if (First == End)
 808     return false;
 809
 810   if (!isStartOfRelevantLine(*First)) {
 811     skipLine(First, End);
 812     assert(First <= End);
 813     return false;
 814   }
 815
 816   // Handle "@import".
 817   if (*First == '@')
 818     return lexAt(First, End);
 819
 820   if (*First == 'i' || *First == 'e' || *First == 'm')
 821     return lexModule(First, End);
 822
 823   // Handle preprocessing directives.
 824   ++First; // Skip over '#'.
 825   skipWhitespace(First, End);
 826
 827   if (First == End)
 828     return reportError(First, diag::err_pp_expected_eol);
 829
 830   if (!isIdentifierHead(*First)) {
 831     skipLine(First, End);
 832     return false;
 833   }
 834
 835   // Figure out the token.
 836   IdInfo Id = lexIdentifier(First, End);
 837   First = Id.Last;
 838   auto Kind = llvm::StringSwitch<TokenKind>(Id.Name)
 839                   .Case("include", pp_include)
 840                   .Case("__include_macros", pp___include_macros)
 841                   .Case("define", pp_define)
 842                   .Case("undef", pp_undef)
 843                   .Case("import", pp_import)
 844                   .Case("include_next", pp_include_next)
 845                   .Case("if", pp_if)
 846                   .Case("ifdef", pp_ifdef)
 847                   .Case("ifndef", pp_ifndef)
 848                   .Case("elif", pp_elif)
 849                   .Case("else", pp_else)
 850                   .Case("endif", pp_endif)
 851                   .Case("pragma", pp_pragma_import)
 852                   .Default(pp_none);
 853   if (Kind == pp_none) {
 854     skipDirective(Id.Name, First, End);
 855     return false;
 856   }
 857
 858   if (Kind == pp_endif)
 859     return lexEndif(First, End);
 860
 861   if (Kind == pp_define)
 862     return lexDefine(First, End);
 863
 864   if (Kind == pp_pragma_import)
 865     return lexPragma(First, End);
 866
 867   // Everything else.
 868   return lexDefault(Kind, Id.Name, First, End);
 869 }
 870
 871 static void skipUTF8ByteOrderMark(const char *&First, const char *const End) {
 872   if ((End - First) >= 3 && First[0] == '\xef' && First[1] == '\xbb' &&
 873       First[2] == '\xbf')
 874     First += 3;
 875 }
 876
 877 bool Minimizer::minimizeImpl(const char *First, const char *const End) {
 878   skipUTF8ByteOrderMark(First, End);
 879   while (First != End)
 880     if (lexPPLine(First, End))
 881       return true;
 882   return false;
 883 }
 884
 885 bool Minimizer::minimize() {
 886   bool Error = minimizeImpl(Input.begin(), Input.end());
 887
 888   if (!Error) {
 889     // Add a trailing newline and an EOF on success.
 890     if (!Out.empty() && Out.back() != '\n')
 891       Out.push_back('\n');
 892     makeToken(pp_eof);
 893   }
 894
 895   // Null-terminate the output. This way the memory buffer that's passed to
 896   // Clang will not have to worry about the terminating '\0'.
 897   Out.push_back(0);
 898   Out.pop_back();
 899   return Error;
 900 }
 901
 902 bool clang::minimize_source_to_dependency_directives::computeSkippedRanges(
 903     ArrayRef<Token> Input, llvm::SmallVectorImpl<SkippedRange> &Range) {
 904   struct Directive {
 905     enum DirectiveKind {
 906       If,  // if/ifdef/ifndef
 907       Else // elif,else
 908     };
 909     int Offset;
 910     DirectiveKind Kind;
 911   };
 912   llvm::SmallVector<Directive, 32> Offsets;
 913   for (const Token &T : Input) {
 914     switch (T.K) {
 915     case pp_if:
 916     case pp_ifdef:
 917     case pp_ifndef:
 918       Offsets.push_back({T.Offset, Directive::If});
 919       break;
 920
 921     case pp_elif:
 922     case pp_else: {
 923       if (Offsets.empty())
 924         return true;
 925       int PreviousOffset = Offsets.back().Offset;
 926       Range.push_back({PreviousOffset, T.Offset - PreviousOffset});
 927       Offsets.push_back({T.Offset, Directive::Else});
 928       break;
 929     }
 930
 931     case pp_endif: {
 932       if (Offsets.empty())
 933         return true;
 934       int PreviousOffset = Offsets.back().Offset;
 935       Range.push_back({PreviousOffset, T.Offset - PreviousOffset});
 936       do {
 937         Directive::DirectiveKind Kind = Offsets.pop_back_val().Kind;
 938         if (Kind == Directive::If)
 939           break;
 940       } while (!Offsets.empty());
 941       break;
 942     }
 943     default:
 944       break;
 945     }
 946   }
 947   return false;
 948 }
 949
 950 bool clang::minimizeSourceToDependencyDirectives(
 951     StringRef Input, SmallVectorImpl<char> &Output,
 952     SmallVectorImpl<Token> &Tokens, DiagnosticsEngine *Diags,
 953     SourceLocation InputSourceLoc) {
 954   Output.clear();
 955   Tokens.clear();
 956   return Minimizer(Output, Tokens, Input, Diags, InputSourceLoc).minimize();
 957 }