contrib/llvm-project/clang/lib/AST/RawCommentList.cpp

   1 //===--- RawCommentList.cpp - Processing raw comments -----------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "clang/AST/RawCommentList.h"
  10 #include "clang/AST/ASTContext.h"
  11 #include "clang/AST/Comment.h"
  12 #include "clang/AST/CommentBriefParser.h"
  13 #include "clang/AST/CommentCommandTraits.h"
  14 #include "clang/AST/CommentLexer.h"
  15 #include "clang/AST/CommentParser.h"
  16 #include "clang/AST/CommentSema.h"
  17 #include "clang/Basic/CharInfo.h"
  18 #include "llvm/ADT/STLExtras.h"
  19
  20 using namespace clang;
  21
  22 namespace {
  23 /// Get comment kind and bool describing if it is a trailing comment.
  24 std::pair<RawComment::CommentKind, bool> getCommentKind(StringRef Comment,
  25                                                         bool ParseAllComments) {
  26   const size_t MinCommentLength = ParseAllComments ? 2 : 3;
  27   if ((Comment.size() < MinCommentLength) || Comment[0] != '/')
  28     return std::make_pair(RawComment::RCK_Invalid, false);
  29
  30   RawComment::CommentKind K;
  31   if (Comment[1] == '/') {
  32     if (Comment.size() < 3)
  33       return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
  34
  35     if (Comment[2] == '/')
  36       K = RawComment::RCK_BCPLSlash;
  37     else if (Comment[2] == '!')
  38       K = RawComment::RCK_BCPLExcl;
  39     else
  40       return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
  41   } else {
  42     assert(Comment.size() >= 4);
  43
  44     // Comment lexer does not understand escapes in comment markers, so pretend
  45     // that this is not a comment.
  46     if (Comment[1] != '*' ||
  47         Comment[Comment.size() - 2] != '*' ||
  48         Comment[Comment.size() - 1] != '/')
  49       return std::make_pair(RawComment::RCK_Invalid, false);
  50
  51     if (Comment[2] == '*')
  52       K = RawComment::RCK_JavaDoc;
  53     else if (Comment[2] == '!')
  54       K = RawComment::RCK_Qt;
  55     else
  56       return std::make_pair(RawComment::RCK_OrdinaryC, false);
  57   }
  58   const bool TrailingComment = (Comment.size() > 3) && (Comment[3] == '<');
  59   return std::make_pair(K, TrailingComment);
  60 }
  61
  62 bool mergedCommentIsTrailingComment(StringRef Comment) {
  63   return (Comment.size() > 3) && (Comment[3] == '<');
  64 }
  65
  66 /// Returns true if R1 and R2 both have valid locations that start on the same
  67 /// column.
  68 bool commentsStartOnSameColumn(const SourceManager &SM, const RawComment &R1,
  69                                const RawComment &R2) {
  70   SourceLocation L1 = R1.getBeginLoc();
  71   SourceLocation L2 = R2.getBeginLoc();
  72   bool Invalid = false;
  73   unsigned C1 = SM.getPresumedColumnNumber(L1, &Invalid);
  74   if (!Invalid) {
  75     unsigned C2 = SM.getPresumedColumnNumber(L2, &Invalid);
  76     return !Invalid && (C1 == C2);
  77   }
  78   return false;
  79 }
  80 } // unnamed namespace
  81
  82 /// Determines whether there is only whitespace in `Buffer` between `P`
  83 /// and the previous line.
  84 /// \param Buffer The buffer to search in.
  85 /// \param P The offset from the beginning of `Buffer` to start from.
  86 /// \return true if all of the characters in `Buffer` ranging from the closest
  87 /// line-ending character before `P` (or the beginning of `Buffer`) to `P - 1`
  88 /// are whitespace.
  89 static bool onlyWhitespaceOnLineBefore(const char *Buffer, unsigned P) {
  90   // Search backwards until we see linefeed or carriage return.
  91   for (unsigned I = P; I != 0; --I) {
  92     char C = Buffer[I - 1];
  93     if (isVerticalWhitespace(C))
  94       return true;
  95     if (!isHorizontalWhitespace(C))
  96       return false;
  97   }
  98   // We hit the beginning of the buffer.
  99   return true;
 100 }
 101
 102 /// Returns whether `K` is an ordinary comment kind.
 103 static bool isOrdinaryKind(RawComment::CommentKind K) {
 104   return (K == RawComment::RCK_OrdinaryBCPL) ||
 105          (K == RawComment::RCK_OrdinaryC);
 106 }
 107
 108 RawComment::RawComment(const SourceManager &SourceMgr, SourceRange SR,
 109                        const CommentOptions &CommentOpts, bool Merged) :
 110     Range(SR), RawTextValid(false), BriefTextValid(false),
 111     IsAttached(false), IsTrailingComment(false),
 112     IsAlmostTrailingComment(false) {
 113   // Extract raw comment text, if possible.
 114   if (SR.getBegin() == SR.getEnd() || getRawText(SourceMgr).empty()) {
 115     Kind = RCK_Invalid;
 116     return;
 117   }
 118
 119   // Guess comment kind.
 120   std::pair<CommentKind, bool> K =
 121       getCommentKind(RawText, CommentOpts.ParseAllComments);
 122
 123   // Guess whether an ordinary comment is trailing.
 124   if (CommentOpts.ParseAllComments && isOrdinaryKind(K.first)) {
 125     FileID BeginFileID;
 126     unsigned BeginOffset;
 127     std::tie(BeginFileID, BeginOffset) =
 128         SourceMgr.getDecomposedLoc(Range.getBegin());
 129     if (BeginOffset != 0) {
 130       bool Invalid = false;
 131       const char *Buffer =
 132           SourceMgr.getBufferData(BeginFileID, &Invalid).data();
 133       IsTrailingComment |=
 134           (!Invalid && !onlyWhitespaceOnLineBefore(Buffer, BeginOffset));
 135     }
 136   }
 137
 138   if (!Merged) {
 139     Kind = K.first;
 140     IsTrailingComment |= K.second;
 141
 142     IsAlmostTrailingComment = RawText.startswith("//<") ||
 143                                  RawText.startswith("/*<");
 144   } else {
 145     Kind = RCK_Merged;
 146     IsTrailingComment =
 147         IsTrailingComment || mergedCommentIsTrailingComment(RawText);
 148   }
 149 }
 150
 151 StringRef RawComment::getRawTextSlow(const SourceManager &SourceMgr) const {
 152   FileID BeginFileID;
 153   FileID EndFileID;
 154   unsigned BeginOffset;
 155   unsigned EndOffset;
 156
 157   std::tie(BeginFileID, BeginOffset) =
 158       SourceMgr.getDecomposedLoc(Range.getBegin());
 159   std::tie(EndFileID, EndOffset) = SourceMgr.getDecomposedLoc(Range.getEnd());
 160
 161   const unsigned Length = EndOffset - BeginOffset;
 162   if (Length < 2)
 163     return StringRef();
 164
 165   // The comment can't begin in one file and end in another.
 166   assert(BeginFileID == EndFileID);
 167
 168   bool Invalid = false;
 169   const char *BufferStart = SourceMgr.getBufferData(BeginFileID,
 170                                                     &Invalid).data();
 171   if (Invalid)
 172     return StringRef();
 173
 174   return StringRef(BufferStart + BeginOffset, Length);
 175 }
 176
 177 const char *RawComment::extractBriefText(const ASTContext &Context) const {
 178   // Lazily initialize RawText using the accessor before using it.
 179   (void)getRawText(Context.getSourceManager());
 180
 181   // Since we will be copying the resulting text, all allocations made during
 182   // parsing are garbage after resulting string is formed.  Thus we can use
 183   // a separate allocator for all temporary stuff.
 184   llvm::BumpPtrAllocator Allocator;
 185
 186   comments::Lexer L(Allocator, Context.getDiagnostics(),
 187                     Context.getCommentCommandTraits(),
 188                     Range.getBegin(),
 189                     RawText.begin(), RawText.end());
 190   comments::BriefParser P(L, Context.getCommentCommandTraits());
 191
 192   const std::string Result = P.Parse();
 193   const unsigned BriefTextLength = Result.size();
 194   char *BriefTextPtr = new (Context) char[BriefTextLength + 1];
 195   memcpy(BriefTextPtr, Result.c_str(), BriefTextLength + 1);
 196   BriefText = BriefTextPtr;
 197   BriefTextValid = true;
 198
 199   return BriefTextPtr;
 200 }
 201
 202 comments::FullComment *RawComment::parse(const ASTContext &Context,
 203                                          const Preprocessor *PP,
 204                                          const Decl *D) const {
 205   // Lazily initialize RawText using the accessor before using it.
 206   (void)getRawText(Context.getSourceManager());
 207
 208   comments::Lexer L(Context.getAllocator(), Context.getDiagnostics(),
 209                     Context.getCommentCommandTraits(),
 210                     getSourceRange().getBegin(),
 211                     RawText.begin(), RawText.end());
 212   comments::Sema S(Context.getAllocator(), Context.getSourceManager(),
 213                    Context.getDiagnostics(),
 214                    Context.getCommentCommandTraits(),
 215                    PP);
 216   S.setDecl(D);
 217   comments::Parser P(L, S, Context.getAllocator(), Context.getSourceManager(),
 218                      Context.getDiagnostics(),
 219                      Context.getCommentCommandTraits());
 220
 221   return P.parseFullComment();
 222 }
 223
 224 static bool onlyWhitespaceBetween(SourceManager &SM,
 225                                   SourceLocation Loc1, SourceLocation Loc2,
 226                                   unsigned MaxNewlinesAllowed) {
 227   std::pair<FileID, unsigned> Loc1Info = SM.getDecomposedLoc(Loc1);
 228   std::pair<FileID, unsigned> Loc2Info = SM.getDecomposedLoc(Loc2);
 229
 230   // Question does not make sense if locations are in different files.
 231   if (Loc1Info.first != Loc2Info.first)
 232     return false;
 233
 234   bool Invalid = false;
 235   const char *Buffer = SM.getBufferData(Loc1Info.first, &Invalid).data();
 236   if (Invalid)
 237     return false;
 238
 239   unsigned NumNewlines = 0;
 240   assert(Loc1Info.second <= Loc2Info.second && "Loc1 after Loc2!");
 241   // Look for non-whitespace characters and remember any newlines seen.
 242   for (unsigned I = Loc1Info.second; I != Loc2Info.second; ++I) {
 243     switch (Buffer[I]) {
 244     default:
 245       return false;
 246     case ' ':
 247     case '\t':
 248     case '\f':
 249     case '\v':
 250       break;
 251     case '\r':
 252     case '\n':
 253       ++NumNewlines;
 254
 255       // Check if we have found more than the maximum allowed number of
 256       // newlines.
 257       if (NumNewlines > MaxNewlinesAllowed)
 258         return false;
 259
 260       // Collapse \r\n and \n\r into a single newline.
 261       if (I + 1 != Loc2Info.second &&
 262           (Buffer[I + 1] == '\n' || Buffer[I + 1] == '\r') &&
 263           Buffer[I] != Buffer[I + 1])
 264         ++I;
 265       break;
 266     }
 267   }
 268
 269   return true;
 270 }
 271
 272 void RawCommentList::addComment(const RawComment &RC,
 273                                 const CommentOptions &CommentOpts,
 274                                 llvm::BumpPtrAllocator &Allocator) {
 275   if (RC.isInvalid())
 276     return;
 277
 278   // Check if the comments are not in source order.
 279   while (!Comments.empty() &&
 280          !SourceMgr.isBeforeInTranslationUnit(Comments.back()->getBeginLoc(),
 281                                               RC.getBeginLoc())) {
 282     // If they are, just pop a few last comments that don't fit.
 283     // This happens if an \#include directive contains comments.
 284     Comments.pop_back();
 285   }
 286
 287   // Ordinary comments are not interesting for us.
 288   if (RC.isOrdinary() && !CommentOpts.ParseAllComments)
 289     return;
 290
 291   // If this is the first Doxygen comment, save it (because there isn't
 292   // anything to merge it with).
 293   if (Comments.empty()) {
 294     Comments.push_back(new (Allocator) RawComment(RC));
 295     return;
 296   }
 297
 298   const RawComment &C1 = *Comments.back();
 299   const RawComment &C2 = RC;
 300
 301   // Merge comments only if there is only whitespace between them.
 302   // Can't merge trailing and non-trailing comments unless the second is
 303   // non-trailing ordinary in the same column, as in the case:
 304   //   int x; // documents x
 305   //          // more text
 306   // versus:
 307   //   int x; // documents x
 308   //   int y; // documents y
 309   // or:
 310   //   int x; // documents x
 311   //   // documents y
 312   //   int y;
 313   // Merge comments if they are on same or consecutive lines.
 314   if ((C1.isTrailingComment() == C2.isTrailingComment() ||
 315        (C1.isTrailingComment() && !C2.isTrailingComment() &&
 316         isOrdinaryKind(C2.getKind()) &&
 317         commentsStartOnSameColumn(SourceMgr, C1, C2))) &&
 318       onlyWhitespaceBetween(SourceMgr, C1.getEndLoc(), C2.getBeginLoc(),
 319                             /*MaxNewlinesAllowed=*/1)) {
 320     SourceRange MergedRange(C1.getBeginLoc(), C2.getEndLoc());
 321     *Comments.back() = RawComment(SourceMgr, MergedRange, CommentOpts, true);
 322   } else {
 323     Comments.push_back(new (Allocator) RawComment(RC));
 324   }
 325 }
 326
 327 void RawCommentList::addDeserializedComments(ArrayRef<RawComment *> DeserializedComments) {
 328   std::vector<RawComment *> MergedComments;
 329   MergedComments.reserve(Comments.size() + DeserializedComments.size());
 330
 331   std::merge(Comments.begin(), Comments.end(),
 332              DeserializedComments.begin(), DeserializedComments.end(),
 333              std::back_inserter(MergedComments),
 334              BeforeThanCompare<RawComment>(SourceMgr));
 335   std::swap(Comments, MergedComments);
 336 }
 337
 338 std::string RawComment::getFormattedText(const SourceManager &SourceMgr,
 339                                          DiagnosticsEngine &Diags) const {
 340   llvm::StringRef CommentText = getRawText(SourceMgr);
 341   if (CommentText.empty())
 342     return "";
 343
 344   llvm::BumpPtrAllocator Allocator;
 345   // We do not parse any commands, so CommentOptions are ignored by
 346   // comments::Lexer. Therefore, we just use default-constructed options.
 347   CommentOptions DefOpts;
 348   comments::CommandTraits EmptyTraits(Allocator, DefOpts);
 349   comments::Lexer L(Allocator, Diags, EmptyTraits, getSourceRange().getBegin(),
 350                     CommentText.begin(), CommentText.end(),
 351                     /*ParseCommands=*/false);
 352
 353   std::string Result;
 354   // A column number of the first non-whitespace token in the comment text.
 355   // We skip whitespace up to this column, but keep the whitespace after this
 356   // column. IndentColumn is calculated when lexing the first line and reused
 357   // for the rest of lines.
 358   unsigned IndentColumn = 0;
 359
 360   // Processes one line of the comment and adds it to the result.
 361   // Handles skipping the indent at the start of the line.
 362   // Returns false when eof is reached and true otherwise.
 363   auto LexLine = [&](bool IsFirstLine) -> bool {
 364     comments::Token Tok;
 365     // Lex the first token on the line. We handle it separately, because we to
 366     // fix up its indentation.
 367     L.lex(Tok);
 368     if (Tok.is(comments::tok::eof))
 369       return false;
 370     if (Tok.is(comments::tok::newline)) {
 371       Result += "\n";
 372       return true;
 373     }
 374     llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
 375     bool LocInvalid = false;
 376     unsigned TokColumn =
 377         SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
 378     assert(!LocInvalid && "getFormattedText for invalid location");
 379
 380     // Amount of leading whitespace in TokText.
 381     size_t WhitespaceLen = TokText.find_first_not_of(" \t");
 382     if (WhitespaceLen == StringRef::npos)
 383       WhitespaceLen = TokText.size();
 384     // Remember the amount of whitespace we skipped in the first line to remove
 385     // indent up to that column in the following lines.
 386     if (IsFirstLine)
 387       IndentColumn = TokColumn + WhitespaceLen;
 388
 389     // Amount of leading whitespace we actually want to skip.
 390     // For the first line we skip all the whitespace.
 391     // For the rest of the lines, we skip whitespace up to IndentColumn.
 392     unsigned SkipLen =
 393         IsFirstLine
 394             ? WhitespaceLen
 395             : std::min<size_t>(
 396                   WhitespaceLen,
 397                   std::max<int>(static_cast<int>(IndentColumn) - TokColumn, 0));
 398     llvm::StringRef Trimmed = TokText.drop_front(SkipLen);
 399     Result += Trimmed;
 400     // Lex all tokens in the rest of the line.
 401     for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
 402       if (Tok.is(comments::tok::newline)) {
 403         Result += "\n";
 404         return true;
 405       }
 406       Result += L.getSpelling(Tok, SourceMgr);
 407     }
 408     // We've reached the end of file token.
 409     return false;
 410   };
 411
 412   auto DropTrailingNewLines = [](std::string &Str) {
 413     while (Str.back() == '\n')
 414       Str.pop_back();
 415   };
 416
 417   // Process first line separately to remember indent for the following lines.
 418   if (!LexLine(/*IsFirstLine=*/true)) {
 419     DropTrailingNewLines(Result);
 420     return Result;
 421   }
 422   // Process the rest of the lines.
 423   while (LexLine(/*IsFirstLine=*/false))
 424     ;
 425   DropTrailingNewLines(Result);
 426   return Result;
 427 }