1 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// This file defines classes for searching and anlyzing source code clones.
13 //===----------------------------------------------------------------------===//
15 #ifndef LLVM_CLANG_AST_CLONEDETECTION_H
16 #define LLVM_CLANG_AST_CLONEDETECTION_H
18 #include "clang/AST/DeclTemplate.h"
19 #include "clang/AST/StmtVisitor.h"
20 #include "clang/Basic/SourceLocation.h"
21 #include "llvm/ADT/SmallVector.h"
22 #include "llvm/ADT/StringRef.h"
23 #include "llvm/Support/Regex.h"
34 namespace clone_detection {
36 /// Returns a string that represents all macro expansions that expanded into the
37 /// given SourceLocation.
39 /// If 'getMacroStack(A) == getMacroStack(B)' is true, then the SourceLocations
40 /// A and B are expanded from the same macros in the same order.
41 std::string getMacroStack(SourceLocation Loc, ASTContext &Context);
43 /// Collects the data of a single Stmt.
45 /// This class defines what a code clone is: If it collects for two statements
46 /// the same data, then those two statements are considered to be clones of each
49 /// All collected data is forwarded to the given data consumer of the type T.
50 /// The data consumer class needs to provide a member method with the signature:
51 /// update(StringRef Str)
53 class StmtDataCollector : public ConstStmtVisitor<StmtDataCollector<T>> {
56 /// The data sink to which all data is forwarded.
60 /// Collects data of the given Stmt.
61 /// \param S The given statement.
62 /// \param Context The ASTContext of S.
63 /// \param DataConsumer The data sink to which all data is forwarded.
64 StmtDataCollector(const Stmt *S, ASTContext &Context, T &DataConsumer)
65 : Context(Context), DataConsumer(DataConsumer) {
69 typedef unsigned DataPiece;
71 // Below are utility methods for appending different data to the vector.
73 void addData(DataPiece Integer) {
75 StringRef(reinterpret_cast<char *>(&Integer), sizeof(Integer)));
78 void addData(llvm::StringRef Str) { DataConsumer.update(Str); }
80 void addData(const QualType &QT) { addData(QT.getAsString()); }
82 // The functions below collect the class specific data of each Stmt subclass.
84 // Utility macro for defining a visit method for a given class. This method
85 // calls back to the ConstStmtVisitor to visit all parent classes.
86 #define DEF_ADD_DATA(CLASS, CODE) \
87 void Visit##CLASS(const CLASS *S) { \
89 ConstStmtVisitor<StmtDataCollector>::Visit##CLASS(S); \
93 addData(S->getStmtClass());
94 // This ensures that macro generated code isn't identical to macro-generated
96 addData(getMacroStack(S->getLocStart(), Context));
97 addData(getMacroStack(S->getLocEnd(), Context));
99 DEF_ADD_DATA(Expr, { addData(S->getType()); })
101 //--- Builtin functionality ----------------------------------------------//
102 DEF_ADD_DATA(ArrayTypeTraitExpr, { addData(S->getTrait()); })
103 DEF_ADD_DATA(ExpressionTraitExpr, { addData(S->getTrait()); })
104 DEF_ADD_DATA(PredefinedExpr, { addData(S->getIdentType()); })
105 DEF_ADD_DATA(TypeTraitExpr, {
106 addData(S->getTrait());
107 for (unsigned i = 0; i < S->getNumArgs(); ++i)
108 addData(S->getArg(i)->getType());
111 //--- Calls --------------------------------------------------------------//
112 DEF_ADD_DATA(CallExpr, {
113 // Function pointers don't have a callee and we just skip hashing it.
114 if (const FunctionDecl *D = S->getDirectCallee()) {
115 // If the function is a template specialization, we also need to handle
116 // the template arguments as they are not included in the qualified name.
117 if (auto Args = D->getTemplateSpecializationArgs()) {
118 std::string ArgString;
120 // Print all template arguments into ArgString
121 llvm::raw_string_ostream OS(ArgString);
122 for (unsigned i = 0; i < Args->size(); ++i) {
123 Args->get(i).print(Context.getLangOpts(), OS);
124 // Add a padding character so that 'foo<X, XX>()' != 'foo<XX, X>()'.
131 addData(D->getQualifiedNameAsString());
135 //--- Exceptions ---------------------------------------------------------//
136 DEF_ADD_DATA(CXXCatchStmt, { addData(S->getCaughtType()); })
138 //--- C++ OOP Stmts ------------------------------------------------------//
139 DEF_ADD_DATA(CXXDeleteExpr, {
140 addData(S->isArrayFormAsWritten());
141 addData(S->isGlobalDelete());
144 //--- Casts --------------------------------------------------------------//
145 DEF_ADD_DATA(ObjCBridgedCastExpr, { addData(S->getBridgeKind()); })
147 //--- Miscellaneous Exprs ------------------------------------------------//
148 DEF_ADD_DATA(BinaryOperator, { addData(S->getOpcode()); })
149 DEF_ADD_DATA(UnaryOperator, { addData(S->getOpcode()); })
151 //--- Control flow -------------------------------------------------------//
152 DEF_ADD_DATA(GotoStmt, { addData(S->getLabel()->getName()); })
153 DEF_ADD_DATA(IndirectGotoStmt, {
154 if (S->getConstantTarget())
155 addData(S->getConstantTarget()->getName());
157 DEF_ADD_DATA(LabelStmt, { addData(S->getDecl()->getName()); })
158 DEF_ADD_DATA(MSDependentExistsStmt, { addData(S->isIfExists()); })
159 DEF_ADD_DATA(AddrLabelExpr, { addData(S->getLabel()->getName()); })
161 //--- Objective-C --------------------------------------------------------//
162 DEF_ADD_DATA(ObjCIndirectCopyRestoreExpr, { addData(S->shouldCopy()); })
163 DEF_ADD_DATA(ObjCPropertyRefExpr, {
164 addData(S->isSuperReceiver());
165 addData(S->isImplicitProperty());
167 DEF_ADD_DATA(ObjCAtCatchStmt, { addData(S->hasEllipsis()); })
169 //--- Miscellaneous Stmts ------------------------------------------------//
170 DEF_ADD_DATA(CXXFoldExpr, {
171 addData(S->isRightFold());
172 addData(S->getOperator());
174 DEF_ADD_DATA(GenericSelectionExpr, {
175 for (unsigned i = 0; i < S->getNumAssocs(); ++i) {
176 addData(S->getAssocType(i));
179 DEF_ADD_DATA(LambdaExpr, {
180 for (const LambdaCapture &C : S->captures()) {
181 addData(C.isPackExpansion());
182 addData(C.getCaptureKind());
183 if (C.capturesVariable())
184 addData(C.getCapturedVar()->getType());
186 addData(S->isGenericLambda());
187 addData(S->isMutable());
189 DEF_ADD_DATA(DeclStmt, {
190 auto numDecls = std::distance(S->decl_begin(), S->decl_end());
191 addData(static_cast<DataPiece>(numDecls));
192 for (const Decl *D : S->decls()) {
193 if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
194 addData(VD->getType());
198 DEF_ADD_DATA(AsmStmt, {
199 addData(S->isSimple());
200 addData(S->isVolatile());
201 addData(S->generateAsmString(Context));
202 for (unsigned i = 0; i < S->getNumInputs(); ++i) {
203 addData(S->getInputConstraint(i));
205 for (unsigned i = 0; i < S->getNumOutputs(); ++i) {
206 addData(S->getOutputConstraint(i));
208 for (unsigned i = 0; i < S->getNumClobbers(); ++i) {
209 addData(S->getClobber(i));
212 DEF_ADD_DATA(AttributedStmt, {
213 for (const Attr *A : S->getAttrs()) {
214 addData(std::string(A->getSpelling()));
218 } // namespace clone_detection
220 /// Identifies a list of statements.
222 /// Can either identify a single arbitrary Stmt object, a continuous sequence of
223 /// child statements inside a CompoundStmt or no statements at all.
225 /// If this object identifies a sequence of statements inside a CompoundStmt,
226 /// S points to this CompoundStmt. If this object only identifies a single
227 /// Stmt, then S is a pointer to this Stmt.
230 /// The declaration that contains the statements.
233 /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
234 /// instance is representing the CompoundStmt children inside the array
235 /// [StartIndex, EndIndex).
240 /// Constructs a StmtSequence holding multiple statements.
242 /// The resulting StmtSequence identifies a continuous sequence of statements
243 /// in the body of the given CompoundStmt. Which statements of the body should
244 /// be identified needs to be specified by providing a start and end index
245 /// that describe a non-empty sub-array in the body of the given CompoundStmt.
247 /// \param Stmt A CompoundStmt that contains all statements in its body.
248 /// \param D The Decl containing this Stmt.
249 /// \param StartIndex The inclusive start index in the children array of
251 /// \param EndIndex The exclusive end index in the children array of \p Stmt.
252 StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
255 /// Constructs a StmtSequence holding a single statement.
257 /// \param Stmt An arbitrary Stmt.
258 /// \param D The Decl containing this Stmt.
259 StmtSequence(const Stmt *Stmt, const Decl *D);
261 /// Constructs an empty StmtSequence.
264 typedef const Stmt *const *iterator;
266 /// Returns an iterator pointing to the first statement in this sequence.
267 iterator begin() const;
269 /// Returns an iterator pointing behind the last statement in this sequence.
270 iterator end() const;
272 /// Returns the first statement in this sequence.
274 /// This method should only be called on a non-empty StmtSequence object.
275 const Stmt *front() const {
280 /// Returns the last statement in this sequence.
282 /// This method should only be called on a non-empty StmtSequence object.
283 const Stmt *back() const {
285 return begin()[size() - 1];
288 /// Returns the number of statements this object holds.
289 unsigned size() const {
291 return EndIndex - StartIndex;
297 /// Returns true if and only if this StmtSequence contains no statements.
298 bool empty() const { return size() == 0; }
300 /// Returns the related ASTContext for the stored Stmts.
301 ASTContext &getASTContext() const;
303 /// Returns the declaration that contains the stored Stmts.
304 const Decl *getContainingDecl() const {
309 /// Returns true if this objects holds a list of statements.
310 bool holdsSequence() const { return EndIndex != 0; }
312 /// Returns the start sourcelocation of the first statement in this sequence.
314 /// This method should only be called on a non-empty StmtSequence object.
315 SourceLocation getStartLoc() const;
317 /// Returns the end sourcelocation of the last statement in this sequence.
319 /// This method should only be called on a non-empty StmtSequence object.
320 SourceLocation getEndLoc() const;
322 /// Returns the source range of the whole sequence - from the beginning
323 /// of the first statement to the end of the last statement.
324 SourceRange getSourceRange() const;
326 bool operator==(const StmtSequence &Other) const {
327 return std::tie(S, StartIndex, EndIndex) ==
328 std::tie(Other.S, Other.StartIndex, Other.EndIndex);
331 bool operator!=(const StmtSequence &Other) const {
332 return std::tie(S, StartIndex, EndIndex) !=
333 std::tie(Other.S, Other.StartIndex, Other.EndIndex);
336 /// Returns true if and only if this sequence covers a source range that
337 /// contains the source range of the given sequence \p Other.
339 /// This method should only be called on a non-empty StmtSequence object
340 /// and passed a non-empty StmtSequence object.
341 bool contains(const StmtSequence &Other) const;
344 /// Searches for similar subtrees in the AST.
346 /// First, this class needs several declarations with statement bodies which
347 /// can be passed via analyzeCodeBody. Afterwards all statements can be
348 /// searched for clones by calling findClones with a given list of constraints
349 /// that should specify the wanted properties of the clones.
351 /// The result of findClones can be further constrained with the constrainClones
354 /// This class only searches for clones in exectuable source code
355 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
356 /// are not supported.
357 class CloneDetector {
360 /// A collection of StmtSequences that share an arbitrary property.
361 typedef llvm::SmallVector<StmtSequence, 8> CloneGroup;
363 /// Generates and stores search data for all statements in the body of
365 void analyzeCodeBody(const Decl *D);
367 /// Constrains the given list of clone groups with the given constraint.
369 /// The constraint is expected to have a method with the signature
370 /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
371 /// as this is the interface that the CloneDetector uses for applying the
372 /// constraint. The constraint is supposed to directly modify the passed list
373 /// so that all clones in the list fulfill the specific property this
374 /// constraint ensures.
375 template <typename T>
376 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
377 C.constrain(CloneGroups);
380 /// Constrains the given list of clone groups with the given list of
383 /// The constraints are applied in sequence in the order in which they are
384 /// passed to this function.
385 template <typename T1, typename... Ts>
386 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
387 Ts... ConstraintList) {
388 constrainClones(CloneGroups, C);
389 constrainClones(CloneGroups, ConstraintList...);
392 /// Searches for clones in all previously passed statements.
393 /// \param Result Output parameter to which all created clone groups are
395 /// \param ConstraintList The constraints that should be applied to the
397 template <typename... Ts>
398 void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
399 // The initial assumption is that there is only one clone group and every
400 // statement is a clone of the others. This clone group will then be
401 // split up with the help of the constraints.
402 CloneGroup AllClones;
403 AllClones.reserve(Sequences.size());
404 for (const auto &C : Sequences) {
405 AllClones.push_back(C);
408 Result.push_back(AllClones);
410 constrainClones(Result, ConstraintList...);
414 CloneGroup Sequences;
417 /// This class is a utility class that contains utility functions for building
418 /// custom constraints.
419 class CloneConstraint {
421 /// Removes all groups by using a filter function.
422 /// \param CloneGroups The list of CloneGroups that is supposed to be
424 /// \param Filter The filter function that should return true for all groups
425 /// that should be removed from the list.
427 filterGroups(std::vector<CloneDetector::CloneGroup> &CloneGroups,
428 std::function<bool(const CloneDetector::CloneGroup &)> Filter) {
430 std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter),
434 /// Splits the given CloneGroups until the given Compare function returns true
435 /// for all clones in a single group.
436 /// \param CloneGroups A list of CloneGroups that should be modified.
437 /// \param Compare The comparison function that all clones are supposed to
438 /// pass. Should return true if and only if two clones belong
439 /// to the same CloneGroup.
440 static void splitCloneGroups(
441 std::vector<CloneDetector::CloneGroup> &CloneGroups,
442 std::function<bool(const StmtSequence &, const StmtSequence &)> Compare);
445 /// Searches all children of the given clones for type II clones (i.e. they are
446 /// identical in every aspect beside the used variable names).
447 class RecursiveCloneTypeIIConstraint {
449 /// Generates and saves a hash code for the given Stmt.
450 /// \param S The given Stmt.
451 /// \param D The Decl containing S.
452 /// \param StmtsByHash Output parameter that will contain the hash codes for
453 /// each StmtSequence in the given Stmt.
454 /// \return The hash code of the given Stmt.
456 /// If the given Stmt is a CompoundStmt, this method will also generate
457 /// hashes for all possible StmtSequences in the children of this Stmt.
458 size_t saveHash(const Stmt *S, const Decl *D,
459 std::vector<std::pair<size_t, StmtSequence>> &StmtsByHash);
462 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
465 /// Ensures that every clone has at least the given complexity.
467 /// Complexity is here defined as the total amount of children of a statement.
468 /// This constraint assumes the first statement in the group is representative
469 /// for all other statements in the group in terms of complexity.
470 class MinComplexityConstraint {
471 unsigned MinComplexity;
474 MinComplexityConstraint(unsigned MinComplexity)
475 : MinComplexity(MinComplexity) {}
477 size_t calculateStmtComplexity(const StmtSequence &Seq,
478 const std::string &ParentMacroStack = "");
480 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
481 CloneConstraint::filterGroups(
482 CloneGroups, [this](const CloneDetector::CloneGroup &A) {
484 return calculateStmtComplexity(A.front()) < MinComplexity;
491 /// Ensures that all clone groups contain at least the given amount of clones.
492 class MinGroupSizeConstraint {
493 unsigned MinGroupSize;
496 MinGroupSizeConstraint(unsigned MinGroupSize = 2)
497 : MinGroupSize(MinGroupSize) {}
499 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
500 CloneConstraint::filterGroups(CloneGroups,
501 [this](const CloneDetector::CloneGroup &A) {
502 return A.size() < MinGroupSize;
507 /// Ensures that no clone group fully contains another clone group.
508 struct OnlyLargestCloneConstraint {
509 void constrain(std::vector<CloneDetector::CloneGroup> &Result);
512 struct FilenamePatternConstraint {
513 StringRef IgnoredFilesPattern;
514 std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
516 FilenamePatternConstraint(StringRef IgnoredFilesPattern)
517 : IgnoredFilesPattern(IgnoredFilesPattern) {
518 IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
519 IgnoredFilesPattern.str() + "$)");
522 bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
524 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
525 CloneConstraint::filterGroups(
526 CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
527 return isAutoGenerated(Group);
532 /// Analyzes the pattern of the referenced variables in a statement.
533 class VariablePattern {
535 /// Describes an occurence of a variable reference in a statement.
536 struct VariableOccurence {
537 /// The index of the associated VarDecl in the Variables vector.
539 /// The statement in the code where the variable was referenced.
542 VariableOccurence(size_t KindID, const Stmt *Mention)
543 : KindID(KindID), Mention(Mention) {}
546 /// All occurences of referenced variables in the order of appearance.
547 std::vector<VariableOccurence> Occurences;
548 /// List of referenced variables in the order of appearance.
549 /// Every item in this list is unique.
550 std::vector<const VarDecl *> Variables;
552 /// Adds a new variable referenced to this pattern.
553 /// \param VarDecl The declaration of the variable that is referenced.
554 /// \param Mention The SourceRange where this variable is referenced.
555 void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
557 /// Adds each referenced variable from the given statement.
558 void addVariables(const Stmt *S);
561 /// Creates an VariablePattern object with information about the given
563 VariablePattern(const StmtSequence &Sequence) {
564 for (const Stmt *S : Sequence)
568 /// Describes two clones that reference their variables in a different pattern
569 /// which could indicate a programming error.
570 struct SuspiciousClonePair {
571 /// Utility class holding the relevant information about a single
572 /// clone in this pair.
573 struct SuspiciousCloneInfo {
574 /// The variable which referencing in this clone was against the pattern.
575 const VarDecl *Variable;
576 /// Where the variable was referenced.
578 /// The variable that should have been referenced to follow the pattern.
579 /// If Suggestion is a nullptr then it's not possible to fix the pattern
580 /// by referencing a different variable in this clone.
581 const VarDecl *Suggestion;
582 SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
583 const VarDecl *Suggestion)
584 : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
585 SuspiciousCloneInfo() {}
587 /// The first clone in the pair which always has a suggested variable.
588 SuspiciousCloneInfo FirstCloneInfo;
589 /// This other clone in the pair which can have a suggested variable.
590 SuspiciousCloneInfo SecondCloneInfo;
593 /// Counts the differences between this pattern and the given one.
594 /// \param Other The given VariablePattern to compare with.
595 /// \param FirstMismatch Output parameter that will be filled with information
596 /// about the first difference between the two patterns. This parameter
597 /// can be a nullptr, in which case it will be ignored.
598 /// \return Returns the number of differences between the pattern this object
599 /// is following and the given VariablePattern.
601 /// For example, the following statements all have the same pattern and this
602 /// function would return zero:
604 /// if (a < b) return a; return b;
605 /// if (x < y) return x; return y;
606 /// if (u2 < u1) return u2; return u1;
608 /// But the following statement has a different pattern (note the changed
609 /// variables in the return statements) and would have two differences when
610 /// compared with one of the statements above.
612 /// if (a < b) return b; return a;
614 /// This function should only be called if the related statements of the given
615 /// pattern and the statements of this objects are clones of each other.
616 unsigned countPatternDifferences(
617 const VariablePattern &Other,
618 VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
621 /// Ensures that all clones reference variables in the same pattern.
622 struct MatchingVariablePatternConstraint {
623 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
626 } // end namespace clang
628 #endif // LLVM_CLANG_AST_CLONEDETECTION_H