1 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// This file defines classes for searching and analyzing source code clones.
13 //===----------------------------------------------------------------------===//
15 #ifndef LLVM_CLANG_AST_CLONEDETECTION_H
16 #define LLVM_CLANG_AST_CLONEDETECTION_H
18 #include "clang/AST/StmtVisitor.h"
19 #include "llvm/Support/Regex.h"
30 /// Identifies a list of statements.
32 /// Can either identify a single arbitrary Stmt object, a continuous sequence of
33 /// child statements inside a CompoundStmt or no statements at all.
35 /// If this object identifies a sequence of statements inside a CompoundStmt,
36 /// S points to this CompoundStmt. If this object only identifies a single
37 /// Stmt, then S is a pointer to this Stmt.
40 /// The declaration that contains the statements.
43 /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
44 /// instance is representing the CompoundStmt children inside the array
45 /// [StartIndex, EndIndex).
50 /// Constructs a StmtSequence holding multiple statements.
52 /// The resulting StmtSequence identifies a continuous sequence of statements
53 /// in the body of the given CompoundStmt. Which statements of the body should
54 /// be identified needs to be specified by providing a start and end index
55 /// that describe a non-empty sub-array in the body of the given CompoundStmt.
57 /// \param Stmt A CompoundStmt that contains all statements in its body.
58 /// \param D The Decl containing this Stmt.
59 /// \param StartIndex The inclusive start index in the children array of
61 /// \param EndIndex The exclusive end index in the children array of \p Stmt.
62 StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
65 /// Constructs a StmtSequence holding a single statement.
67 /// \param Stmt An arbitrary Stmt.
68 /// \param D The Decl containing this Stmt.
69 StmtSequence(const Stmt *Stmt, const Decl *D);
71 /// Constructs an empty StmtSequence.
74 typedef const Stmt *const *iterator;
76 /// Returns an iterator pointing to the first statement in this sequence.
77 iterator begin() const;
79 /// Returns an iterator pointing behind the last statement in this sequence.
82 /// Returns the first statement in this sequence.
84 /// This method should only be called on a non-empty StmtSequence object.
85 const Stmt *front() const {
90 /// Returns the last statement in this sequence.
92 /// This method should only be called on a non-empty StmtSequence object.
93 const Stmt *back() const {
95 return begin()[size() - 1];
98 /// Returns the number of statements this object holds.
99 unsigned size() const {
101 return EndIndex - StartIndex;
107 /// Returns true if and only if this StmtSequence contains no statements.
108 bool empty() const { return size() == 0; }
110 /// Returns the related ASTContext for the stored Stmts.
111 ASTContext &getASTContext() const;
113 /// Returns the declaration that contains the stored Stmts.
114 const Decl *getContainingDecl() const {
119 /// Returns true if this objects holds a list of statements.
120 bool holdsSequence() const { return EndIndex != 0; }
122 /// Returns the start sourcelocation of the first statement in this sequence.
124 /// This method should only be called on a non-empty StmtSequence object.
125 SourceLocation getStartLoc() const LLVM_READONLY { return getBeginLoc(); }
126 SourceLocation getBeginLoc() const;
128 /// Returns the end sourcelocation of the last statement in this sequence.
130 /// This method should only be called on a non-empty StmtSequence object.
131 SourceLocation getEndLoc() const;
133 /// Returns the source range of the whole sequence - from the beginning
134 /// of the first statement to the end of the last statement.
135 SourceRange getSourceRange() const;
137 bool operator==(const StmtSequence &Other) const {
138 return std::tie(S, StartIndex, EndIndex) ==
139 std::tie(Other.S, Other.StartIndex, Other.EndIndex);
142 bool operator!=(const StmtSequence &Other) const {
143 return std::tie(S, StartIndex, EndIndex) !=
144 std::tie(Other.S, Other.StartIndex, Other.EndIndex);
147 /// Returns true if and only if this sequence covers a source range that
148 /// contains the source range of the given sequence \p Other.
150 /// This method should only be called on a non-empty StmtSequence object
151 /// and passed a non-empty StmtSequence object.
152 bool contains(const StmtSequence &Other) const;
155 /// Searches for similar subtrees in the AST.
157 /// First, this class needs several declarations with statement bodies which
158 /// can be passed via analyzeCodeBody. Afterwards all statements can be
159 /// searched for clones by calling findClones with a given list of constraints
160 /// that should specify the wanted properties of the clones.
162 /// The result of findClones can be further constrained with the constrainClones
165 /// This class only searches for clones in executable source code
166 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
167 /// are not supported.
168 class CloneDetector {
171 /// A collection of StmtSequences that share an arbitrary property.
172 typedef llvm::SmallVector<StmtSequence, 8> CloneGroup;
174 /// Generates and stores search data for all statements in the body of
176 void analyzeCodeBody(const Decl *D);
178 /// Constrains the given list of clone groups with the given constraint.
180 /// The constraint is expected to have a method with the signature
181 /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
182 /// as this is the interface that the CloneDetector uses for applying the
183 /// constraint. The constraint is supposed to directly modify the passed list
184 /// so that all clones in the list fulfill the specific property this
185 /// constraint ensures.
186 template <typename T>
187 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
188 C.constrain(CloneGroups);
191 /// Constrains the given list of clone groups with the given list of
194 /// The constraints are applied in sequence in the order in which they are
195 /// passed to this function.
196 template <typename T1, typename... Ts>
197 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
198 Ts... ConstraintList) {
199 constrainClones(CloneGroups, C);
200 constrainClones(CloneGroups, ConstraintList...);
203 /// Searches for clones in all previously passed statements.
204 /// \param Result Output parameter to which all created clone groups are
206 /// \param ConstraintList The constraints that should be applied to the
208 template <typename... Ts>
209 void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
210 // The initial assumption is that there is only one clone group and every
211 // statement is a clone of the others. This clone group will then be
212 // split up with the help of the constraints.
213 CloneGroup AllClones;
214 AllClones.reserve(Sequences.size());
215 for (const auto &C : Sequences) {
216 AllClones.push_back(C);
219 Result.push_back(AllClones);
221 constrainClones(Result, ConstraintList...);
225 CloneGroup Sequences;
228 /// This class is a utility class that contains utility functions for building
229 /// custom constraints.
230 class CloneConstraint {
232 /// Removes all groups by using a filter function.
233 /// \param CloneGroups The list of CloneGroups that is supposed to be
235 /// \param Filter The filter function that should return true for all groups
236 /// that should be removed from the list.
237 static void filterGroups(
238 std::vector<CloneDetector::CloneGroup> &CloneGroups,
239 llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) {
241 std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter),
245 /// Splits the given CloneGroups until the given Compare function returns true
246 /// for all clones in a single group.
247 /// \param CloneGroups A list of CloneGroups that should be modified.
248 /// \param Compare The comparison function that all clones are supposed to
249 /// pass. Should return true if and only if two clones belong
250 /// to the same CloneGroup.
251 static void splitCloneGroups(
252 std::vector<CloneDetector::CloneGroup> &CloneGroups,
253 llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
257 /// This constraint moves clones into clone groups of type II via hashing.
259 /// Clones with different hash values are moved into separate clone groups.
260 /// Collisions are possible, and this constraint does nothing to address this
261 /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the
262 /// constraint chain, not necessarily immediately, to eliminate hash collisions
263 /// through a more detailed analysis.
264 class RecursiveCloneTypeIIHashConstraint {
266 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
269 /// This constraint moves clones into clone groups of type II by comparing them.
271 /// Clones that aren't type II clones are moved into separate clone groups.
272 /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone
273 /// group are guaranteed to be be type II clones of each other, but it is too
274 /// slow to efficiently handle large amounts of clones.
275 class RecursiveCloneTypeIIVerifyConstraint {
277 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
280 /// Ensures that every clone has at least the given complexity.
282 /// Complexity is here defined as the total amount of children of a statement.
283 /// This constraint assumes the first statement in the group is representative
284 /// for all other statements in the group in terms of complexity.
285 class MinComplexityConstraint {
286 unsigned MinComplexity;
289 MinComplexityConstraint(unsigned MinComplexity)
290 : MinComplexity(MinComplexity) {}
292 /// Calculates the complexity of the given StmtSequence.
293 /// \param Limit The limit of complexity we probe for. After reaching
294 /// this limit during calculation, this method is exiting
295 /// early to improve performance and returns this limit.
296 size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit,
297 const std::string &ParentMacroStack = "");
299 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
300 CloneConstraint::filterGroups(
301 CloneGroups, [this](const CloneDetector::CloneGroup &A) {
303 return calculateStmtComplexity(A.front(), MinComplexity) <
311 /// Ensures that all clone groups contain at least the given amount of clones.
312 class MinGroupSizeConstraint {
313 unsigned MinGroupSize;
316 MinGroupSizeConstraint(unsigned MinGroupSize = 2)
317 : MinGroupSize(MinGroupSize) {}
319 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
320 CloneConstraint::filterGroups(CloneGroups,
321 [this](const CloneDetector::CloneGroup &A) {
322 return A.size() < MinGroupSize;
327 /// Ensures that no clone group fully contains another clone group.
328 struct OnlyLargestCloneConstraint {
329 void constrain(std::vector<CloneDetector::CloneGroup> &Result);
332 struct FilenamePatternConstraint {
333 StringRef IgnoredFilesPattern;
334 std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
336 FilenamePatternConstraint(StringRef IgnoredFilesPattern)
337 : IgnoredFilesPattern(IgnoredFilesPattern) {
338 IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
339 IgnoredFilesPattern.str() + "$)");
342 bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
344 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
345 CloneConstraint::filterGroups(
346 CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
347 return isAutoGenerated(Group);
352 /// Analyzes the pattern of the referenced variables in a statement.
353 class VariablePattern {
355 /// Describes an occurrence of a variable reference in a statement.
356 struct VariableOccurence {
357 /// The index of the associated VarDecl in the Variables vector.
359 /// The statement in the code where the variable was referenced.
362 VariableOccurence(size_t KindID, const Stmt *Mention)
363 : KindID(KindID), Mention(Mention) {}
366 /// All occurrences of referenced variables in the order of appearance.
367 std::vector<VariableOccurence> Occurences;
368 /// List of referenced variables in the order of appearance.
369 /// Every item in this list is unique.
370 std::vector<const VarDecl *> Variables;
372 /// Adds a new variable referenced to this pattern.
373 /// \param VarDecl The declaration of the variable that is referenced.
374 /// \param Mention The SourceRange where this variable is referenced.
375 void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
377 /// Adds each referenced variable from the given statement.
378 void addVariables(const Stmt *S);
381 /// Creates an VariablePattern object with information about the given
383 VariablePattern(const StmtSequence &Sequence) {
384 for (const Stmt *S : Sequence)
388 /// Describes two clones that reference their variables in a different pattern
389 /// which could indicate a programming error.
390 struct SuspiciousClonePair {
391 /// Utility class holding the relevant information about a single
392 /// clone in this pair.
393 struct SuspiciousCloneInfo {
394 /// The variable which referencing in this clone was against the pattern.
395 const VarDecl *Variable;
396 /// Where the variable was referenced.
398 /// The variable that should have been referenced to follow the pattern.
399 /// If Suggestion is a nullptr then it's not possible to fix the pattern
400 /// by referencing a different variable in this clone.
401 const VarDecl *Suggestion;
402 SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
403 const VarDecl *Suggestion)
404 : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
405 SuspiciousCloneInfo() {}
407 /// The first clone in the pair which always has a suggested variable.
408 SuspiciousCloneInfo FirstCloneInfo;
409 /// This other clone in the pair which can have a suggested variable.
410 SuspiciousCloneInfo SecondCloneInfo;
413 /// Counts the differences between this pattern and the given one.
414 /// \param Other The given VariablePattern to compare with.
415 /// \param FirstMismatch Output parameter that will be filled with information
416 /// about the first difference between the two patterns. This parameter
417 /// can be a nullptr, in which case it will be ignored.
418 /// \return Returns the number of differences between the pattern this object
419 /// is following and the given VariablePattern.
421 /// For example, the following statements all have the same pattern and this
422 /// function would return zero:
424 /// if (a < b) return a; return b;
425 /// if (x < y) return x; return y;
426 /// if (u2 < u1) return u2; return u1;
428 /// But the following statement has a different pattern (note the changed
429 /// variables in the return statements) and would have two differences when
430 /// compared with one of the statements above.
432 /// if (a < b) return b; return a;
434 /// This function should only be called if the related statements of the given
435 /// pattern and the statements of this objects are clones of each other.
436 unsigned countPatternDifferences(
437 const VariablePattern &Other,
438 VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
441 /// Ensures that all clones reference variables in the same pattern.
442 struct MatchingVariablePatternConstraint {
443 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
446 } // end namespace clang
448 #endif // LLVM_CLANG_AST_CLONEDETECTION_H