1 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// This file defines classes for searching and anlyzing source code clones.
13 //===----------------------------------------------------------------------===//
15 #ifndef LLVM_CLANG_AST_CLONEDETECTION_H
16 #define LLVM_CLANG_AST_CLONEDETECTION_H
18 #include "clang/Basic/SourceLocation.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/Support/Regex.h"
32 /// Identifies a list of statements.
34 /// Can either identify a single arbitrary Stmt object, a continuous sequence of
35 /// child statements inside a CompoundStmt or no statements at all.
37 /// If this object identifies a sequence of statements inside a CompoundStmt,
38 /// S points to this CompoundStmt. If this object only identifies a single
39 /// Stmt, then S is a pointer to this Stmt.
42 /// The declaration that contains the statements.
45 /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
46 /// instance is representing the CompoundStmt children inside the array
47 /// [StartIndex, EndIndex).
52 /// Constructs a StmtSequence holding multiple statements.
54 /// The resulting StmtSequence identifies a continuous sequence of statements
55 /// in the body of the given CompoundStmt. Which statements of the body should
56 /// be identified needs to be specified by providing a start and end index
57 /// that describe a non-empty sub-array in the body of the given CompoundStmt.
59 /// \param Stmt A CompoundStmt that contains all statements in its body.
60 /// \param D The Decl containing this Stmt.
61 /// \param StartIndex The inclusive start index in the children array of
63 /// \param EndIndex The exclusive end index in the children array of \p Stmt.
64 StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
67 /// Constructs a StmtSequence holding a single statement.
69 /// \param Stmt An arbitrary Stmt.
70 /// \param D The Decl containing this Stmt.
71 StmtSequence(const Stmt *Stmt, const Decl *D);
73 /// Constructs an empty StmtSequence.
76 typedef const Stmt *const *iterator;
78 /// Returns an iterator pointing to the first statement in this sequence.
79 iterator begin() const;
81 /// Returns an iterator pointing behind the last statement in this sequence.
84 /// Returns the first statement in this sequence.
86 /// This method should only be called on a non-empty StmtSequence object.
87 const Stmt *front() const {
92 /// Returns the last statement in this sequence.
94 /// This method should only be called on a non-empty StmtSequence object.
95 const Stmt *back() const {
97 return begin()[size() - 1];
100 /// Returns the number of statements this object holds.
101 unsigned size() const {
103 return EndIndex - StartIndex;
109 /// Returns true if and only if this StmtSequence contains no statements.
110 bool empty() const { return size() == 0; }
112 /// Returns the related ASTContext for the stored Stmts.
113 ASTContext &getASTContext() const;
115 /// Returns the declaration that contains the stored Stmts.
116 const Decl *getContainingDecl() const {
121 /// Returns true if this objects holds a list of statements.
122 bool holdsSequence() const { return EndIndex != 0; }
124 /// Returns the start sourcelocation of the first statement in this sequence.
126 /// This method should only be called on a non-empty StmtSequence object.
127 SourceLocation getStartLoc() const;
129 /// Returns the end sourcelocation of the last statement in this sequence.
131 /// This method should only be called on a non-empty StmtSequence object.
132 SourceLocation getEndLoc() const;
134 /// Returns the source range of the whole sequence - from the beginning
135 /// of the first statement to the end of the last statement.
136 SourceRange getSourceRange() const;
138 bool operator==(const StmtSequence &Other) const {
139 return std::tie(S, StartIndex, EndIndex) ==
140 std::tie(Other.S, Other.StartIndex, Other.EndIndex);
143 bool operator!=(const StmtSequence &Other) const {
144 return std::tie(S, StartIndex, EndIndex) !=
145 std::tie(Other.S, Other.StartIndex, Other.EndIndex);
148 /// Returns true if and only if this sequence covers a source range that
149 /// contains the source range of the given sequence \p Other.
151 /// This method should only be called on a non-empty StmtSequence object
152 /// and passed a non-empty StmtSequence object.
153 bool contains(const StmtSequence &Other) const;
156 /// Searches for similar subtrees in the AST.
158 /// First, this class needs several declarations with statement bodies which
159 /// can be passed via analyzeCodeBody. Afterwards all statements can be
160 /// searched for clones by calling findClones with a given list of constraints
161 /// that should specify the wanted properties of the clones.
163 /// The result of findClones can be further constrained with the constrainClones
166 /// This class only searches for clones in exectuable source code
167 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
168 /// are not supported.
169 class CloneDetector {
172 /// A collection of StmtSequences that share an arbitrary property.
173 typedef llvm::SmallVector<StmtSequence, 8> CloneGroup;
175 /// Generates and stores search data for all statements in the body of
177 void analyzeCodeBody(const Decl *D);
179 /// Constrains the given list of clone groups with the given constraint.
181 /// The constraint is expected to have a method with the signature
182 /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
183 /// as this is the interface that the CloneDetector uses for applying the
184 /// constraint. The constraint is supposed to directly modify the passed list
185 /// so that all clones in the list fulfill the specific property this
186 /// constraint ensures.
187 template <typename T>
188 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
189 C.constrain(CloneGroups);
192 /// Constrains the given list of clone groups with the given list of
195 /// The constraints are applied in sequence in the order in which they are
196 /// passed to this function.
197 template <typename T1, typename... Ts>
198 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
199 Ts... ConstraintList) {
200 constrainClones(CloneGroups, C);
201 constrainClones(CloneGroups, ConstraintList...);
204 /// Searches for clones in all previously passed statements.
205 /// \param Result Output parameter to which all created clone groups are
207 /// \param ConstraintList The constraints that should be applied to the
209 template <typename... Ts>
210 void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
211 // The initial assumption is that there is only one clone group and every
212 // statement is a clone of the others. This clone group will then be
213 // split up with the help of the constraints.
214 CloneGroup AllClones;
215 AllClones.reserve(Sequences.size());
216 for (const auto &C : Sequences) {
217 AllClones.push_back(C);
220 Result.push_back(AllClones);
222 constrainClones(Result, ConstraintList...);
226 CloneGroup Sequences;
229 /// This class is a utility class that contains utility functions for building
230 /// custom constraints.
231 class CloneConstraint {
233 /// Removes all groups by using a filter function.
234 /// \param CloneGroups The list of CloneGroups that is supposed to be
236 /// \param Filter The filter function that should return true for all groups
237 /// that should be removed from the list.
239 filterGroups(std::vector<CloneDetector::CloneGroup> &CloneGroups,
240 std::function<bool(const CloneDetector::CloneGroup &)> Filter) {
242 std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter),
246 /// Splits the given CloneGroups until the given Compare function returns true
247 /// for all clones in a single group.
248 /// \param CloneGroups A list of CloneGroups that should be modified.
249 /// \param Compare The comparison function that all clones are supposed to
250 /// pass. Should return true if and only if two clones belong
251 /// to the same CloneGroup.
252 static void splitCloneGroups(
253 std::vector<CloneDetector::CloneGroup> &CloneGroups,
254 std::function<bool(const StmtSequence &, const StmtSequence &)> Compare);
257 /// Searches all children of the given clones for type II clones (i.e. they are
258 /// identical in every aspect beside the used variable names).
259 class RecursiveCloneTypeIIConstraint {
261 /// Generates and saves a hash code for the given Stmt.
262 /// \param S The given Stmt.
263 /// \param D The Decl containing S.
264 /// \param StmtsByHash Output parameter that will contain the hash codes for
265 /// each StmtSequence in the given Stmt.
266 /// \return The hash code of the given Stmt.
268 /// If the given Stmt is a CompoundStmt, this method will also generate
269 /// hashes for all possible StmtSequences in the children of this Stmt.
270 size_t saveHash(const Stmt *S, const Decl *D,
271 std::vector<std::pair<size_t, StmtSequence>> &StmtsByHash);
274 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
277 /// Ensures that every clone has at least the given complexity.
279 /// Complexity is here defined as the total amount of children of a statement.
280 /// This constraint assumes the first statement in the group is representative
281 /// for all other statements in the group in terms of complexity.
282 class MinComplexityConstraint {
283 unsigned MinComplexity;
286 MinComplexityConstraint(unsigned MinComplexity)
287 : MinComplexity(MinComplexity) {}
289 size_t calculateStmtComplexity(const StmtSequence &Seq,
290 const std::string &ParentMacroStack = "");
292 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
293 CloneConstraint::filterGroups(
294 CloneGroups, [this](const CloneDetector::CloneGroup &A) {
296 return calculateStmtComplexity(A.front()) < MinComplexity;
303 /// Ensures that all clone groups contain at least the given amount of clones.
304 class MinGroupSizeConstraint {
305 unsigned MinGroupSize;
308 MinGroupSizeConstraint(unsigned MinGroupSize = 2)
309 : MinGroupSize(MinGroupSize) {}
311 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
312 CloneConstraint::filterGroups(CloneGroups,
313 [this](const CloneDetector::CloneGroup &A) {
314 return A.size() < MinGroupSize;
319 /// Ensures that no clone group fully contains another clone group.
320 struct OnlyLargestCloneConstraint {
321 void constrain(std::vector<CloneDetector::CloneGroup> &Result);
324 struct FilenamePatternConstraint {
325 StringRef IgnoredFilesPattern;
326 std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
328 FilenamePatternConstraint(StringRef IgnoredFilesPattern)
329 : IgnoredFilesPattern(IgnoredFilesPattern) {
330 IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
331 IgnoredFilesPattern.str() + "$)");
334 bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
336 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
337 CloneConstraint::filterGroups(
338 CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
339 return isAutoGenerated(Group);
344 /// Analyzes the pattern of the referenced variables in a statement.
345 class VariablePattern {
347 /// Describes an occurence of a variable reference in a statement.
348 struct VariableOccurence {
349 /// The index of the associated VarDecl in the Variables vector.
351 /// The statement in the code where the variable was referenced.
354 VariableOccurence(size_t KindID, const Stmt *Mention)
355 : KindID(KindID), Mention(Mention) {}
358 /// All occurences of referenced variables in the order of appearance.
359 std::vector<VariableOccurence> Occurences;
360 /// List of referenced variables in the order of appearance.
361 /// Every item in this list is unique.
362 std::vector<const VarDecl *> Variables;
364 /// Adds a new variable referenced to this pattern.
365 /// \param VarDecl The declaration of the variable that is referenced.
366 /// \param Mention The SourceRange where this variable is referenced.
367 void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
369 /// Adds each referenced variable from the given statement.
370 void addVariables(const Stmt *S);
373 /// Creates an VariablePattern object with information about the given
375 VariablePattern(const StmtSequence &Sequence) {
376 for (const Stmt *S : Sequence)
380 /// Describes two clones that reference their variables in a different pattern
381 /// which could indicate a programming error.
382 struct SuspiciousClonePair {
383 /// Utility class holding the relevant information about a single
384 /// clone in this pair.
385 struct SuspiciousCloneInfo {
386 /// The variable which referencing in this clone was against the pattern.
387 const VarDecl *Variable;
388 /// Where the variable was referenced.
390 /// The variable that should have been referenced to follow the pattern.
391 /// If Suggestion is a nullptr then it's not possible to fix the pattern
392 /// by referencing a different variable in this clone.
393 const VarDecl *Suggestion;
394 SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
395 const VarDecl *Suggestion)
396 : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
397 SuspiciousCloneInfo() {}
399 /// The first clone in the pair which always has a suggested variable.
400 SuspiciousCloneInfo FirstCloneInfo;
401 /// This other clone in the pair which can have a suggested variable.
402 SuspiciousCloneInfo SecondCloneInfo;
405 /// Counts the differences between this pattern and the given one.
406 /// \param Other The given VariablePattern to compare with.
407 /// \param FirstMismatch Output parameter that will be filled with information
408 /// about the first difference between the two patterns. This parameter
409 /// can be a nullptr, in which case it will be ignored.
410 /// \return Returns the number of differences between the pattern this object
411 /// is following and the given VariablePattern.
413 /// For example, the following statements all have the same pattern and this
414 /// function would return zero:
416 /// if (a < b) return a; return b;
417 /// if (x < y) return x; return y;
418 /// if (u2 < u1) return u2; return u1;
420 /// But the following statement has a different pattern (note the changed
421 /// variables in the return statements) and would have two differences when
422 /// compared with one of the statements above.
424 /// if (a < b) return b; return a;
426 /// This function should only be called if the related statements of the given
427 /// pattern and the statements of this objects are clones of each other.
428 unsigned countPatternDifferences(
429 const VariablePattern &Other,
430 VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
433 /// Ensures that all clones reference variables in the same pattern.
434 struct MatchingVariablePatternConstraint {
435 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
438 } // end namespace clang
440 #endif // LLVM_CLANG_AST_CLONEDETECTION_H