1 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// This file defines classes for searching and anlyzing source code clones.
13 //===----------------------------------------------------------------------===//
15 #ifndef LLVM_CLANG_AST_CLONEDETECTION_H
16 #define LLVM_CLANG_AST_CLONEDETECTION_H
18 #include "clang/Basic/SourceLocation.h"
19 #include "llvm/ADT/Hashing.h"
20 #include "llvm/ADT/StringMap.h"
32 /// \brief Identifies a list of statements.
34 /// Can either identify a single arbitrary Stmt object, a continuous sequence of
35 /// child statements inside a CompoundStmt or no statements at all.
37 /// If this object identifies a sequence of statements inside a CompoundStmt,
38 /// S points to this CompoundStmt. If this object only identifies a single
39 /// Stmt, then S is a pointer to this Stmt.
42 /// The related ASTContext for S.
45 /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
46 /// instance is representing the CompoundStmt children inside the array
47 /// [StartIndex, EndIndex).
52 /// \brief Constructs a StmtSequence holding multiple statements.
54 /// The resulting StmtSequence identifies a continuous sequence of statements
55 /// in the body of the given CompoundStmt. Which statements of the body should
56 /// be identified needs to be specified by providing a start and end index
57 /// that describe a non-empty sub-array in the body of the given CompoundStmt.
59 /// \param Stmt A CompoundStmt that contains all statements in its body.
60 /// \param Context The ASTContext for the given CompoundStmt.
61 /// \param StartIndex The inclusive start index in the children array of
63 /// \param EndIndex The exclusive end index in the children array of \p Stmt.
64 StmtSequence(const CompoundStmt *Stmt, ASTContext &Context,
65 unsigned StartIndex, unsigned EndIndex);
67 /// \brief Constructs a StmtSequence holding a single statement.
69 /// \param Stmt An arbitrary Stmt.
70 /// \param Context The ASTContext for the given Stmt.
71 StmtSequence(const Stmt *Stmt, ASTContext &Context);
73 /// \brief Constructs an empty StmtSequence.
76 typedef const Stmt *const *iterator;
78 /// Returns an iterator pointing to the first statement in this sequence.
79 iterator begin() const;
81 /// Returns an iterator pointing behind the last statement in this sequence.
84 /// Returns the first statement in this sequence.
86 /// This method should only be called on a non-empty StmtSequence object.
87 const Stmt *front() const {
92 /// Returns the last statement in this sequence.
94 /// This method should only be called on a non-empty StmtSequence object.
95 const Stmt *back() const {
97 return begin()[size() - 1];
100 /// Returns the number of statements this object holds.
101 unsigned size() const {
103 return EndIndex - StartIndex;
109 /// Returns true if and only if this StmtSequence contains no statements.
110 bool empty() const { return size() == 0; }
112 /// Returns the related ASTContext for the stored Stmts.
113 ASTContext &getASTContext() const {
118 /// Returns true if this objects holds a list of statements.
119 bool holdsSequence() const { return EndIndex != 0; }
121 /// Returns the start sourcelocation of the first statement in this sequence.
123 /// This method should only be called on a non-empty StmtSequence object.
124 SourceLocation getStartLoc() const;
126 /// Returns the end sourcelocation of the last statement in this sequence.
128 /// This method should only be called on a non-empty StmtSequence object.
129 SourceLocation getEndLoc() const;
131 /// Returns the source range of the whole sequence - from the beginning
132 /// of the first statement to the end of the last statement.
133 SourceRange getSourceRange() const;
135 bool operator==(const StmtSequence &Other) const {
136 return std::tie(S, StartIndex, EndIndex) ==
137 std::tie(Other.S, Other.StartIndex, Other.EndIndex);
140 bool operator!=(const StmtSequence &Other) const {
141 return std::tie(S, StartIndex, EndIndex) !=
142 std::tie(Other.S, Other.StartIndex, Other.EndIndex);
145 /// Returns true if and only if this sequence covers a source range that
146 /// contains the source range of the given sequence \p Other.
148 /// This method should only be called on a non-empty StmtSequence object
149 /// and passed a non-empty StmtSequence object.
150 bool contains(const StmtSequence &Other) const;
153 /// \brief Searches for clones in source code.
155 /// First, this class needs a translation unit which is passed via
156 /// \p analyzeTranslationUnit . It will then generate and store search data
157 /// for all statements inside the given translation unit.
158 /// Afterwards the generated data can be used to find code clones by calling
161 /// This class only searches for clones in exectuable source code
162 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
163 /// are not supported.
164 class CloneDetector {
166 typedef unsigned DataPiece;
168 /// Holds the data about a StmtSequence that is needed during the search for
170 struct CloneSignature {
171 /// \brief The hash code of the StmtSequence.
173 /// The initial clone groups that are formed during the search for clones
174 /// consist only of Sequences that share the same hash code. This makes this
175 /// value the central part of this heuristic that is needed to find clones
176 /// in a performant way. For this to work, the type of this variable
177 /// always needs to be small and fast to compare.
179 /// Also, StmtSequences that are clones of each others have to share
180 /// the same hash code. StmtSequences that are not clones of each other
181 /// shouldn't share the same hash code, but if they do, it will only
182 /// degrade the performance of the hash search but doesn't influence
183 /// the correctness of the result.
186 /// \brief The complexity of the StmtSequence.
188 /// This value gives an approximation on how many direct or indirect child
189 /// statements are contained in the related StmtSequence. In general, the
190 /// greater this value, the greater the amount of statements. However, this
191 /// is only an approximation and the actual amount of statements can be
192 /// higher or lower than this value. Statements that are generated by the
193 /// compiler (e.g. macro expansions) for example barely influence the
194 /// complexity value.
196 /// The main purpose of this value is to filter clones that are too small
197 /// and therefore probably not interesting enough for the user.
200 /// \brief Creates an empty CloneSignature without any data.
201 CloneSignature() : Complexity(1) {}
203 CloneSignature(llvm::hash_code Hash, unsigned Complexity)
204 : Hash(Hash), Complexity(Complexity) {}
207 /// Holds group of StmtSequences that are clones of each other and the
208 /// complexity value (see CloneSignature::Complexity) that all stored
209 /// StmtSequences have in common.
211 std::vector<StmtSequence> Sequences;
212 CloneSignature Signature;
216 CloneGroup(const StmtSequence &Seq, CloneSignature Signature)
217 : Signature(Signature) {
218 Sequences.push_back(Seq);
221 /// \brief Returns false if and only if this group should be skipped when
222 /// searching for clones.
223 bool isValid() const {
224 // A clone group with only one member makes no sense, so we skip them.
225 return Sequences.size() > 1;
229 /// \brief Generates and stores search data for all statements in the body of
231 void analyzeCodeBody(const Decl *D);
233 /// \brief Stores the CloneSignature to allow future querying.
234 void add(const StmtSequence &S, const CloneSignature &Signature);
236 /// \brief Searches the provided statements for clones.
238 /// \param Result Output parameter that is filled with a list of found
239 /// clone groups. Each group contains multiple StmtSequences
240 /// that were identified to be clones of each other.
241 /// \param MinGroupComplexity Only return clones which have at least this
242 /// complexity value.
243 /// \param CheckPatterns Returns only clone groups in which the referenced
244 /// variables follow the same pattern.
245 void findClones(std::vector<CloneGroup> &Result, unsigned MinGroupComplexity,
246 bool CheckPatterns = true);
248 /// \brief Describes two clones that reference their variables in a different
249 /// pattern which could indicate a programming error.
250 struct SuspiciousClonePair {
251 /// \brief Utility class holding the relevant information about a single
252 /// clone in this pair.
253 struct SuspiciousCloneInfo {
254 /// The variable which referencing in this clone was against the pattern.
255 const VarDecl *Variable;
256 /// Where the variable was referenced.
258 /// The variable that should have been referenced to follow the pattern.
259 /// If Suggestion is a nullptr then it's not possible to fix the pattern
260 /// by referencing a different variable in this clone.
261 const VarDecl *Suggestion;
262 SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
263 const VarDecl *Suggestion)
264 : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
265 SuspiciousCloneInfo() {}
267 /// The first clone in the pair which always has a suggested variable.
268 SuspiciousCloneInfo FirstCloneInfo;
269 /// This other clone in the pair which can have a suggested variable.
270 SuspiciousCloneInfo SecondCloneInfo;
273 /// \brief Searches the provided statements for pairs of clones that don't
274 /// follow the same pattern when referencing variables.
275 /// \param Result Output parameter that will contain the clone pairs.
276 /// \param MinGroupComplexity Only clone pairs in which the clones have at
277 /// least this complexity value.
278 void findSuspiciousClones(std::vector<SuspiciousClonePair> &Result,
279 unsigned MinGroupComplexity);
282 /// Stores all encountered StmtSequences alongside their CloneSignature.
283 std::vector<std::pair<CloneSignature, StmtSequence>> Sequences;
286 } // end namespace clang
288 #endif // LLVM_CLANG_AST_CLONEDETECTION_H