1 //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines the Suffix Tree class and Suffix Tree Node struct.
11 //===----------------------------------------------------------------------===//
12 #ifndef LLVM_SUPPORT_SUFFIXTREE_H
13 #define LLVM_SUPPORT_SUFFIXTREE_H
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/DenseMap.h"
17 #include "llvm/Support/Allocator.h"
22 /// Represents an undefined index in the suffix tree.
23 const unsigned EmptyIdx = -1;
25 /// A node in a suffix tree which represents a substring or suffix.
27 /// Each node has either no children or at least two children, with the root
28 /// being a exception in the empty tree.
30 /// Children are represented as a map between unsigned integers and nodes. If
31 /// a node N has a child M on unsigned integer k, then the mapping represented
32 /// by N is a proper prefix of the mapping represented by M. Note that this,
33 /// although similar to a trie is somewhat different: each node stores a full
34 /// substring of the full mapping rather than a single character state.
36 /// Each internal node contains a pointer to the internal node representing
37 /// the same string, but with the first character chopped off. This is stored
38 /// in \p Link. Each leaf node stores the start index of its respective
39 /// suffix in \p SuffixIdx.
40 struct SuffixTreeNode {
42 /// The children of this node.
44 /// A child existing on an unsigned integer implies that from the mapping
45 /// represented by the current node, there is a way to reach another
46 /// mapping by tacking that character on the end of the current string.
47 llvm::DenseMap<unsigned, SuffixTreeNode *> Children;
49 /// The start index of this node's substring in the main string.
50 unsigned StartIdx = EmptyIdx;
52 /// The end index of this node's substring in the main string.
54 /// Every leaf node must have its \p EndIdx incremented at the end of every
55 /// step in the construction algorithm. To avoid having to update O(N)
56 /// nodes individually at the end of every step, the end index is stored
58 unsigned *EndIdx = nullptr;
60 /// For leaves, the start index of the suffix represented by this node.
62 /// For all other nodes, this is ignored.
63 unsigned SuffixIdx = EmptyIdx;
65 /// For internal nodes, a pointer to the internal node representing
66 /// the same sequence with the first character chopped off.
68 /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
69 /// Ukkonen's algorithm does to achieve linear-time construction is
70 /// keep track of which node the next insert should be at. This makes each
71 /// insert O(1), and there are a total of O(N) inserts. The suffix link
72 /// helps with inserting children of internal nodes.
74 /// Say we add a child to an internal node with associated mapping S. The
75 /// next insertion must be at the node representing S - its first character.
76 /// This is given by the way that we iteratively build the tree in Ukkonen's
77 /// algorithm. The main idea is to look at the suffixes of each prefix in the
78 /// string, starting with the longest suffix of the prefix, and ending with
79 /// the shortest. Therefore, if we keep pointers between such nodes, we can
80 /// move to the next insertion point in O(1) time. If we don't, then we'd
81 /// have to query from the root, which takes O(N) time. This would make the
82 /// construction algorithm O(N^2) rather than O(N).
83 SuffixTreeNode *Link = nullptr;
85 /// The length of the string formed by concatenating the edge labels from the
86 /// root to this node.
87 unsigned ConcatLen = 0;
89 /// Returns true if this node is a leaf.
90 bool isLeaf() const { return SuffixIdx != EmptyIdx; }
92 /// Returns true if this node is the root of its owning \p SuffixTree.
93 bool isRoot() const { return StartIdx == EmptyIdx; }
95 /// Return the number of elements in the substring associated with this node.
98 // Is it the root? If so, it's the empty string so return 0.
102 assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
104 // Size = the number of elements in the string.
105 // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
106 return *EndIdx - StartIdx + 1;
109 SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
110 : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {}
115 /// A data structure for fast substring queries.
117 /// Suffix trees represent the suffixes of their input strings in their leaves.
118 /// A suffix tree is a type of compressed trie structure where each node
119 /// represents an entire substring rather than a single character. Each leaf
120 /// of the tree is a suffix.
122 /// A suffix tree can be seen as a type of state machine where each state is a
123 /// substring of the full string. The tree is structured so that, for a string
124 /// of length N, there are exactly N leaves in the tree. This structure allows
125 /// us to quickly find repeated substrings of the input string.
127 /// In this implementation, a "string" is a vector of unsigned integers.
128 /// These integers may result from hashing some data type. A suffix tree can
129 /// contain 1 or many strings, which can then be queried as one large string.
131 /// The suffix tree is implemented using Ukkonen's algorithm for linear-time
132 /// suffix tree construction. Ukkonen's algorithm is explained in more detail
133 /// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
134 /// paper is available at
136 /// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
139 /// Each element is an integer representing an instruction in the module.
140 llvm::ArrayRef<unsigned> Str;
142 /// A repeated substring in the tree.
143 struct RepeatedSubstring {
144 /// The length of the string.
147 /// The start indices of each occurrence.
148 std::vector<unsigned> StartIndices;
152 /// Maintains each node in the tree.
153 llvm::SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
155 /// The root of the suffix tree.
157 /// The root represents the empty string. It is maintained by the
158 /// \p NodeAllocator like every other node in the tree.
159 SuffixTreeNode *Root = nullptr;
161 /// Maintains the end indices of the internal nodes in the tree.
163 /// Each internal node is guaranteed to never have its end index change
164 /// during the construction algorithm; however, leaves must be updated at
165 /// every step. Therefore, we need to store leaf end indices by reference
166 /// to avoid updating O(N) leaves at every step of construction. Thus,
167 /// every internal node must be allocated its own end index.
168 llvm::BumpPtrAllocator InternalEndIdxAllocator;
170 /// The end index of each leaf in the tree.
171 unsigned LeafEndIdx = -1;
173 /// Helper struct which keeps track of the next insertion point in
174 /// Ukkonen's algorithm.
176 /// The next node to insert at.
177 SuffixTreeNode *Node = nullptr;
179 /// The index of the first character in the substring currently being added.
180 unsigned Idx = EmptyIdx;
182 /// The length of the substring we have to add at the current step.
186 /// The point the next insertion will take place at in the
187 /// construction algorithm.
190 /// Allocate a leaf node and add it to the tree.
192 /// \param Parent The parent of this node.
193 /// \param StartIdx The start index of this node's associated string.
194 /// \param Edge The label on the edge leaving \p Parent to this node.
196 /// \returns A pointer to the allocated leaf node.
197 SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx,
200 /// Allocate an internal node and add it to the tree.
202 /// \param Parent The parent of this node. Only null when allocating the root.
203 /// \param StartIdx The start index of this node's associated string.
204 /// \param EndIdx The end index of this node's associated string.
205 /// \param Edge The label on the edge leaving \p Parent to this node.
207 /// \returns A pointer to the allocated internal node.
208 SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx,
209 unsigned EndIdx, unsigned Edge);
211 /// Set the suffix indices of the leaves to the start indices of their
212 /// respective suffixes.
213 void setSuffixIndices();
215 /// Construct the suffix tree for the prefix of the input ending at
218 /// Used to construct the full suffix tree iteratively. At the end of each
219 /// step, the constructed suffix tree is either a valid suffix tree, or a
220 /// suffix tree with implicit suffixes. At the end of the final step, the
221 /// suffix tree is a valid tree.
223 /// \param EndIdx The end index of the current prefix in the main string.
224 /// \param SuffixesToAdd The number of suffixes that must be added
225 /// to complete the suffix tree at the current phase.
227 /// \returns The number of suffixes that have not been added at the end of
229 unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
232 /// Construct a suffix tree from a sequence of unsigned integers.
234 /// \param Str The string to construct the suffix tree for.
235 SuffixTree(const std::vector<unsigned> &Str);
237 /// Iterator for finding all repeated substrings in the suffix tree.
238 struct RepeatedSubstringIterator {
240 /// The current node we're visiting.
241 SuffixTreeNode *N = nullptr;
243 /// The repeated substring associated with this node.
244 RepeatedSubstring RS;
246 /// The nodes left to visit.
247 std::vector<SuffixTreeNode *> ToVisit;
249 /// The minimum length of a repeated substring to find.
250 /// Since we're outlining, we want at least two instructions in the range.
251 /// FIXME: This may not be true for targets like X86 which support many
252 /// instruction lengths.
253 const unsigned MinLength = 2;
255 /// Move the iterator to the next repeated substring.
257 // Clear the current state. If we're at the end of the range, then this
258 // is the state we want to be in.
259 RS = RepeatedSubstring();
262 // Each leaf node represents a repeat of a string.
263 std::vector<SuffixTreeNode *> LeafChildren;
265 // Continue visiting nodes until we find one which repeats more than once.
266 while (!ToVisit.empty()) {
267 SuffixTreeNode *Curr = ToVisit.back();
269 LeafChildren.clear();
271 // Keep track of the length of the string associated with the node. If
272 // it's too short, we'll quit.
273 unsigned Length = Curr->ConcatLen;
275 // Iterate over each child, saving internal nodes for visiting, and
276 // leaf nodes in LeafChildren. Internal nodes represent individual
277 // strings, which may repeat.
278 for (auto &ChildPair : Curr->Children) {
279 // Save all of this node's children for processing.
280 if (!ChildPair.second->isLeaf())
281 ToVisit.push_back(ChildPair.second);
283 // It's not an internal node, so it must be a leaf. If we have a
284 // long enough string, then save the leaf children.
285 else if (Length >= MinLength)
286 LeafChildren.push_back(ChildPair.second);
289 // The root never represents a repeated substring. If we're looking at
290 // that, then skip it.
294 // Do we have any repeated substrings?
295 if (LeafChildren.size() >= 2) {
296 // Yes. Update the state to reflect this, and then bail out.
299 for (SuffixTreeNode *Leaf : LeafChildren)
300 RS.StartIndices.push_back(Leaf->SuffixIdx);
305 // At this point, either NewRS is an empty RepeatedSubstring, or it was
306 // set in the above loop. Similarly, N is either nullptr, or the node
307 // associated with NewRS.
311 /// Return the current repeated substring.
312 RepeatedSubstring &operator*() { return RS; }
314 RepeatedSubstringIterator &operator++() {
319 RepeatedSubstringIterator operator++(int I) {
320 RepeatedSubstringIterator It(*this);
325 bool operator==(const RepeatedSubstringIterator &Other) {
328 bool operator!=(const RepeatedSubstringIterator &Other) {
329 return !(*this == Other);
332 RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
333 // Do we have a non-null node?
335 // Yes. At the first step, we need to visit all of N's children.
336 // Note: This means that we visit N last.
337 ToVisit.push_back(N);
343 typedef RepeatedSubstringIterator iterator;
344 iterator begin() { return iterator(Root); }
345 iterator end() { return iterator(nullptr); }
350 #endif // LLVM_SUPPORT_SUFFIXTREE_H