1 //===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines nodes for use within a SuffixTree.
11 // Each node has either no children or at least two children, with the root
12 // being a exception in the empty tree.
14 // Children are represented as a map between unsigned integers and nodes. If
15 // a node N has a child M on unsigned integer k, then the mapping represented
16 // by N is a proper prefix of the mapping represented by M. Note that this,
17 // although similar to a trie is somewhat different: each node stores a full
18 // substring of the full mapping rather than a single character state.
20 // Each internal node contains a pointer to the internal node representing
21 // the same string, but with the first character chopped off. This is stored
22 // in \p Link. Each leaf node stores the start index of its respective
23 // suffix in \p SuffixIdx.
24 //===----------------------------------------------------------------------===//
26 #ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H
27 #define LLVM_SUPPORT_SUFFIXTREE_NODE_H
28 #include "llvm/ADT/DenseMap.h"
32 /// A node in a suffix tree which represents a substring or suffix.
33 struct SuffixTreeNode {
35 /// Represents an undefined index in the suffix tree.
36 static const unsigned EmptyIdx = -1;
37 enum class NodeKind { ST_Leaf, ST_Internal };
42 /// The start index of this node's substring in the main string.
43 unsigned StartIdx = EmptyIdx;
45 /// The length of the string formed by concatenating the edge labels from
46 /// the root to this node.
47 unsigned ConcatLen = 0;
50 // LLVM RTTI boilerplate.
51 NodeKind getKind() const { return Kind; }
53 /// \return the start index of this node's substring in the entire string.
54 unsigned getStartIdx() const;
56 /// \returns the end index of this node.
57 virtual unsigned getEndIdx() const = 0;
59 /// Advance this node's StartIdx by \p Inc.
60 void incrementStartIdx(unsigned Inc);
62 /// Set the length of the string from the root to this node to \p Len.
63 void setConcatLen(unsigned Len);
65 /// \returns the length of the string from the root to this node.
66 unsigned getConcatLen() const;
68 SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
69 : Kind(Kind), StartIdx(StartIdx) {}
70 virtual ~SuffixTreeNode() = default;
73 // A node with two or more children, or the root.
74 struct SuffixTreeInternalNode : SuffixTreeNode {
76 /// The end index of this node's substring in the main string.
78 /// Every leaf node must have its \p EndIdx incremented at the end of every
79 /// step in the construction algorithm. To avoid having to update O(N)
80 /// nodes individually at the end of every step, the end index is stored
82 unsigned EndIdx = EmptyIdx;
84 /// A pointer to the internal node representing the same sequence with the
85 /// first character chopped off.
87 /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
88 /// Ukkonen's algorithm does to achieve linear-time construction is
89 /// keep track of which node the next insert should be at. This makes each
90 /// insert O(1), and there are a total of O(N) inserts. The suffix link
91 /// helps with inserting children of internal nodes.
93 /// Say we add a child to an internal node with associated mapping S. The
94 /// next insertion must be at the node representing S - its first character.
95 /// This is given by the way that we iteratively build the tree in Ukkonen's
96 /// algorithm. The main idea is to look at the suffixes of each prefix in the
97 /// string, starting with the longest suffix of the prefix, and ending with
98 /// the shortest. Therefore, if we keep pointers between such nodes, we can
99 /// move to the next insertion point in O(1) time. If we don't, then we'd
100 /// have to query from the root, which takes O(N) time. This would make the
101 /// construction algorithm O(N^2) rather than O(N).
102 SuffixTreeInternalNode *Link = nullptr;
105 // LLVM RTTI boilerplate.
106 static bool classof(const SuffixTreeNode *N) {
107 return N->getKind() == NodeKind::ST_Internal;
110 /// \returns true if this node is the root of its owning \p SuffixTree.
113 /// \returns the end index of this node's substring in the entire string.
114 unsigned getEndIdx() const override;
116 /// Sets \p Link to \p L. Assumes \p L is not null.
117 void setLink(SuffixTreeInternalNode *L);
119 /// \returns the pointer to the Link node.
120 SuffixTreeInternalNode *getLink() const;
122 /// The children of this node.
124 /// A child existing on an unsigned integer implies that from the mapping
125 /// represented by the current node, there is a way to reach another
126 /// mapping by tacking that character on the end of the current string.
127 DenseMap<unsigned, SuffixTreeNode *> Children;
129 SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
130 SuffixTreeInternalNode *Link)
131 : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
134 virtual ~SuffixTreeInternalNode() = default;
137 // A node representing a suffix.
138 struct SuffixTreeLeafNode : SuffixTreeNode {
140 /// The start index of the suffix represented by this leaf.
141 unsigned SuffixIdx = EmptyIdx;
143 /// The end index of this node's substring in the main string.
145 /// Every leaf node must have its \p EndIdx incremented at the end of every
146 /// step in the construction algorithm. To avoid having to update O(N)
147 /// nodes individually at the end of every step, the end index is stored
149 unsigned *EndIdx = nullptr;
152 // LLVM RTTI boilerplate.
153 static bool classof(const SuffixTreeNode *N) {
154 return N->getKind() == NodeKind::ST_Leaf;
157 /// \returns the end index of this node's substring in the entire string.
158 unsigned getEndIdx() const override;
160 /// \returns the start index of the suffix represented by this leaf.
161 unsigned getSuffixIdx() const;
163 /// Sets the start index of the suffix represented by this leaf to \p Idx.
164 void setSuffixIdx(unsigned Idx);
165 SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
166 : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
168 virtual ~SuffixTreeLeafNode() = default;
171 #endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H