blob: 6aa9c2f203b8a5884bcf2cecd5949fe0eab4a720 [file] [log] [blame] [edit]
// This file began as an import from LLVM, and so it has the same license as
// LLVM, copied below together with the code.
//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// A data structure for fast substring queries.
//
// Suffix trees represent the suffixes of their input strings in their leaves.
// A suffix tree is a type of compressed trie structure where each node
// represents an entire substring rather than a single character. Each leaf
// of the tree is a suffix.
//
// A suffix tree can be seen as a type of state machine where each state is a
// substring of the full string. The tree is structured so that, for a string
// of length N, there are exactly N leaves in the tree. This structure allows
// us to quickly find repeated substrings of the input string.
//
// In this implementation, a "string" is a vector of unsigned integers.
// These integers may result from hashing some data type. A suffix tree can
// contain 1 or many strings, which can then be queried as one large string.
//
// The suffix tree is implemented using Ukkonen's algorithm for linear-time
// suffix tree construction. Ukkonen's algorithm is explained in more detail
// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
// paper is available at
//
// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
//===----------------------------------------------------------------------===//
#ifndef wasm_support_suffix_tree_h
#define wasm_support_suffix_tree_h
#include "llvm/Support/Allocator.h"
#include <cassert>
#include <cstddef>
#include <ostream>
#include <vector>
#include "support/suffix_tree_node.h"
using namespace llvm;
namespace wasm {
class SuffixTree {
public:
/// Each element is an integer representing an instruction in the module.
// TODO: This was an ArrayRef originally
std::vector<unsigned> Str;
/// A repeated substring in the tree.
struct RepeatedSubstring {
/// The length of the string.
unsigned Length;
/// The start indices of each occurrence.
std::vector<unsigned> StartIndices;
bool operator==(const RepeatedSubstring& other) const {
return Length == other.Length && StartIndices == other.StartIndices;
}
};
friend std::ostream& operator<<(std::ostream& os,
RepeatedSubstring substring) {
os << "SuffixTree::RepeatedSubstring{" << substring.Length
<< "u, (std::vector<unsigned>{";
for (unsigned idx = 0; idx < substring.StartIndices.size(); idx++) {
os << substring.StartIndices[idx];
if (idx != substring.StartIndices.size() - 1) {
os << ", ";
}
}
return os << "})}";
}
private:
/// Maintains internal nodes in the tree.
SpecificBumpPtrAllocator<SuffixTreeInternalNode> InternalNodeAllocator;
/// Maintains leaf nodes in the tree.
SpecificBumpPtrAllocator<SuffixTreeLeafNode> LeafNodeAllocator;
/// The root of the suffix tree.
///
/// The root represents the empty string. It is maintained by the
/// \p NodeAllocator like every other node in the tree.
SuffixTreeInternalNode* Root = nullptr;
/// The end index of each leaf in the tree.
unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx;
/// Helper struct which keeps track of the next insertion point in
/// Ukkonen's algorithm.
struct ActiveState {
/// The next node to insert at.
SuffixTreeInternalNode* Node = nullptr;
/// The index of the first character in the substring currently being added.
unsigned Idx = SuffixTreeNode::EmptyIdx;
/// The length of the substring we have to add at the current step.
unsigned Len = 0;
};
/// The point the next insertion will take place at in the
/// construction algorithm.
ActiveState Active;
/// Allocate a leaf node and add it to the tree.
///
/// \param Parent The parent of this node.
/// \param StartIdx The start index of this node's associated string.
/// \param Edge The label on the edge leaving \p Parent to this node.
///
/// \returns A pointer to the allocated leaf node.
SuffixTreeNode*
insertLeaf(SuffixTreeInternalNode& Parent, unsigned StartIdx, unsigned Edge);
/// Allocate an internal node and add it to the tree.
///
/// \param Parent The parent of this node. Only null when allocating the root.
/// \param StartIdx The start index of this node's associated string.
/// \param EndIdx The end index of this node's associated string.
/// \param Edge The label on the edge leaving \p Parent to this node.
///
/// \returns A pointer to the allocated internal node.
SuffixTreeInternalNode* insertInternalNode(SuffixTreeInternalNode* Parent,
unsigned StartIdx,
unsigned EndIdx,
unsigned Edge);
/// Allocate the root node and add it to the tree.
///
/// \returns A pointer to the root.
SuffixTreeInternalNode* insertRoot();
/// Set the suffix indices of the leaves to the start indices of their
/// respective suffixes.
void setSuffixIndices();
/// Construct the suffix tree for the prefix of the input ending at
/// \p EndIdx.
///
/// Used to construct the full suffix tree iteratively. At the end of each
/// step, the constructed suffix tree is either a valid suffix tree, or a
/// suffix tree with implicit suffixes. At the end of the final step, the
/// suffix tree is a valid tree.
///
/// \param EndIdx The end index of the current prefix in the main string.
/// \param SuffixesToAdd The number of suffixes that must be added
/// to complete the suffix tree at the current phase.
///
/// \returns The number of suffixes that have not been added at the end of
/// this step.
unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
public:
/// Construct a suffix tree from a sequence of unsigned integers.
///
/// \param Str The string to construct the suffix tree for.
SuffixTree(const std::vector<unsigned>& Str);
/// Iterator for finding all repeated substrings in the suffix tree.
struct RepeatedSubstringIterator {
private:
/// The current node we're visiting.
SuffixTreeNode* N = nullptr;
/// The repeated substring associated with this node.
RepeatedSubstring RS;
/// The nodes left to visit.
std::vector<SuffixTreeInternalNode*> InternalNodesToVisit;
/// The minimum length of a repeated substring to find.
/// Since we're outlining, we want at least two instructions in the range.
/// FIXME: This may not be true for targets like X86 which support many
/// instruction lengths.
const unsigned MinLength = 2;
/// Move the iterator to the next repeated substring.
void advance();
public:
using iterator_category = std::input_iterator_tag;
using value_type = RepeatedSubstring;
using difference_type = std::ptrdiff_t;
using pointer = const RepeatedSubstring*;
using reference = const RepeatedSubstring&;
/// Return the current repeated substring.
RepeatedSubstring& operator*() { return RS; }
RepeatedSubstring* operator->() { return &RS; }
RepeatedSubstringIterator& operator++() {
advance();
return *this;
}
RepeatedSubstringIterator operator++(int I) {
RepeatedSubstringIterator It(*this);
advance();
return It;
}
bool operator==(const RepeatedSubstringIterator& Other) const {
return N == Other.N;
}
bool operator!=(const RepeatedSubstringIterator& Other) const {
return !(*this == Other);
}
RepeatedSubstringIterator(SuffixTreeInternalNode* N) : N(N) {
// Do we have a non-null node?
if (!N) {
return;
}
// Yes. At the first step, we need to visit all of N's children.
// Note: This means that we visit N last.
InternalNodesToVisit.push_back(N);
advance();
}
};
typedef RepeatedSubstringIterator iterator;
iterator begin() { return iterator(Root); }
iterator end() { return iterator(nullptr); }
};
} // namespace wasm
#endif // wasm_support_suffix_tree_h