blob: ae45408a726383d38f25112c3d5b80a61fcffee8 [file] [log] [blame]
// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
#define BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
#include <stdint.h>
#include <limits>
#include <set>
#include <string>
#include <vector>
#include "base/base_export.h"
#include "base/check_op.h"
#include "base/substring_set_matcher/matcher_string_pattern.h"
namespace base {
// Class that store a set of string patterns and can find for a string S,
// which string patterns occur in S.
class BASE_EXPORT SubstringSetMatcher {
public:
SubstringSetMatcher();
SubstringSetMatcher(const SubstringSetMatcher&) = delete;
SubstringSetMatcher& operator=(const SubstringSetMatcher&) = delete;
~SubstringSetMatcher();
// Registers all |patterns|. Each pattern needs to have a unique ID and all
// pattern strings must be unique. Build() should be called exactly once
// (before it is called, the tree is empty).
//
// Complexity:
// Let n = number of patterns.
// Let S = sum of pattern lengths.
// Let k = range of char. Generally 256.
// Complexity = O(nlogn + S * logk)
// nlogn comes from sorting the patterns.
// log(k) comes from our usage of std::map to store edges.
//
// Returns true on success (may fail if e.g. if the tree gets too many nodes).
bool Build(const std::vector<MatcherStringPattern>& patterns);
bool Build(std::vector<const MatcherStringPattern*> patterns);
// Matches |text| against all registered MatcherStringPatterns. Stores the IDs
// of matching patterns in |matches|. |matches| is not cleared before adding
// to it.
// Complexity:
// Let t = length of |text|.
// Let k = range of char. Generally 256.
// Let z = number of matches returned.
// Complexity = O(t * logk + zlogz)
bool Match(const std::string& text,
std::set<MatcherStringPattern::ID>* matches) const;
// As Match(), except it returns immediately on the first match.
// This allows true/false matching to be done without any dynamic
// memory allocation.
// Complexity = O(t * logk)
bool AnyMatch(const std::string& text) const;
// Returns true if this object retains no allocated data.
bool IsEmpty() const { return is_empty_; }
// Returns the dynamically allocated memory usage in bytes. See
// base/trace_event/memory_usage_estimator.h for details.
size_t EstimateMemoryUsage() const;
private:
// Represents the index of the node within |tree_|. It is specifically
// uint32_t so that we can be sure it takes up 4 bytes when stored together
// with the 9-bit label (so 23 bits are allocated to the NodeID, even though
// it is exposed as uint32_t). If the computed size of |tree_| is
// larger than what can be stored within 23 bits, Build() will fail.
using NodeID = uint32_t;
// This is the maximum possible size of |tree_| and hence can't be a valid ID.
static constexpr NodeID kInvalidNodeID = (1u << 23) - 1;
static constexpr NodeID kRootID = 0;
// A node of an Aho Corasick Tree. See
// http://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/02/Small02.pdf
// to understand the algorithm.
//
// The algorithm is based on the idea of building a trie of all registered
// patterns. Each node of the tree is annotated with a set of pattern
// IDs that are used to report matches.
//
// The root of the trie represents an empty match. If we were looking whether
// any registered pattern matches a text at the beginning of the text (i.e.
// whether any pattern is a prefix of the text), we could just follow
// nodes in the trie according to the matching characters in the text.
// E.g., if text == "foobar", we would follow the trie from the root node
// to its child labeled 'f', from there to child 'o', etc. In this process we
// would report all pattern IDs associated with the trie nodes as matches.
//
// As we are not looking for all prefix matches but all substring matches,
// this algorithm would need to compare text.substr(0), text.substr(1), ...
// against the trie, which is in O(|text|^2).
//
// The Aho Corasick algorithm improves this runtime by using failure edges.
// In case we have found a partial match of length k in the text
// (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
// a node at depth k, but cannot find a match in the trie for character
// text[i + k] at depth k + 1, we follow a failure edge. This edge
// corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
// is a prefix of any registered pattern.
//
// If your brain thinks "Forget it, let's go shopping.", don't worry.
// Take a nap and read an introductory text on the Aho Corasick algorithm.
// It will make sense. Eventually.
// An edge internal to the tree. We pack the label (character we are
// matching on) and the destination node ID into 32 bits, to save memory.
// We also use these edges as a sort of generic key/value store for
// some special values that not all nodes will have; this also saves on
// memory over the otherwise obvious choice of having them as struct fields,
// as it means we do not to store them when they are not present.
struct AhoCorasickEdge {
// char (unsigned, so [0..255]), or a special label below.
uint32_t label : 9;
NodeID node_id : 23;
};
// Node index that failure edge leads to. The failure node corresponds to
// the node which represents the longest proper suffix (include empty
// string) of the string represented by this node. Not stored if it is
// equal to kRootID (since that is the most common value).
//
// NOTE: Assigning |root| as the failure edge for itself doesn't strictly
// abide by the definition of "proper" suffix. The proper suffix of an empty
// string should probably be defined as null, but we assign it to the |root|
// to simplify the code and have the invariant that the failure edge is always
// defined.
static constexpr uint32_t kFailureNodeLabel = 0x100;
static constexpr uint32_t kFirstSpecialLabel = kFailureNodeLabel;
// Node index that corresponds to the longest proper suffix (including empty
// suffix) of this node and which also represents the end of a pattern.
// Does not have to exist.
static constexpr uint32_t kOutputLinkLabel = 0x101;
// If present, this node represents the end of a pattern. It stores the ID of
// the corresponding pattern (ie., it is not really a NodeID, but a
// MatcherStringPattern::ID).
static constexpr uint32_t kMatchIDLabel = 0x102;
// Used for uninitialized label slots; used so that we do not have to test for
// them in other ways, since we know the data will be initialized and never
// match any other labels.
static constexpr uint32_t kEmptyLabel = 0x103;
// A node in the trie, packed tightly together so that it occupies 12 bytes
// (both on 32- and 64-bit platforms), but aligned to at least 4 (see the
// comment on edges_).
class alignas(AhoCorasickEdge) AhoCorasickNode {
public:
AhoCorasickNode();
~AhoCorasickNode();
AhoCorasickNode(AhoCorasickNode&& other);
AhoCorasickNode& operator=(AhoCorasickNode&& other);
NodeID GetEdge(uint32_t label) const {
if (edges_capacity_ != 0) {
return GetEdgeNoInline(label);
}
static_assert(kNumInlineEdges == 2, "Code below needs updating");
if (edges_.inline_edges[0].label == label) {
return edges_.inline_edges[0].node_id;
}
if (edges_.inline_edges[1].label == label) {
return edges_.inline_edges[1].node_id;
}
return kInvalidNodeID;
}
NodeID GetEdgeNoInline(uint32_t label) const;
void SetEdge(uint32_t label, NodeID node);
const AhoCorasickEdge* edges() const {
// NOTE: Returning edges_.inline_edges here is fine, because it's
// the first thing in the struct (see the comment on edges_).
DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(edges_.inline_edges) %
alignof(AhoCorasickEdge));
return edges_capacity_ == 0 ? edges_.inline_edges : edges_.edges;
}
NodeID failure() const {
// NOTE: Even if num_edges_ == 0, we are not doing anything
// undefined, as we will have room for at least two edges
// and empty edges are set to kEmptyLabel.
const AhoCorasickEdge& first_edge = *edges();
if (first_edge.label == kFailureNodeLabel) {
return first_edge.node_id;
} else {
return kRootID;
}
}
void SetFailure(NodeID failure);
void SetMatchID(MatcherStringPattern::ID id) {
DCHECK(!IsEndOfPattern());
DCHECK(id < kInvalidNodeID); // This is enforced by Build().
SetEdge(kMatchIDLabel, static_cast<NodeID>(id));
has_outputs_ = true;
}
// Returns true if this node corresponds to a pattern.
bool IsEndOfPattern() const {
if (!has_outputs_) {
// Fast reject.
return false;
}
return GetEdge(kMatchIDLabel) != kInvalidNodeID;
}
// Must only be called if |IsEndOfPattern| returns true for this node.
MatcherStringPattern::ID GetMatchID() const {
DCHECK(IsEndOfPattern());
return GetEdge(kMatchIDLabel);
}
void SetOutputLink(NodeID node) {
if (node != kInvalidNodeID) {
SetEdge(kOutputLinkLabel, node);
has_outputs_ = true;
}
}
NodeID output_link() const { return GetEdge(kOutputLinkLabel); }
size_t EstimateMemoryUsage() const;
size_t num_edges() const {
if (edges_capacity_ == 0) {
return kNumInlineEdges - num_free_edges_;
} else {
return edges_capacity_ - num_free_edges_;
}
}
bool has_outputs() const { return has_outputs_; }
private:
// Outgoing edges of current node, including failure edge and output links.
// Most nodes have only one or two (or even zero) edges, not the last
// because many of them are leaves. Thus, we make an optimization for this
// common case; instead of a pointer to an edge array on the heap, we can
// pack two edges inline where the pointer would otherwise be. This reduces
// memory usage dramatically, as well as saving us a cache-line fetch.
//
// Note that even though most nodes have fewer outgoing edges, most nodes
// that we actually traverse will have any of them. This apparent
// contradiction is because we tend to spend more of our time near the root
// of the trie, where it is wide. This means that another layout would be
// possible: If we wanted to, non-inline nodes could simply store an array
// of 259 (256 possible characters plus the three special label types)
// edges, indexed directly by label type. This would use 20–50% more RAM,
// but also increases the speed of lookups due to removing the search loop.
//
// The nodes are generally unordered; since we typically index text, even
// the root will rarely be more than 20–30 wide, and at that point, it's
// better to just do a linear search than a binary one (which fares poorly
// on branch predictors). However, a special case, we put kFailureNodeLabel
// in the first slot if it exists (ie., is not equal to kRootID), since we
// need to access that label during every single node we look at during
// traversal.
//
// NOTE: Keep this the first member in the struct, so that inline_edges gets
// 4-aligned (since the class is marked as such, despite being packed.
// Otherwise, edges() can return an unaligned pointer marked as aligned
// (the unalignedness gets lost).
static constexpr int kNumInlineEdges = 2;
union {
// Out-of-line edge storage, having room for edges_capacity_ elements.
// Note that due to __attribute__((packed)) below, this pointer may be
// unaligned on 64-bit platforms, causing slightly less efficient
// access to it in some cases.
AhoCorasickEdge* edges;
// Inline edge storage, used if edges_capacity_ == 0.
AhoCorasickEdge inline_edges[kNumInlineEdges];
} edges_;
// Whether we have an edge for kMatchIDLabel or kOutputLinkLabel,
// ie., hitting this node during traversal will create one or more
// matches. This is redundant, but since every single lookup during
// traversal needs this, it saves a few searches for us.
bool has_outputs_ = false;
// Number of unused left in edges_. Edges are always allocated from the
// beginning and never deleted; those after num_edges_ will be marked with
// kEmptyLabel (and have an undefined node_id). We store the number of
// free edges instead of the more common number of _used_ edges, to be
// sure that we are able to fit it in an uint8_t. num_edges() provides
// a useful abstraction over this.
uint8_t num_free_edges_ = kNumInlineEdges;
// How many edges we have allocated room for (can never be more than
// kEmptyLabel + 1). If equal to zero, we are not using heap storage,
// but instead are using inline_edges.
//
// If not equal to zero, will be a multiple of 4, so that we can use
// SIMD to accelerate looking for edges.
uint16_t edges_capacity_ = 0;
} __attribute__((packed));
using SubstringPatternVector = std::vector<const MatcherStringPattern*>;
// Given the set of patterns, compute how many nodes will the corresponding
// Aho-Corasick tree have. Note that |patterns| need to be sorted.
NodeID GetTreeSize(
const std::vector<const MatcherStringPattern*>& patterns) const;
void BuildAhoCorasickTree(const SubstringPatternVector& patterns);
// Inserts a path for |pattern->pattern()| into the tree and adds
// |pattern->id()| to the set of matches.
void InsertPatternIntoAhoCorasickTree(const MatcherStringPattern* pattern);
void CreateFailureAndOutputEdges();
// Adds all pattern IDs to |matches| which are a suffix of the string
// represented by |node|.
void AccumulateMatchesForNode(
const AhoCorasickNode* node,
std::set<MatcherStringPattern::ID>* matches) const;
// The nodes of a Aho-Corasick tree.
std::vector<AhoCorasickNode> tree_;
bool is_empty_ = true;
};
} // namespace base
#endif // BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_