blob: 90daceef4732060bcdb71ca45251360f7f3c8546 [file] [log] [blame]
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/url_matcher/substring_set_matcher.h"
#include <stddef.h>
#include <algorithm>
#include <queue>
#include "base/containers/queue.h"
#include "base/logging.h"
#include "base/numerics/checked_math.h"
#include "base/stl_util.h"
#include "base/trace_event/memory_usage_estimator.h"
namespace url_matcher {
namespace {
// Compare StringPattern instances based on their string patterns.
bool ComparePatterns(const StringPattern* a, const StringPattern* b) {
return a->pattern() < b->pattern();
}
std::vector<const StringPattern*> GetVectorOfPointers(
const std::vector<StringPattern>& patterns) {
std::vector<const StringPattern*> pattern_pointers;
pattern_pointers.reserve(patterns.size());
for (const StringPattern& pattern : patterns)
pattern_pointers.push_back(&pattern);
return pattern_pointers;
}
} // namespace
SubstringSetMatcher::SubstringSetMatcher(
const std::vector<StringPattern>& patterns)
: SubstringSetMatcher(GetVectorOfPointers(patterns)) {}
SubstringSetMatcher::SubstringSetMatcher(
std::vector<const StringPattern*> patterns) {
// Ensure there are no duplicate IDs and all pattern strings are distinct.
#if DCHECK_IS_ON()
{
std::set<StringPattern::ID> ids;
std::set<std::string> pattern_strings;
for (const StringPattern* pattern : patterns) {
CHECK(!base::Contains(ids, pattern->id()));
CHECK(!base::Contains(pattern_strings, pattern->pattern()));
ids.insert(pattern->id());
pattern_strings.insert(pattern->pattern());
}
}
#endif
// Compute the total number of tree nodes needed.
std::sort(patterns.begin(), patterns.end(), ComparePatterns);
tree_.reserve(GetTreeSize(patterns));
BuildAhoCorasickTree(patterns);
// Sanity check that no new allocations happened in the tree and our computed
// size was correct.
DCHECK_EQ(tree_.size(), static_cast<size_t>(GetTreeSize(patterns)));
is_empty_ = patterns.empty() && tree_.size() == 1u;
}
SubstringSetMatcher::~SubstringSetMatcher() = default;
bool SubstringSetMatcher::Match(const std::string& text,
std::set<StringPattern::ID>* matches) const {
const size_t old_number_of_matches = matches->size();
// Handle patterns matching the empty string.
const AhoCorasickNode* const root = &tree_[kRootID];
AccumulateMatchesForNode(root, matches);
const AhoCorasickNode* current_node = root;
for (const char c : text) {
NodeID child = current_node->GetEdge(c);
// If the child not can't be found, progressively iterate over the longest
// proper suffix of the string represented by the current node. In a sense
// we are pruning prefixes from the text.
while (child == kInvalidNodeID && current_node != root) {
current_node = &tree_[current_node->failure()];
child = current_node->GetEdge(c);
}
if (child != kInvalidNodeID) {
// The string represented by |child| is the longest possible suffix of the
// current position of |text| in the trie.
current_node = &tree_[child];
AccumulateMatchesForNode(current_node, matches);
} else {
// The empty string is the longest possible suffix of the current position
// of |text| in the trie.
DCHECK_EQ(root, current_node);
}
}
return old_number_of_matches != matches->size();
}
size_t SubstringSetMatcher::EstimateMemoryUsage() const {
return base::trace_event::EstimateMemoryUsage(tree_);
}
// static
constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kInvalidNodeID;
constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kRootID;
SubstringSetMatcher::NodeID SubstringSetMatcher::GetTreeSize(
const std::vector<const StringPattern*>& patterns) const {
DCHECK(std::is_sorted(patterns.begin(), patterns.end(), ComparePatterns));
base::CheckedNumeric<NodeID> result = 1u; // 1 for the root node.
if (patterns.empty())
return result.ValueOrDie();
auto last = patterns.begin();
auto current = last + 1;
// For the first pattern, each letter is a label of an edge to a new node.
result += (*last)->pattern().size();
// For the subsequent patterns, only count the edges which were not counted
// yet. For this it suffices to test against the previous pattern, because the
// patterns are sorted.
for (; current != patterns.end(); ++last, ++current) {
const std::string& last_pattern = (*last)->pattern();
const std::string& current_pattern = (*current)->pattern();
size_t prefix_bound = std::min(last_pattern.size(), current_pattern.size());
size_t common_prefix = 0;
while (common_prefix < prefix_bound &&
last_pattern[common_prefix] == current_pattern[common_prefix]) {
++common_prefix;
}
result -= common_prefix;
result += current_pattern.size();
}
return result.ValueOrDie();
}
void SubstringSetMatcher::BuildAhoCorasickTree(
const SubstringPatternVector& patterns) {
DCHECK(tree_.empty());
// Initialize root node of tree.
tree_.emplace_back();
// Build the initial trie for all the patterns.
for (const StringPattern* pattern : patterns)
InsertPatternIntoAhoCorasickTree(pattern);
// Trie creation is complete and edges are finalized. Shrink to fit each edge
// map to save on memory.
for (AhoCorasickNode& node : tree_)
node.ShrinkEdges();
CreateFailureAndOutputEdges();
}
void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
const StringPattern* pattern) {
const std::string& text = pattern->pattern();
const std::string::const_iterator text_end = text.end();
// Iterators on the tree and the text.
AhoCorasickNode* current_node = &tree_[kRootID];
std::string::const_iterator i = text.begin();
// Follow existing paths for as long as possible.
while (i != text_end) {
NodeID child = current_node->GetEdge(*i);
if (child == kInvalidNodeID)
break;
current_node = &tree_[child];
++i;
}
// Create new nodes if necessary.
while (i != text_end) {
tree_.emplace_back();
current_node->SetEdge(*i, tree_.size() - 1);
current_node = &tree_.back();
++i;
}
// Register match.
current_node->SetMatchID(pattern->id());
}
void SubstringSetMatcher::CreateFailureAndOutputEdges() {
base::queue<AhoCorasickNode*> queue;
// Initialize the failure edges for |root| and its children.
AhoCorasickNode* const root = &tree_[0];
// Assigning |root| as the failure edge for itself doesn't strictly abide by
// the definition of "proper" suffix. The proper suffix of an empty string
// should probably be defined as null, but we assign it to the |root| to
// simplify the code and have the invariant that the failure edge is always
// defined.
root->SetFailure(kRootID);
root->SetOutputLink(kInvalidNodeID);
NodeID root_output_link = root->IsEndOfPattern() ? kRootID : kInvalidNodeID;
for (const auto& edge : root->edges()) {
AhoCorasickNode* child = &tree_[edge.second];
child->SetFailure(kRootID);
child->SetOutputLink(root_output_link);
queue.push(child);
}
// Do a breadth first search over the trie to create failure edges. We
// maintain the invariant that any node in |queue| has had its |failure_| and
// |output_link_| edge already initialized.
while (!queue.empty()) {
AhoCorasickNode* current_node = queue.front();
queue.pop();
// Compute the failure and output edges of children using the failure edges
// of the current node.
for (const auto& edge : current_node->edges()) {
const char edge_label = edge.first;
AhoCorasickNode* child = &tree_[edge.second];
const AhoCorasickNode* failure_candidate_parent =
&tree_[current_node->failure()];
NodeID failure_candidate_id =
failure_candidate_parent->GetEdge(edge_label);
while (failure_candidate_id == kInvalidNodeID &&
failure_candidate_parent != root) {
failure_candidate_parent = &tree_[failure_candidate_parent->failure()];
failure_candidate_id = failure_candidate_parent->GetEdge(edge_label);
}
if (failure_candidate_id == kInvalidNodeID) {
DCHECK_EQ(root, failure_candidate_parent);
// |failure_candidate| is invalid and we can't proceed further since we
// have reached the root. Hence the longest proper suffix of this string
// represented by this node is the empty string (represented by root).
failure_candidate_id = kRootID;
}
child->SetFailure(failure_candidate_id);
const AhoCorasickNode* failure_candidate = &tree_[failure_candidate_id];
// Now |failure_candidate| is |child|'s longest possible proper suffix in
// the trie. We also know that since we are doing a breadth first search,
// we would have established |failure_candidate|'s output link by now.
// Hence we can define |child|'s output link as follows:
child->SetOutputLink(failure_candidate->IsEndOfPattern()
? failure_candidate_id
: failure_candidate->output_link());
queue.push(child);
}
}
}
void SubstringSetMatcher::AccumulateMatchesForNode(
const AhoCorasickNode* node,
std::set<StringPattern::ID>* matches) const {
DCHECK(matches);
if (node->IsEndOfPattern())
matches->insert(node->GetMatchID());
NodeID node_id = node->output_link();
while (node_id != kInvalidNodeID) {
node = &tree_[node_id];
matches->insert(node->GetMatchID());
node_id = node->output_link();
}
}
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() = default;
SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() = default;
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) =
default;
SubstringSetMatcher::AhoCorasickNode& SubstringSetMatcher::AhoCorasickNode::
operator=(AhoCorasickNode&& other) = default;
SubstringSetMatcher::NodeID SubstringSetMatcher::AhoCorasickNode::GetEdge(
char c) const {
auto i = edges_.find(c);
return i == edges_.end() ? kInvalidNodeID : i->second;
}
void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, NodeID node) {
DCHECK_NE(kInvalidNodeID, node);
edges_[c] = node;
}
void SubstringSetMatcher::AhoCorasickNode::SetFailure(NodeID node) {
DCHECK_NE(kInvalidNodeID, node);
failure_ = node;
}
size_t SubstringSetMatcher::AhoCorasickNode::EstimateMemoryUsage() const {
return base::trace_event::EstimateMemoryUsage(edges_);
}
} // namespace url_matcher