|  | // Copyright 2013 The Chromium Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | #include "components/url_matcher/substring_set_matcher.h" | 
|  |  | 
|  | #include <stddef.h> | 
|  |  | 
|  | #include <algorithm> | 
|  | #include <queue> | 
|  |  | 
|  | #include "base/logging.h" | 
|  | #include "base/stl_util.h" | 
|  |  | 
|  | namespace url_matcher { | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | // Compare StringPattern instances based on their string patterns. | 
|  | bool ComparePatterns(const StringPattern* a, const StringPattern* b) { | 
|  | return a->pattern() < b->pattern(); | 
|  | } | 
|  |  | 
|  | // Given the set of patterns, compute how many nodes will the corresponding | 
|  | // Aho-Corasick tree have. Note that |patterns| need to be sorted. | 
|  | uint32_t TreeSize(const std::vector<const StringPattern*>& patterns) { | 
|  | uint32_t result = 1u;  // 1 for the root node. | 
|  | if (patterns.empty()) | 
|  | return result; | 
|  |  | 
|  | std::vector<const StringPattern*>::const_iterator last = patterns.begin(); | 
|  | std::vector<const StringPattern*>::const_iterator current = last + 1; | 
|  | // For the first pattern, each letter is a label of an edge to a new node. | 
|  | result += (*last)->pattern().size(); | 
|  |  | 
|  | // For the subsequent patterns, only count the edges which were not counted | 
|  | // yet. For this it suffices to test against the previous pattern, because the | 
|  | // patterns are sorted. | 
|  | for (; current != patterns.end(); ++last, ++current) { | 
|  | const std::string& last_pattern = (*last)->pattern(); | 
|  | const std::string& current_pattern = (*current)->pattern(); | 
|  | const uint32_t prefix_bound = | 
|  | std::min(last_pattern.size(), current_pattern.size()); | 
|  |  | 
|  | uint32_t common_prefix = 0; | 
|  | while (common_prefix < prefix_bound && | 
|  | last_pattern[common_prefix] == current_pattern[common_prefix]) | 
|  | ++common_prefix; | 
|  | result += current_pattern.size() - common_prefix; | 
|  | } | 
|  | return result; | 
|  | } | 
|  |  | 
|  | }  // namespace | 
|  |  | 
|  | // | 
|  | // SubstringSetMatcher | 
|  | // | 
|  |  | 
|  | SubstringSetMatcher::SubstringSetMatcher() { | 
|  | RebuildAhoCorasickTree(SubstringPatternVector()); | 
|  | } | 
|  |  | 
|  | SubstringSetMatcher::~SubstringSetMatcher() {} | 
|  |  | 
|  | void SubstringSetMatcher::RegisterPatterns( | 
|  | const std::vector<const StringPattern*>& patterns) { | 
|  | RegisterAndUnregisterPatterns(patterns, | 
|  | std::vector<const StringPattern*>()); | 
|  | } | 
|  |  | 
|  | void SubstringSetMatcher::UnregisterPatterns( | 
|  | const std::vector<const StringPattern*>& patterns) { | 
|  | RegisterAndUnregisterPatterns(std::vector<const StringPattern*>(), | 
|  | patterns); | 
|  | } | 
|  |  | 
|  | void SubstringSetMatcher::RegisterAndUnregisterPatterns( | 
|  | const std::vector<const StringPattern*>& to_register, | 
|  | const std::vector<const StringPattern*>& to_unregister) { | 
|  | // Register patterns. | 
|  | for (std::vector<const StringPattern*>::const_iterator i = | 
|  | to_register.begin(); i != to_register.end(); ++i) { | 
|  | DCHECK(patterns_.find((*i)->id()) == patterns_.end()); | 
|  | patterns_[(*i)->id()] = *i; | 
|  | } | 
|  |  | 
|  | // Unregister patterns | 
|  | for (std::vector<const StringPattern*>::const_iterator i = | 
|  | to_unregister.begin(); i != to_unregister.end(); ++i) { | 
|  | patterns_.erase((*i)->id()); | 
|  | } | 
|  |  | 
|  | // Now we compute the total number of tree nodes needed. | 
|  | SubstringPatternVector sorted_patterns; | 
|  | sorted_patterns.resize(patterns_.size()); | 
|  |  | 
|  | size_t next = 0; | 
|  | for (SubstringPatternMap::const_iterator i = patterns_.begin(); | 
|  | i != patterns_.end(); | 
|  | ++i, ++next) { | 
|  | sorted_patterns[next] = i->second; | 
|  | } | 
|  |  | 
|  | std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); | 
|  | tree_.reserve(TreeSize(sorted_patterns)); | 
|  |  | 
|  | RebuildAhoCorasickTree(sorted_patterns); | 
|  | } | 
|  |  | 
|  | bool SubstringSetMatcher::Match(const std::string& text, | 
|  | std::set<StringPattern::ID>* matches) const { | 
|  | const size_t old_number_of_matches = matches->size(); | 
|  |  | 
|  | // Handle patterns matching the empty string. | 
|  | matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); | 
|  |  | 
|  | uint32_t current_node = 0; | 
|  | for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { | 
|  | uint32_t edge_from_current = tree_[current_node].GetEdge(*i); | 
|  | while (edge_from_current == AhoCorasickNode::kNoSuchEdge && | 
|  | current_node != 0) { | 
|  | current_node = tree_[current_node].failure(); | 
|  | edge_from_current = tree_[current_node].GetEdge(*i); | 
|  | } | 
|  | if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { | 
|  | current_node = edge_from_current; | 
|  | matches->insert(tree_[current_node].matches().begin(), | 
|  | tree_[current_node].matches().end()); | 
|  | } else { | 
|  | DCHECK_EQ(0u, current_node); | 
|  | } | 
|  | } | 
|  |  | 
|  | return old_number_of_matches != matches->size(); | 
|  | } | 
|  |  | 
|  | bool SubstringSetMatcher::IsEmpty() const { | 
|  | // An empty tree consists of only the root node. | 
|  | return patterns_.empty() && tree_.size() == 1u; | 
|  | } | 
|  |  | 
|  | void SubstringSetMatcher::RebuildAhoCorasickTree( | 
|  | const SubstringPatternVector& sorted_patterns) { | 
|  | tree_.clear(); | 
|  |  | 
|  | // Initialize root note of tree. | 
|  | AhoCorasickNode root; | 
|  | root.set_failure(0); | 
|  | tree_.push_back(root); | 
|  |  | 
|  | // Insert all patterns. | 
|  | for (SubstringPatternVector::const_iterator i = sorted_patterns.begin(); | 
|  | i != sorted_patterns.end(); | 
|  | ++i) { | 
|  | InsertPatternIntoAhoCorasickTree(*i); | 
|  | } | 
|  |  | 
|  | CreateFailureEdges(); | 
|  | } | 
|  |  | 
|  | void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( | 
|  | const StringPattern* pattern) { | 
|  | const std::string& text = pattern->pattern(); | 
|  | const std::string::const_iterator text_end = text.end(); | 
|  |  | 
|  | // Iterators on the tree and the text. | 
|  | uint32_t current_node = 0; | 
|  | std::string::const_iterator i = text.begin(); | 
|  |  | 
|  | // Follow existing paths for as long as possible. | 
|  | while (i != text_end) { | 
|  | uint32_t edge_from_current = tree_[current_node].GetEdge(*i); | 
|  | if (edge_from_current == AhoCorasickNode::kNoSuchEdge) | 
|  | break; | 
|  | current_node = edge_from_current; | 
|  | ++i; | 
|  | } | 
|  |  | 
|  | // Create new nodes if necessary. | 
|  | while (i != text_end) { | 
|  | tree_.push_back(AhoCorasickNode()); | 
|  | tree_[current_node].SetEdge(*i, tree_.size() - 1); | 
|  | current_node = tree_.size() - 1; | 
|  | ++i; | 
|  | } | 
|  |  | 
|  | // Register match. | 
|  | tree_[current_node].AddMatch(pattern->id()); | 
|  | } | 
|  |  | 
|  | void SubstringSetMatcher::CreateFailureEdges() { | 
|  | typedef AhoCorasickNode::Edges Edges; | 
|  |  | 
|  | std::queue<uint32_t> queue; | 
|  |  | 
|  | AhoCorasickNode& root = tree_[0]; | 
|  | root.set_failure(0); | 
|  | const Edges& root_edges = root.edges(); | 
|  | for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); | 
|  | ++e) { | 
|  | const uint32_t& leads_to = e->second; | 
|  | tree_[leads_to].set_failure(0); | 
|  | queue.push(leads_to); | 
|  | } | 
|  |  | 
|  | while (!queue.empty()) { | 
|  | AhoCorasickNode& current_node = tree_[queue.front()]; | 
|  | queue.pop(); | 
|  | for (Edges::const_iterator e = current_node.edges().begin(); | 
|  | e != current_node.edges().end(); ++e) { | 
|  | const char& edge_label = e->first; | 
|  | const uint32_t& leads_to = e->second; | 
|  | queue.push(leads_to); | 
|  |  | 
|  | uint32_t failure = current_node.failure(); | 
|  | uint32_t edge_from_failure = tree_[failure].GetEdge(edge_label); | 
|  | while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && | 
|  | failure != 0) { | 
|  | failure = tree_[failure].failure(); | 
|  | edge_from_failure = tree_[failure].GetEdge(edge_label); | 
|  | } | 
|  |  | 
|  | const uint32_t follow_in_case_of_failure = | 
|  | edge_from_failure != AhoCorasickNode::kNoSuchEdge ? edge_from_failure | 
|  | : 0; | 
|  | tree_[leads_to].set_failure(follow_in_case_of_failure); | 
|  | tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | const uint32_t SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = 0xFFFFFFFF; | 
|  |  | 
|  | SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() | 
|  | : failure_(kNoSuchEdge) {} | 
|  |  | 
|  | SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} | 
|  |  | 
|  | SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( | 
|  | const SubstringSetMatcher::AhoCorasickNode& other) | 
|  | : edges_(other.edges_), | 
|  | failure_(other.failure_), | 
|  | matches_(other.matches_) {} | 
|  |  | 
|  | SubstringSetMatcher::AhoCorasickNode& | 
|  | SubstringSetMatcher::AhoCorasickNode::operator=( | 
|  | const SubstringSetMatcher::AhoCorasickNode& other) { | 
|  | edges_ = other.edges_; | 
|  | failure_ = other.failure_; | 
|  | matches_ = other.matches_; | 
|  | return *this; | 
|  | } | 
|  |  | 
|  | uint32_t SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { | 
|  | Edges::const_iterator i = edges_.find(c); | 
|  | return i == edges_.end() ? kNoSuchEdge : i->second; | 
|  | } | 
|  |  | 
|  | void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32_t node) { | 
|  | edges_[c] = node; | 
|  | } | 
|  |  | 
|  | void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { | 
|  | matches_.insert(id); | 
|  | } | 
|  |  | 
|  | void SubstringSetMatcher::AhoCorasickNode::AddMatches( | 
|  | const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { | 
|  | matches_.insert(matches.begin(), matches.end()); | 
|  | } | 
|  |  | 
|  | }  // namespace url_matcher |