// components/url_matcher/url_matcher.cc - chromium/src - Git at Google

 // Copyright 2013 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "components/url_matcher/url_matcher.h"

 #include <algorithm>
 #include <iterator>
 #include <utility>

 #include "base/logging.h"
 #include "base/stl_util.h"
 #include "url/gurl.h"
 #include "url/url_canon.h"

 namespace url_matcher {

 // This set of classes implements a mapping of URL Component Patterns, such as
 // host_prefix, host_suffix, host_equals, etc., to StringPatterns for use in
 // substring comparisons.
 //
 // The idea of this mapping is to reduce the problem of comparing many
 // URL Component Patterns against one URL to the problem of searching many
 // substrings in one string:
 //
 // ----------------------                    -----------------
 // | URL Query operator | ----translate----> | StringPattern |
 // ----------------------                    -----------------
 //                                                   ^
 //                                                   |
 //                                                compare
 //                                                   |
 //                                                   v
 // ----------------------                    -----------------
 // | URL to compare     |                    |               |
 // | to all URL Query   | ----translate----> | String        |
 // | operators          |                    |               |
 // ----------------------                    -----------------
 //
 // The reason for this problem reduction is that there are efficient algorithms
 // for searching many substrings in one string (see Aho-Corasick algorithm).
 //
 // Additionally, some of the same pieces are reused to implement regular
 // expression comparisons. The FilteredRE2 implementation for matching many
 // regular expressions against one string uses prefiltering, in which a set
 // of substrings (derived from the regexes) are first searched for, to reduce
 // the number of regular expressions to test; the prefiltering step also
 // uses Aho-Corasick.
 //
 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
 // ==========================================================
 //
 // For searches in this class, we normalize URLs as follows:
 //
 // Step 1:
 // Remove scheme, port and segment from URL:
 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
 //    www.example.com/index.html?search=foo
 //
 // We remove the scheme and port number because they can be checked later
 // in a secondary filter step. We remove the segment (the #... part) because
 // this is not guaranteed to be ASCII-7 encoded.
 //
 // Step 2:
 // Translate URL to String and add the following position markers:
 // - BU = Beginning of URL
 // - ED = End of Domain
 // - EP = End of Path
 // - EU = End of URL
 // Furthermore, the hostname is canonicalized to start with a ".".
 //
 // Position markers are represented as characters >127, which are therefore
 // guaranteed not to be part of the ASCII-7 encoded URL character set.
 //
 // -> www.example.com/index.html?search=foo becomes
 // BU .www.example.com ED /index.html EP ?search=foo EU
 //
 // -> www.example.com/index.html becomes
 // BU .www.example.com ED /index.html EP EU
 //
 // Step 3:
 // Translate URL Component Patterns as follows:
 //
 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
 // -> host_prefix("www.example") = BU .www.example
 //
 // host_suffix(suffix) = suffix ED
 // -> host_suffix("example.com") = example.com ED
 // -> host_suffix(".example.com") = .example.com ED
 //
 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
 // -> host_equals("www.example.com") = BU .www.example.com ED
 //
 // Similarly for the path and query components
 // ({path, query}_{prefix, suffix, equals}).
 //
 // With this, we can search the StringPatterns in the normalized URL.
 //
 //
 // Case 2: url_{prefix,suffix,equals,contains} searches.
 // =====================================================
 //
 // Step 1: as above, except that
 // - the scheme is not removed
 // - the port is not removed if it is specified and does not match the default
 //   port for the given scheme.
 //
 // Step 2:
 // Translate URL to String and add the following position markers:
 // - BU = Beginning of URL
 // - EU = End of URL
 //
 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
 // BU http://www.example.com:8080/index.html?search=foo EU
 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
 // BU http://www.example.com/index.html?search=foo EU
 //
 // url_prefix(prefix) = BU prefix
 // -> url_prefix("http://www.example") = BU http://www.example
 //
 // url_contains(substring) = substring
 // -> url_contains("index") = index
 //
 //
 // Case 3: {host,path,query}_contains searches.
 // ============================================
 //
 // These kinds of searches are not supported directly but can be derived
 // by a combination of a url_contains() query followed by an explicit test:
 //
 // host_contains(str) = url_contains(str) followed by test whether str occurs
 //   in host component of original URL.
 // -> host_contains("example.co") = example.co
 //    followed by gurl.host().find("example.co");
 //
 // [similarly for path_contains and query_contains].
 //
 //
 // Regular expression matching (url_matches searches)
 // ==================================================
 //
 // This class also supports matching regular expressions (RE2 syntax)
 // against full URLs, which are transformed as in case 2.

 namespace {

 // Returns true iff |criterion| is URLMatcherCondition::URL_MATCHES.
 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
   const bool is_url_matches = (URLMatcherCondition::URL_MATCHES == criterion);
   return is_url_matches;
 }

 // Returns true iff |criterion| is
 // URLMatcherCondition::ORIGIN_AND_PATH_MATCHES.
 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
   const bool is_origin_and_path_matches =
       (URLMatcherCondition::ORIGIN_AND_PATH_MATCHES == criterion);
   return is_origin_and_path_matches;
 }

 }  // namespace

 //
 // URLMatcherCondition
 //

 // Default-constructs a condition with the HOST_PREFIX criterion and no
 // associated StringPattern.
 URLMatcherCondition::URLMatcherCondition()
     : criterion_(HOST_PREFIX), string_pattern_(NULL) {
 }

 // Nothing to release here: this destructor performs no explicit cleanup.
 URLMatcherCondition::~URLMatcherCondition() = default;

 // Builds a condition that pairs |criterion| with |string_pattern|. Only the
 // pointer is stored; the pattern itself is not copied.
 URLMatcherCondition::URLMatcherCondition(Criterion criterion,
                                          const StringPattern* string_pattern)
     : criterion_(criterion), string_pattern_(string_pattern) {
 }

 // Copy constructor: duplicates the criterion and the (shared) pattern pointer
 // of |rhs|.
 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
     : criterion_(rhs.criterion_), string_pattern_(rhs.string_pattern_) {
 }

 // Copy assignment: takes over the criterion and the pattern pointer of |rhs|
 // and returns a reference to this object.
 URLMatcherCondition& URLMatcherCondition::operator=(
     const URLMatcherCondition& rhs) {
   // The two fields are independent, so the assignment order is irrelevant.
   string_pattern_ = rhs.string_pattern_;
   criterion_ = rhs.criterion_;
   return *this;
 }

 // Orders conditions by criterion first. For equal criteria, a condition
 // without a pattern sorts before one with a pattern, and two conditions with
 // patterns are ordered by StringPattern::operator<.
 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
   if (criterion_ != rhs.criterion_)
     return criterion_ < rhs.criterion_;

   const bool lhs_has_pattern = (string_pattern_ != NULL);
   const bool rhs_has_pattern = (rhs.string_pattern_ != NULL);
   if (lhs_has_pattern && rhs_has_pattern)
     return *string_pattern_ < *rhs.string_pattern_;

   // Only "no pattern on the left, pattern on the right" counts as less-than.
   return !lhs_has_pattern && rhs_has_pattern;
 }

 // Returns true for the criteria whose SubstringMatcher needs to be executed
 // on the GURL canonicalized with
 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches: the
 // {HOST,PATH,QUERY}_CONTAINS and URL_{PREFIX,SUFFIX,CONTAINS,EQUALS} criteria.
 // Every other criterion yields false.
 bool URLMatcherCondition::IsFullURLCondition() const {
   const bool is_component_contains = criterion_ == HOST_CONTAINS ||
                                      criterion_ == PATH_CONTAINS ||
                                      criterion_ == QUERY_CONTAINS;
   const bool is_url_criterion = criterion_ == URL_PREFIX ||
                                 criterion_ == URL_SUFFIX ||
                                 criterion_ == URL_CONTAINS ||
                                 criterion_ == URL_EQUALS;
   return is_component_contains || is_url_criterion;
 }

 // Returns whether this condition's criterion is a regex criterion, as decided
 // by IsRegexCriterion().
 bool URLMatcherCondition::IsRegexCondition() const {
   const bool is_regex = IsRegexCriterion(criterion_);
   return is_regex;
 }

 // Returns whether this condition's criterion is an origin-and-path regex
 // criterion, as decided by IsOriginAndPathRegexCriterion().
 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
   const bool is_origin_and_path_regex =
       IsOriginAndPathRegexCriterion(criterion_);
   return is_origin_and_path_regex;
 }

 // Returns true if this condition is fulfilled for |url|.
 //
 // |matching_patterns| holds the IDs of the StringPatterns that were found;
 // the condition fails immediately if its own pattern is not among them.
 // For HOST_CONTAINS, PATH_CONTAINS and QUERY_CONTAINS the substring match was
 // performed on the raw URL, so the pattern is additionally searched for in
 // the matching component of |url|.
 bool URLMatcherCondition::IsMatch(
     const std::set<StringPattern::ID>& matching_patterns,
     const GURL& url) const {
   DCHECK(string_pattern_);
   if (!ContainsKey(matching_patterns, string_pattern_->id()))
     return false;

   if (criterion_ == HOST_CONTAINS) {
     return std::string::npos != url.host().find(string_pattern_->pattern());
   }
   if (criterion_ == PATH_CONTAINS) {
     return std::string::npos != url.path().find(string_pattern_->pattern());
   }
   if (criterion_ == QUERY_CONTAINS) {
     return std::string::npos != url.query().find(string_pattern_->pattern());
   }
   // All other criteria need no component-specific verification.
   return true;
 }

 //
 // URLMatcherConditionFactory
 //

 namespace {
 // Position markers inserted into canonicalized URLs. Each one is a
 // NUL-terminated, single-character string whose character lies outside 7-bit
 // ASCII, so it is not part of the 7-bit ASCII used in GURLs.

 // Marks the beginning of the URL.
 const char kBeginningOfURL[] = {static_cast<char>(-1), '\0'};
 // Marks the end of the domain (host) part.
 const char kEndOfDomain[] = {static_cast<char>(-2), '\0'};
 // Marks the end of the path part.
 const char kEndOfPath[] = {static_cast<char>(-3), '\0'};
 // Replaces the separator between query components.
 const char kQueryComponentDelimiter[] = {static_cast<char>(-4), '\0'};
 // Marks the end of the URL.
 const char kEndOfURL[] = {static_cast<char>(-5), '\0'};

 // The delimiter for query parameters as it appears in a URL.
 const char kQuerySeparator = '&';
 }  // namespace

 // Starts with an ID counter of zero; CreateCondition() hands out IDs from it.
 URLMatcherConditionFactory::URLMatcherConditionFactory()
     : id_counter_(0) {
 }

 // The factory owns the StringPatterns it allocated in CreateCondition(), so
 // every element of the three singleton sets is deleted here.
 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
   // Substring patterns.
   STLDeleteElements(&substring_pattern_singletons_);
   // Full-URL regex patterns.
   STLDeleteElements(&regex_pattern_singletons_);
   // Origin-and-path regex patterns.
   STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
 }

 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
     const GURL& url) const {
   return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
          url.path() + kEndOfPath +
          (url.has_query() ? CanonicalizeQuery(url.query(), true, true)
                           : std::string()) +
          kEndOfURL;
 }

 // host_prefix(prefix) = BU + prefix, where the prefix is given a leading "."
 // if it lacks one.
 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
     const std::string& prefix) {
   const std::string pattern = kBeginningOfURL + CanonicalizeHostPrefix(prefix);
   return CreateCondition(URLMatcherCondition::HOST_PREFIX, pattern);
 }

 // host_suffix(suffix) = suffix + ED, where the suffix is given a trailing "."
 // if it lacks one.
 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
     const std::string& suffix) {
   const std::string pattern = CanonicalizeHostSuffix(suffix) + kEndOfDomain;
   return CreateCondition(URLMatcherCondition::HOST_SUFFIX, pattern);
 }

 // host_contains(str): |str| is used unmodified as the search pattern.
 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
     const std::string& str) {
   const URLMatcherCondition::Criterion criterion =
       URLMatcherCondition::HOST_CONTAINS;
   return CreateCondition(criterion, str);
 }

 // host_equals(str) = BU + canonical hostname of |str| + ED.
 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
     const std::string& str) {
   std::string pattern(kBeginningOfURL);
   pattern += CanonicalizeHostname(str);
   pattern += kEndOfDomain;
   return CreateCondition(URLMatcherCondition::HOST_EQUALS, pattern);
 }

 // path_prefix(prefix) = ED + prefix; the path directly follows the
 // end-of-domain marker in the canonicalized URL.
 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
     const std::string& prefix) {
   const std::string pattern = kEndOfDomain + prefix;
   return CreateCondition(URLMatcherCondition::PATH_PREFIX, pattern);
 }

 // path_suffix(suffix) = suffix + EP.
 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
     const std::string& suffix) {
   const std::string pattern = suffix + kEndOfPath;
   return CreateCondition(URLMatcherCondition::PATH_SUFFIX, pattern);
 }

 // path_contains(str): |str| is used unmodified as the search pattern.
 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
     const std::string& str) {
   const URLMatcherCondition::Criterion criterion =
       URLMatcherCondition::PATH_CONTAINS;
   return CreateCondition(criterion, str);
 }

 // path_equals(str) = ED + str + EP.
 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
     const std::string& str) {
   std::string pattern(kEndOfDomain);
   pattern += str;
   pattern += kEndOfPath;
   return CreateCondition(URLMatcherCondition::PATH_EQUALS, pattern);
 }

 // query_prefix(prefix) = EP + canonical query of |prefix|, with a leading
 // component delimiter and no trailing one. A leading '?' in |prefix| is
 // dropped before canonicalization.
 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
     const std::string& prefix) {
   const bool starts_with_question_mark = !prefix.empty() && prefix[0] == '?';
   const std::string query =
       starts_with_question_mark ? prefix.substr(1) : prefix;
   const std::string pattern =
       kEndOfPath + CanonicalizeQuery(query, true, false);
   return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
 }

 // query_suffix(suffix) = canonical query of |suffix| (trailing component
 // delimiter only) + EU. A suffix that starts with '?' is handed to
 // CreateQueryEqualsCondition() instead.
 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
     const std::string& suffix) {
   const bool starts_with_question_mark = !suffix.empty() && suffix[0] == '?';
   if (starts_with_question_mark)
     return CreateQueryEqualsCondition(suffix);

   const std::string pattern =
       CanonicalizeQuery(suffix, false, true) + kEndOfURL;
   return CreateCondition(URLMatcherCondition::QUERY_SUFFIX, pattern);
 }

 // query_contains(str): |str| is used unmodified as the search pattern, unless
 // it starts with '?', in which case it is handed to
 // CreateQueryPrefixCondition() instead.
 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
     const std::string& str) {
   const bool starts_with_question_mark = !str.empty() && str[0] == '?';
   if (starts_with_question_mark)
     return CreateQueryPrefixCondition(str);
   return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
 }

 // query_equals(str) = EP + canonical query of |str| (leading and trailing
 // component delimiters) + EU. A leading '?' in |str| is dropped before
 // canonicalization.
 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
     const std::string& str) {
   const bool starts_with_question_mark = !str.empty() && str[0] == '?';
   const std::string query = starts_with_question_mark ? str.substr(1) : str;

   std::string pattern(kEndOfPath);
   pattern += CanonicalizeQuery(query, true, true);
   pattern += kEndOfURL;
   return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
 }

 // Combined condition: canonical |host_suffix| + ED + |path_prefix|.
 URLMatcherCondition
 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
     const std::string& host_suffix,
     const std::string& path_prefix) {
   std::string pattern = CanonicalizeHostSuffix(host_suffix);
   pattern += kEndOfDomain;
   pattern += path_prefix;
   return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
                          pattern);
 }

 // Combined condition: BU + canonical hostname of |host| + ED + |path_prefix|.
 URLMatcherCondition
 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
     const std::string& host,
     const std::string& path_prefix) {
   std::string pattern(kBeginningOfURL);
   pattern += CanonicalizeHostname(host);
   pattern += kEndOfDomain;
   pattern += path_prefix;
   return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
                          pattern);
 }

 // Builds the string used for full-URL searches: BU + spec + EU, where the
 // spec is |url| with username, password and ref removed. An explicitly
 // specified port is removed as well when it equals the default port of the
 // URL's scheme.
 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
     const GURL& url) const {
   GURL::Replacements replacements;
   replacements.ClearUsername();
   replacements.ClearPassword();
   replacements.ClearRef();

   // Clear port if it is implicit from scheme.
   if (url.has_port()) {
     const std::string& scheme = url.scheme();
     const bool port_is_default =
         url.EffectiveIntPort() ==
         url::DefaultPortForScheme(scheme.c_str(), scheme.size());
     if (port_is_default)
       replacements.ClearPort();
   }

   std::string result(kBeginningOfURL);
   result += url.ReplaceComponents(replacements).spec();
   result += kEndOfURL;
   return result;
 }

 static std::string CanonicalizeURLForRegexSearchesHelper(
     const GURL& url,
     bool clear_query) {
   GURL::Replacements replacements;
   replacements.ClearPassword();
   replacements.ClearUsername();
   replacements.ClearRef();
   if (clear_query)
     replacements.ClearQuery();
   // Clear port if it is implicit from scheme.
   if (url.has_port()) {
     const std::string& port = url.scheme();
     if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
         url.EffectiveIntPort()) {
       replacements.ClearPort();
     }
   }
   return url.ReplaceComponents(replacements).spec();
 }

 // Canonical form for URL_MATCHES regex searches; the query is kept.
 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
     const GURL& url) const {
   const bool clear_query = false;
   return CanonicalizeURLForRegexSearchesHelper(url, clear_query);
 }

 // Canonical form for ORIGIN_AND_PATH_MATCHES regex searches; the query is
 // removed.
 std::string
 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
     const GURL& url) const {
   const bool clear_query = true;
   return CanonicalizeURLForRegexSearchesHelper(url, clear_query);
 }

 // url_prefix(prefix) = BU + prefix.
 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
     const std::string& prefix) {
   const std::string pattern = kBeginningOfURL + prefix;
   return CreateCondition(URLMatcherCondition::URL_PREFIX, pattern);
 }

 // url_suffix(suffix) = suffix + EU.
 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
     const std::string& suffix) {
   const std::string pattern = suffix + kEndOfURL;
   return CreateCondition(URLMatcherCondition::URL_SUFFIX, pattern);
 }

 // url_contains(str): |str| is used unmodified as the search pattern.
 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
     const std::string& str) {
   const URLMatcherCondition::Criterion criterion =
       URLMatcherCondition::URL_CONTAINS;
   return CreateCondition(criterion, str);
 }

 // url_equals(str) = BU + str + EU.
 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
     const std::string& str) {
   std::string pattern(kBeginningOfURL);
   pattern += str;
   pattern += kEndOfURL;
   return CreateCondition(URLMatcherCondition::URL_EQUALS, pattern);
 }

 // url_matches(regex): |regex| is stored unmodified as the pattern of a
 // URL_MATCHES condition.
 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
     const std::string& regex) {
   const URLMatcherCondition::Criterion criterion =
       URLMatcherCondition::URL_MATCHES;
   return CreateCondition(criterion, regex);
 }

 // origin_and_path_matches(regex): |regex| is stored unmodified as the pattern
 // of an ORIGIN_AND_PATH_MATCHES condition.
 URLMatcherCondition
 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
     const std::string& regex) {
   const URLMatcherCondition::Criterion criterion =
       URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
   return CreateCondition(criterion, regex);
 }

 // Deletes and unregisters every singleton StringPattern whose ID is not
 // contained in |used_patterns|. All three singleton sets (substring, regex,
 // origin-and-path regex) are pruned, in that order.
 void URLMatcherConditionFactory::ForgetUnusedPatterns(
       const std::set<StringPattern::ID>& used_patterns) {
   // Prunes one singleton set. The iterator is advanced via post-increment
   // inside erase() so that it stays valid after the element is removed.
   auto prune = [&used_patterns](PatternSingletons* singletons) {
     PatternSingletons::iterator it = singletons->begin();
     while (it != singletons->end()) {
       if (ContainsKey(used_patterns, (*it)->id())) {
         ++it;
         continue;
       }
       delete *it;
       singletons->erase(it++);
     }
   };

   prune(&substring_pattern_singletons_);
   prune(&regex_pattern_singletons_);
   prune(&origin_and_path_regex_pattern_singletons_);
 }

 // Returns true iff none of the three singleton sets holds a pattern.
 bool URLMatcherConditionFactory::IsEmpty() const {
   if (!substring_pattern_singletons_.empty())
     return false;
   if (!regex_pattern_singletons_.empty())
     return false;
   return origin_and_path_regex_pattern_singletons_.empty();
 }

 // Returns a condition for |criterion| whose StringPattern has the text
 // |pattern|. Patterns are singletons per category (regex, origin-and-path
 // regex, substring): an existing pattern with the same text is reused,
 // otherwise a new one is allocated with the next ID from |id_counter_| and
 // registered in the category's set.
 URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
     URLMatcherCondition::Criterion criterion,
     const std::string& pattern) {
   // Pick the singleton set that belongs to the criterion's category.
   PatternSingletons* singletons = &substring_pattern_singletons_;
   if (IsRegexCriterion(criterion))
     singletons = &regex_pattern_singletons_;
   else if (IsOriginAndPathRegexCriterion(criterion))
     singletons = &origin_and_path_regex_pattern_singletons_;

   // Temporary pattern used only as a lookup key; its ID (0) is a dummy.
   StringPattern lookup_key(pattern, 0);
   PatternSingletons::const_iterator existing = singletons->find(&lookup_key);
   if (existing != singletons->end())
     return URLMatcherCondition(criterion, *existing);

   StringPattern* created = new StringPattern(pattern, id_counter_++);
   singletons->insert(created);
   return URLMatcherCondition(criterion, created);
 }

 // Returns |suffix| with a trailing "." appended unless it already ends in
 // one. An empty |suffix| yields ".".
 std::string URLMatcherConditionFactory::CanonicalizeHostSuffix(
     const std::string& suffix) const {
   const bool ends_with_dot = !suffix.empty() && *suffix.rbegin() == '.';
   if (ends_with_dot)
     return suffix;
   std::string dotted(suffix);
   dotted += ".";
   return dotted;
 }

 // Returns |prefix| with a leading "." prepended unless it already starts
 // with one. An empty |prefix| yields ".".
 std::string URLMatcherConditionFactory::CanonicalizeHostPrefix(
     const std::string& prefix) const {
   const bool starts_with_dot = !prefix.empty() && prefix[0] == '.';
   if (starts_with_dot)
     return prefix;
   std::string dotted(".");
   dotted += prefix;
   return dotted;
 }

 // Returns |hostname| with both a leading and a trailing "." ensured, e.g.
 // "www.example.com" becomes ".www.example.com.".
 std::string URLMatcherConditionFactory::CanonicalizeHostname(
     const std::string& hostname) const {
   const std::string with_trailing_dot = CanonicalizeHostSuffix(hostname);
   return CanonicalizeHostPrefix(with_trailing_dot);
 }

 // Prepares a query string for matching by replacing every query separator
 // (|kQuerySeparator|) with the magic value |kQueryComponentDelimiter|.
 // If |prepend_beginning_of_query_component| is true, the same magic value is
 // put in front of the query; if |append_end_of_query_component| is true, it
 // is appended as well. This makes it possible to locate the start (and end)
 // of a key-value pair in the query string. |query| is taken by value on
 // purpose, because it is modified locally and then returned.
 std::string URLMatcherConditionFactory::CanonicalizeQuery(
     std::string query,
     bool prepend_beginning_of_query_component,
     bool append_end_of_query_component) const {
   const char delimiter = kQueryComponentDelimiter[0];
   std::replace(query.begin(), query.end(), kQuerySeparator, delimiter);

   if (prepend_beginning_of_query_component)
     query.insert(0, kQueryComponentDelimiter);
   if (append_end_of_query_component)
     query.append(kQueryComponentDelimiter);
   return query;
 }

 // Strict weak ordering on StringPattern pointers: a NULL pointer sorts before
 // any non-NULL pointer, and two non-NULL pointers are ordered by the text of
 // their patterns (IDs are not considered).
 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
     StringPattern* lhs,
     StringPattern* rhs) const {
   if (rhs == NULL)
     return false;  // Nothing sorts before NULL.
   if (lhs == NULL)
     return true;  // NULL sorts before every non-NULL pointer.
   return lhs->pattern() < rhs->pattern();
 }

 //
 // URLQueryElementMatcherCondition
 //

 // Builds a matcher for a single query element.
 //
 // |key_| becomes <delimiter> + key, followed by "=" for
 // ELEMENT_TYPE_KEY_VALUE elements; |value_| is |value| for key-value elements
 // and empty otherwise. For QUERY_VALUE_MATCH_EXACT the value is terminated
 // with the delimiter. The substring pattern used for prefiltering is obtained
 // from |factory| via CreateQueryContainsCondition().
 URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
     const std::string& key,
     const std::string& value,
     QueryValueMatchType query_value_match_type,
     QueryElementType query_element_type,
     Type match_type,
     URLMatcherConditionFactory* factory) {
   match_type_ = match_type;

   key_ = kQueryComponentDelimiter + key;
   if (query_element_type == ELEMENT_TYPE_KEY_VALUE) {
     key_ += "=";
     value_ = value;
   } else {
     value_.clear();
   }

   if (query_value_match_type == QUERY_VALUE_MATCH_EXACT)
     value_ += kQueryComponentDelimiter;

   // With an empty |value_| there is nothing to verify after locating
   // |key_|; checking the presence of the key is sufficient, which is what
   // MATCH_ANY does.
   if (value_.empty())
     match_type_ = MATCH_ANY;

   // For MATCH_ANY the SubstringMatcher can look for |key_| + |value_| in one
   // go. For the other match types only |key_| is searched for and the value
   // is verified later in IsMatch().
   const std::string search_text =
       (match_type_ == MATCH_ANY) ? key_ + value_ : key_;
   const URLMatcherCondition condition =
       factory->CreateQueryContainsCondition(search_text);
   string_pattern_ = condition.string_pattern();

   key_length_ = key_.length();
   value_length_ = value_.length();
 }

 // Copy constructor: compiler-generated member-wise copy, defined out of line.
 URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
     const URLQueryElementMatcherCondition& other) = default;

 // This destructor performs no explicit cleanup.
 URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() = default;

 // Orders query conditions by match type first. For equal match types, a
 // condition without a pattern sorts before one with a pattern, and two
 // conditions with patterns are ordered by StringPattern::operator<.
 bool URLQueryElementMatcherCondition::operator<(
     const URLQueryElementMatcherCondition& rhs) const {
   if (match_type_ != rhs.match_type_)
     return match_type_ < rhs.match_type_;

   const bool lhs_has_pattern = (string_pattern_ != NULL);
   const bool rhs_has_pattern = (rhs.string_pattern_ != NULL);
   if (lhs_has_pattern && rhs_has_pattern)
     return *string_pattern_ < *rhs.string_pattern_;

   // Only "no pattern on the left, pattern on the right" counts as less-than.
   return !lhs_has_pattern && rhs_has_pattern;
 }

 // Verifies this query condition against |url_for_component_searches|, the
 // URL string in which |key_| and |value_| are searched.
 //
 // - MATCH_ANY:   always true; no verification is done here.
 // - MATCH_ALL:   true iff |key_| occurs at least once and every occurrence is
 //                directly followed by |value_|.
 // - MATCH_FIRST: true iff the first occurrence of |key_| is directly followed
 //                by |value_|.
 // - MATCH_LAST:  true iff the last occurrence of |key_| is directly followed
 //                by |value_|.
 //
 // If |key_| does not occur at all (for instance because an empty string was
 // passed in), the non-ANY match types return false instead of comparing at a
 // position derived from std::string::npos.
 bool URLQueryElementMatcherCondition::IsMatch(
     const std::string& url_for_component_searches) const {
   switch (match_type_) {
     case MATCH_ANY: {
       // For MATCH_ANY, no additional verification step is needed. We can trust
       // the SubstringMatcher to do the verification.
       return true;
     }
     case MATCH_ALL: {
       size_t start = 0;
       bool found = false;
       size_t offset;
       while ((offset = url_for_component_searches.find(key_, start)) !=
              std::string::npos) {
         if (url_for_component_searches.compare(
                 offset + key_length_, value_length_, value_) != 0) {
           return false;
         }
         found = true;
         // Resume one character before the end of the matched value;
         // presumably so that a trailing delimiter of an exact-match value can
         // also serve as the leading delimiter of the next key.
         start = offset + key_length_ + value_length_ - 1;
       }
       return found;
     }
     case MATCH_FIRST: {
       const size_t offset = url_for_component_searches.find(key_);
       if (offset == std::string::npos)
         return false;  // Key absent: nothing can match.
       return url_for_component_searches.compare(
                  offset + key_length_, value_length_, value_) == 0;
     }
     case MATCH_LAST: {
       const size_t offset = url_for_component_searches.rfind(key_);
       if (offset == std::string::npos)
         return false;  // Key absent: nothing can match.
       return url_for_component_searches.compare(
                  offset + key_length_, value_length_, value_) == 0;
     }
   }
   NOTREACHED();
   return false;
 }

 //
 // URLMatcherSchemeFilter
 //

 // Creates a filter that accepts exactly one scheme, |filter|.
 //
 // The list is initialized with a single copy of |filter|. Sizing the list to
 // one element first and then appending |filter| would leave an extra empty
 // string in the list, which IsMatch() would treat as an accepted scheme.
 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
     : filters_(1, filter) {}

 // Creates a filter that accepts any of the schemes listed in |filters|; the
 // list is copied.
 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
     const std::vector<std::string>& filters)
     : filters_(filters) {
 }

 // This destructor performs no explicit cleanup.
 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() = default;

 // Returns true iff the scheme of |url| equals one of the stored filters.
 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
   const bool scheme_is_listed =
       filters_.end() !=
       std::find(filters_.begin(), filters_.end(), url.scheme());
   return scheme_is_listed;
 }

 //
 // URLMatcherPortFilter
 //

 // Creates a filter that accepts ports inside any of |ranges|; the list is
 // copied.
 URLMatcherPortFilter::URLMatcherPortFilter(
     const std::vector<URLMatcherPortFilter::Range>& ranges)
     : ranges_(ranges) {
 }

 // This destructor performs no explicit cleanup.
 URLMatcherPortFilter::~URLMatcherPortFilter() = default;

 bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
   int port = url.EffectiveIntPort();
   for (std::vector<Range>::const_iterator i = ranges_.begin();
        i != ranges_.end(); ++i) {
     if (i->first <= port && port <= i->second)
       return true;
   }
   return false;
 }

 // static
 // Returns the range [|from|, |to|]; IsMatch() treats both bounds as
 // inclusive.
 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
                                                               int to) {
   Range range(from, to);
   return range;
 }

 // static
 // Returns the single-port range [|port|, |port|].
 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
   return CreateRange(port, port);
 }

 //
 // URLMatcherConditionSet
 //

 // This destructor performs no explicit cleanup; members release themselves.
 URLMatcherConditionSet::~URLMatcherConditionSet() = default;

 // Creates a condition set with the given |id| and a copy of |conditions|.
 // No query conditions, scheme filter or port filter are set.
 URLMatcherConditionSet::URLMatcherConditionSet(ID id,
                                                const Conditions& conditions)
     : id_(id), conditions_(conditions) {
 }

 // Creates a condition set with the given |id| and a copy of |conditions|,
 // taking ownership of |scheme_filter| and |port_filter|. Either filter may be
 // null, in which case IsMatch() skips that check.
 URLMatcherConditionSet::URLMatcherConditionSet(
     ID id,
     const Conditions& conditions,
     scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
     scoped_ptr<URLMatcherPortFilter> port_filter)
     : id_(id),
       conditions_(conditions),
       scheme_filter_(std::move(scheme_filter)),
       port_filter_(std::move(port_filter)) {
 }

 // Creates a condition set with the given |id|, copies of |conditions| and
 // |query_conditions|, and ownership of |scheme_filter| and |port_filter|.
 // Either filter may be null, in which case IsMatch() skips that check.
 URLMatcherConditionSet::URLMatcherConditionSet(
     ID id,
     const Conditions& conditions,
     const QueryConditions& query_conditions,
     scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
     scoped_ptr<URLMatcherPortFilter> port_filter)
     : id_(id),
       conditions_(conditions),
       query_conditions_(query_conditions),
       scheme_filter_(std::move(scheme_filter)),
       port_filter_(std::move(port_filter)) {
 }

 // Convenience overload: forwards to the three-argument IsMatch() with an
 // empty string as the URL for component searches.
 bool URLMatcherConditionSet::IsMatch(
     const std::set<StringPattern::ID>& matching_patterns,
     const GURL& url) const {
   const std::string no_component_url;
   return IsMatch(matching_patterns, url, no_component_url);
 }

 // Returns true iff the whole condition set is fulfilled for |url|:
 //  1. every URLMatcherCondition matches,
 //  2. the scheme filter (if any) matches,
 //  3. the port filter (if any) matches,
 //  4. every query condition's pattern is in |matching_patterns| and every
 //     query condition verifies against |url_for_component_searches|.
 bool URLMatcherConditionSet::IsMatch(
     const std::set<StringPattern::ID>& matching_patterns,
     const GURL& url,
     const std::string& url_for_component_searches) const {
   for (const auto& condition : conditions_) {
     if (!condition.IsMatch(matching_patterns, url))
       return false;
   }

   if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
     return false;
   if (port_filter_.get() && !port_filter_->IsMatch(url))
     return false;

   if (query_conditions_.empty())
     return true;

   // Two passes over the query conditions, for performance reasons: first
   // check that all query elements were found at all, and only then run the
   // verification, which is expected to take more cycles.
   for (const auto& query_condition : query_conditions_) {
     if (!ContainsKey(matching_patterns,
                      query_condition.string_pattern()->id())) {
       return false;
     }
   }
   for (const auto& query_condition : query_conditions_) {
     if (!query_condition.IsMatch(url_for_component_searches))
       return false;
   }
   return true;
 }

 //
 // URLMatcher
 //

 // All members are default-initialized.
 URLMatcher::URLMatcher() = default;

 // This destructor performs no explicit cleanup; members release themselves.
 URLMatcher::~URLMatcher() = default;

 // Registers every condition set in |condition_sets| under its ID and then
 // refreshes the internal data structures. In debug builds it is checked that
 // no set with the same ID is already registered.
 void URLMatcher::AddConditionSets(
     const URLMatcherConditionSet::Vector& condition_sets) {
   for (const auto& condition_set : condition_sets) {
     DCHECK(url_matcher_condition_sets_.find(condition_set->id()) ==
         url_matcher_condition_sets_.end());
     url_matcher_condition_sets_[condition_set->id()] = condition_set;
   }
   UpdateInternalDatastructures();
 }

 // Unregisters the condition sets whose IDs are listed in |condition_set_ids|
 // and then refreshes the internal data structures. In debug builds it is
 // checked that each ID is currently registered.
 void URLMatcher::RemoveConditionSets(
     const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
   for (const URLMatcherConditionSet::ID& condition_set_id :
        condition_set_ids) {
     DCHECK(url_matcher_condition_sets_.find(condition_set_id) !=
         url_matcher_condition_sets_.end());
     url_matcher_condition_sets_.erase(condition_set_id);
   }
   UpdateInternalDatastructures();
 }

 void URLMatcher::ClearUnusedConditionSets() {
   UpdateConditionFactory();
 }

 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
     const GURL& url) const {
   // Find all IDs of StringPatterns that match |url|.
   // See URLMatcherConditionFactory for the canonicalization of URLs and the
   // distinction between full url searches and url component searches.
   std::set<StringPattern::ID> matches;
   std::string url_for_component_searches;

   if (!full_url_matcher_.IsEmpty()) {
     full_url_matcher_.Match(
         condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
   }
   if (!url_component_matcher_.IsEmpty()) {
     url_for_component_searches =
         condition_factory_.CanonicalizeURLForComponentSearches(url);
     url_component_matcher_.Match(url_for_component_searches, &matches);
   }
   if (!regex_set_matcher_.IsEmpty()) {
     regex_set_matcher_.Match(
         condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
   }
   if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
     origin_and_path_regex_set_matcher_.Match(
         condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
         &matches);
   }

   // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
   // were fulfilled.
   std::set<URLMatcherConditionSet::ID> result;
   for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
        i != matches.end(); ++i) {
     // For each URLMatcherConditionSet there is exactly one condition
     // registered in substring_match_triggers_. This means that the following
     // logic tests each URLMatcherConditionSet exactly once if it can be
     // completely fulfilled.
     StringPatternTriggers::const_iterator triggered_condition_sets_iter =
         substring_match_triggers_.find(*i);
     if (triggered_condition_sets_iter == substring_match_triggers_.end())
       continue;  // Not all substring matches are triggers for a condition set.
     const std::set<URLMatcherConditionSet::ID>& condition_sets =
         triggered_condition_sets_iter->second;
     for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
          condition_sets.begin(); j != condition_sets.end(); ++j) {
       URLMatcherConditionSets::const_iterator condition_set_iter =
           url_matcher_condition_sets_.find(*j);
       DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
       if (condition_set_iter->second->IsMatch(
               matches, url, url_for_component_searches))
         result.insert(*j);
     }
   }

   return result;
 }

 // Returns true iff nothing is registered anywhere: the condition factory,
 // the condition-set map, the trigger map, all four matchers and both
 // registered-pattern sets must be empty.
 bool URLMatcher::IsEmpty() const {
   if (!condition_factory_.IsEmpty())
     return false;
   if (!url_matcher_condition_sets_.empty())
     return false;
   if (!substring_match_triggers_.empty())
     return false;
   if (!full_url_matcher_.IsEmpty())
     return false;
   if (!url_component_matcher_.IsEmpty())
     return false;
   if (!regex_set_matcher_.IsEmpty())
     return false;
   if (!origin_and_path_regex_set_matcher_.IsEmpty())
     return false;
   if (!registered_full_url_patterns_.empty())
     return false;
   return registered_url_component_patterns_.empty();
 }

 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
   // The purpose of |full_url_conditions| is just that we need to execute
   // the same logic once for Full URL searches and once for URL Component
   // searches (see URLMatcherConditionFactory).

   // Determine which patterns need to be registered when this function
   // terminates.
   std::set<const StringPattern*> new_patterns;
   for (URLMatcherConditionSets::const_iterator condition_set_iter =
       url_matcher_condition_sets_.begin();
       condition_set_iter != url_matcher_condition_sets_.end();
       ++condition_set_iter) {
     const URLMatcherConditionSet::Conditions& conditions =
         condition_set_iter->second->conditions();
     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
          conditions.begin(); condition_iter != conditions.end();
          ++condition_iter) {
       // If we are called to process Full URL searches, ignore others, and
       // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
       if (!condition_iter->IsRegexCondition() &&
           !condition_iter->IsOriginAndPathRegexCondition() &&
           full_url_conditions == condition_iter->IsFullURLCondition())
         new_patterns.insert(condition_iter->string_pattern());
     }

     if (full_url_conditions)
       continue;

     const URLMatcherConditionSet::QueryConditions& query_conditions =
         condition_set_iter->second->query_conditions();
     for (URLMatcherConditionSet::QueryConditions::const_iterator
              query_condition_iter = query_conditions.begin();
          query_condition_iter != query_conditions.end();
          ++query_condition_iter) {
       new_patterns.insert(query_condition_iter->string_pattern());
     }
   }

   // This is the set of patterns that were registered before this function
   // is called.
   std::set<const StringPattern*>& registered_patterns =
       full_url_conditions ? registered_full_url_patterns_
                           : registered_url_component_patterns_;

   // Add all patterns that are in new_patterns but not in registered_patterns.
   std::vector<const StringPattern*> patterns_to_register =
       base::STLSetDifference<std::vector<const StringPattern*> >(
           new_patterns, registered_patterns);

   // Remove all patterns that are in registered_patterns but not in
   // new_patterns.
   std::vector<const StringPattern*> patterns_to_unregister =
       base::STLSetDifference<std::vector<const StringPattern*> >(
            registered_patterns, new_patterns);

   // Update the SubstringSetMatcher.
   SubstringSetMatcher& url_matcher =
       full_url_conditions ? full_url_matcher_ : url_component_matcher_;
   url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
                                             patterns_to_unregister);

   // Update the set of registered_patterns for the next time this function
   // is being called.
   registered_patterns.swap(new_patterns);
 }

 void URLMatcher::UpdateRegexSetMatcher() {
   std::vector<const StringPattern*> new_patterns;
   std::vector<const StringPattern*> new_origin_and_path_patterns;

   for (URLMatcherConditionSets::const_iterator condition_set_iter =
       url_matcher_condition_sets_.begin();
       condition_set_iter != url_matcher_condition_sets_.end();
       ++condition_set_iter) {
     const URLMatcherConditionSet::Conditions& conditions =
         condition_set_iter->second->conditions();
     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
          conditions.begin(); condition_iter != conditions.end();
          ++condition_iter) {
       if (condition_iter->IsRegexCondition()) {
         new_patterns.push_back(condition_iter->string_pattern());
       } else if (condition_iter->IsOriginAndPathRegexCondition()) {
         new_origin_and_path_patterns.push_back(
             condition_iter->string_pattern());
       }
     }
   }

   // Start over from scratch. We can't really do better than this, since the
   // FilteredRE2 backend doesn't support incremental updates.
   regex_set_matcher_.ClearPatterns();
   regex_set_matcher_.AddPatterns(new_patterns);
   origin_and_path_regex_set_matcher_.ClearPatterns();
   origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
 }

 void URLMatcher::UpdateTriggers() {
   // Count substring pattern frequencies.
   std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
   for (URLMatcherConditionSets::const_iterator condition_set_iter =
       url_matcher_condition_sets_.begin();
       condition_set_iter != url_matcher_condition_sets_.end();
       ++condition_set_iter) {
     const URLMatcherConditionSet::Conditions& conditions =
         condition_set_iter->second->conditions();
     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
          conditions.begin(); condition_iter != conditions.end();
          ++condition_iter) {
       const StringPattern* pattern = condition_iter->string_pattern();
       substring_pattern_frequencies[pattern->id()]++;
     }

     const URLMatcherConditionSet::QueryConditions& query_conditions =
         condition_set_iter->second->query_conditions();
     for (URLMatcherConditionSet::QueryConditions::const_iterator
              query_condition_iter = query_conditions.begin();
          query_condition_iter != query_conditions.end();
          ++query_condition_iter) {
       const StringPattern* pattern = query_condition_iter->string_pattern();
       substring_pattern_frequencies[pattern->id()]++;
     }
   }

   // Update trigger conditions: Determine for each URLMatcherConditionSet which
   // URLMatcherCondition contains a StringPattern that occurs least
   // frequently in this URLMatcher. We assume that this condition is very
   // specific and occurs rarely in URLs. If a match occurs for this
   // URLMatcherCondition, we want to test all other URLMatcherCondition in the
   // respective URLMatcherConditionSet as well to see whether the entire
   // URLMatcherConditionSet is considered matching.
   substring_match_triggers_.clear();
   for (URLMatcherConditionSets::const_iterator condition_set_iter =
       url_matcher_condition_sets_.begin();
       condition_set_iter != url_matcher_condition_sets_.end();
       ++condition_set_iter) {
     const URLMatcherConditionSet::Conditions& conditions =
         condition_set_iter->second->conditions();
     if (conditions.empty())
       continue;
     URLMatcherConditionSet::Conditions::const_iterator condition_iter =
         conditions.begin();
     StringPattern::ID trigger = condition_iter->string_pattern()->id();
     // We skip the first element in the following loop.
     ++condition_iter;
     for (; condition_iter != conditions.end(); ++condition_iter) {
       StringPattern::ID current_id =
           condition_iter->string_pattern()->id();
       if (substring_pattern_frequencies[trigger] >
           substring_pattern_frequencies[current_id]) {
         trigger = current_id;
       }
     }

     const URLMatcherConditionSet::QueryConditions& query_conditions =
         condition_set_iter->second->query_conditions();
     for (URLMatcherConditionSet::QueryConditions::const_iterator
              query_condition_iter = query_conditions.begin();
          query_condition_iter != query_conditions.end();
          ++query_condition_iter) {
       StringPattern::ID current_id =
           query_condition_iter->string_pattern()->id();
       if (substring_pattern_frequencies[trigger] >
           substring_pattern_frequencies[current_id]) {
         trigger = current_id;
       }
     }

     substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
   }
 }

 void URLMatcher::UpdateConditionFactory() {
   std::set<StringPattern::ID> used_patterns;
   for (URLMatcherConditionSets::const_iterator condition_set_iter =
       url_matcher_condition_sets_.begin();
       condition_set_iter != url_matcher_condition_sets_.end();
       ++condition_set_iter) {
     const URLMatcherConditionSet::Conditions& conditions =
         condition_set_iter->second->conditions();
     for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
          conditions.begin(); condition_iter != conditions.end();
          ++condition_iter) {
       used_patterns.insert(condition_iter->string_pattern()->id());
     }
     const URLMatcherConditionSet::QueryConditions& query_conditions =
         condition_set_iter->second->query_conditions();
     for (URLMatcherConditionSet::QueryConditions::const_iterator
              query_condition_iter = query_conditions.begin();
          query_condition_iter != query_conditions.end();
          ++query_condition_iter) {
       used_patterns.insert(query_condition_iter->string_pattern()->id());
     }
   }
   condition_factory_.ForgetUnusedPatterns(used_patterns);
 }

 // Refreshes every structure derived from |url_matcher_condition_sets_|, one
 // after the other: the two substring matchers, the regex matchers, the
 // trigger map and finally the condition factory.
 void URLMatcher::UpdateInternalDatastructures() {
   // Substring matchers: URL Component patterns first, then Full URL patterns.
   const bool kFullUrlConditions = true;
   UpdateSubstringSetMatcher(!kFullUrlConditions);
   UpdateSubstringSetMatcher(kFullUrlConditions);

   UpdateRegexSetMatcher();
   UpdateTriggers();
   UpdateConditionFactory();
 }

 }  // namespace url_matcher