| // Copyright 2013 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "components/url_matcher/url_matcher.h" |
| |
| #include <algorithm> |
| #include <iterator> |
| #include <utility> |
| |
| #include "base/logging.h" |
| #include "base/stl_util.h" |
| #include "url/gurl.h" |
| #include "url/url_canon.h" |
| |
| namespace url_matcher { |
| |
| // This set of classes implement a mapping of URL Component Patterns, such as |
| // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns |
| // for use in substring comparisons. |
| // |
| // The idea of this mapping is to reduce the problem of comparing many |
| // URL Component Patterns against one URL to the problem of searching many |
| // substrings in one string: |
| // |
| // ---------------------- ----------------- |
| // | URL Query operator | ----translate----> | StringPattern | |
| // ---------------------- ----------------- |
| // ^ |
| // | |
| // compare |
| // | |
| // v |
| // ---------------------- ----------------- |
| // | URL to compare | | | |
| // | to all URL Query | ----translate----> | String | |
| // | operators | | | |
| // ---------------------- ----------------- |
| // |
| // The reason for this problem reduction is that there are efficient algorithms |
| // for searching many substrings in one string (see Aho-Corasick algorithm). |
| // |
| // Additionally, some of the same pieces are reused to implement regular |
| // expression comparisons. The FilteredRE2 implementation for matching many |
| // regular expressions against one string uses prefiltering, in which a set |
| // of substrings (derived from the regexes) are first searched for, to reduce |
| // the number of regular expressions to test; the prefiltering step also |
| // uses Aho-Corasick. |
| // |
| // Case 1: {host,path,query}_{prefix,suffix,equals} searches. |
| // ========================================================== |
| // |
| // For searches in this class, we normalize URLs as follows: |
| // |
| // Step 1: |
| // Remove scheme, port and segment from URL: |
| // -> http://www.example.com:8080/index.html?search=foo#first_match becomes |
| // www.example.com/index.html?search=foo |
| // |
| // We remove the scheme and port number because they can be checked later |
| // in a secondary filter step. We remove the segment (the #... part) because |
| // this is not guaranteed to be ASCII-7 encoded. |
| // |
| // Step 2: |
| // Translate URL to String and add the following position markers: |
| // - BU = Beginning of URL |
| // - ED = End of Domain |
| // - EP = End of Path |
| // - EU = End of URL |
| // Furthermore, the hostname is canonicalized to start with a ".". |
| // |
| // Position markers are represented as characters >127, which are therefore |
| // guaranteed not to be part of the ASCII-7 encoded URL character set. |
| // |
| // -> www.example.com/index.html?search=foo becomes |
| // BU .www.example.com ED /index.html EP ?search=foo EU |
| // |
| // -> www.example.com/index.html becomes |
| // BU .www.example.com ED /index.html EP EU |
| // |
| // Step 3: |
| // Translate URL Component Patterns as follows: |
| // |
| // host_prefix(prefix) = BU add_missing_dot_prefix(prefix) |
| // -> host_prefix("www.example") = BU .www.example |
| // |
| // host_suffix(suffix) = suffix ED |
| // -> host_suffix("example.com") = example.com ED |
| // -> host_suffix(".example.com") = .example.com ED |
| // |
| // host_equals(domain) = BU add_missing_dot_prefix(domain) ED |
| // -> host_equals("www.example.com") = BU .www.example.com ED |
| // |
| // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}). |
| // |
| // With this, we can search the StringPatterns in the normalized URL. |
| // |
| // |
| // Case 2: url_{prefix,suffix,equals,contains} searches. |
| // ===================================================== |
| // |
| // Step 1: as above, except that |
| // - the scheme is not removed |
| // - the port is not removed if it is specified and does not match the default |
| // port for the given scheme. |
| // |
| // Step 2: |
| // Translate URL to String and add the following position markers: |
| // - BU = Beginning of URL |
| // - EU = End of URL |
| // |
| // -> http://www.example.com:8080/index.html?search=foo#first_match becomes |
| // BU http://www.example.com:8080/index.html?search=foo EU |
| // -> http://www.example.com:80/index.html?search=foo#first_match becomes |
| // BU http://www.example.com/index.html?search=foo EU |
| // |
| // url_prefix(prefix) = BU prefix |
| // -> url_prefix("http://www.example") = BU http://www.example |
| // |
| // url_contains(substring) = substring |
| // -> url_contains("index") = index |
| // |
| // |
| // Case 3: {host,path,query}_contains searches. |
| // ============================================ |
| // |
| // These kinds of searches are not supported directly but can be derived |
| // by a combination of a url_contains() query followed by an explicit test: |
| // |
| // host_contains(str) = url_contains(str) followed by test whether str occurs |
| // in host component of original URL. |
| // -> host_contains("example.co") = example.co |
| // followed by gurl.host().find("example.co"); |
| // |
| // [similarly for path_contains and query_contains]. |
| // |
| // |
| // Regular expression matching (url_matches searches) |
| // ================================================== |
| // |
| // This class also supports matching regular expressions (RE2 syntax) |
| // against full URLs, which are transformed as in case 2. |
| |
| namespace { |
| |
| bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) { |
| return criterion == URLMatcherCondition::URL_MATCHES; |
| } |
| |
| bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) { |
| return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES; |
| } |
| |
| } // namespace |
| |
| // |
| // URLMatcherCondition |
| // |
| |
| URLMatcherCondition::URLMatcherCondition() |
| : criterion_(HOST_PREFIX), |
| string_pattern_(NULL) {} |
| |
| URLMatcherCondition::~URLMatcherCondition() {} |
| |
| URLMatcherCondition::URLMatcherCondition( |
| Criterion criterion, |
| const StringPattern* string_pattern) |
| : criterion_(criterion), |
| string_pattern_(string_pattern) {} |
| |
| URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs) |
| : criterion_(rhs.criterion_), |
| string_pattern_(rhs.string_pattern_) {} |
| |
| URLMatcherCondition& URLMatcherCondition::operator=( |
| const URLMatcherCondition& rhs) { |
| criterion_ = rhs.criterion_; |
| string_pattern_ = rhs.string_pattern_; |
| return *this; |
| } |
| |
| bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const { |
| if (criterion_ < rhs.criterion_) return true; |
| if (criterion_ > rhs.criterion_) return false; |
| if (string_pattern_ != NULL && rhs.string_pattern_ != NULL) |
| return *string_pattern_ < *rhs.string_pattern_; |
| if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true; |
| // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL, |
| // or both are NULL. |
| return false; |
| } |
| |
| bool URLMatcherCondition::IsFullURLCondition() const { |
| // For these criteria the SubstringMatcher needs to be executed on the |
| // GURL that is canonicalized with |
| // URLMatcherConditionFactory::CanonicalizeURLForFullSearches. |
| switch (criterion_) { |
| case HOST_CONTAINS: |
| case PATH_CONTAINS: |
| case QUERY_CONTAINS: |
| case URL_PREFIX: |
| case URL_SUFFIX: |
| case URL_CONTAINS: |
| case URL_EQUALS: |
| return true; |
| default: |
| break; |
| } |
| return false; |
| } |
| |
| bool URLMatcherCondition::IsRegexCondition() const { |
| return IsRegexCriterion(criterion_); |
| } |
| |
| bool URLMatcherCondition::IsOriginAndPathRegexCondition() const { |
| return IsOriginAndPathRegexCriterion(criterion_); |
| } |
| |
| bool URLMatcherCondition::IsMatch( |
| const std::set<StringPattern::ID>& matching_patterns, |
| const GURL& url) const { |
| DCHECK(string_pattern_); |
| if (!ContainsKey(matching_patterns, string_pattern_->id())) |
| return false; |
| // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on |
| // a substring match on the raw URL. In case of a match, we need to verify |
| // that the match was found in the correct component of the URL. |
| switch (criterion_) { |
| case HOST_CONTAINS: |
| return url.host().find(string_pattern_->pattern()) != |
| std::string::npos; |
| case PATH_CONTAINS: |
| return url.path().find(string_pattern_->pattern()) != |
| std::string::npos; |
| case QUERY_CONTAINS: |
| return url.query().find(string_pattern_->pattern()) != |
| std::string::npos; |
| default: |
| break; |
| } |
| return true; |
| } |
| |
| // |
| // URLMatcherConditionFactory |
| // |
| |
| namespace { |
| // These are symbols that are not contained in 7-bit ASCII used in GURLs. |
| const char kBeginningOfURL[] = {static_cast<char>(-1), 0}; |
| const char kEndOfDomain[] = {static_cast<char>(-2), 0}; |
| const char kEndOfPath[] = {static_cast<char>(-3), 0}; |
| const char kQueryComponentDelimiter[] = {static_cast<char>(-4), 0}; |
| const char kEndOfURL[] = {static_cast<char>(-5), 0}; |
| |
| // The delimiter for query parameters |
| const char kQuerySeparator = '&'; |
| } // namespace |
| |
| URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {} |
| |
| URLMatcherConditionFactory::~URLMatcherConditionFactory() { |
| STLDeleteElements(&substring_pattern_singletons_); |
| STLDeleteElements(®ex_pattern_singletons_); |
| STLDeleteElements(&origin_and_path_regex_pattern_singletons_); |
| } |
| |
| std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches( |
| const GURL& url) const { |
| return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain + |
| url.path() + kEndOfPath + |
| (url.has_query() ? CanonicalizeQuery(url.query(), true, true) |
| : std::string()) + |
| kEndOfURL; |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition( |
| const std::string& prefix) { |
| return CreateCondition(URLMatcherCondition::HOST_PREFIX, |
| kBeginningOfURL + CanonicalizeHostPrefix(prefix)); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition( |
| const std::string& suffix) { |
| return CreateCondition(URLMatcherCondition::HOST_SUFFIX, |
| CanonicalizeHostSuffix(suffix) + kEndOfDomain); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition( |
| const std::string& str) { |
| return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition( |
| const std::string& str) { |
| return CreateCondition(URLMatcherCondition::HOST_EQUALS, |
| kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition( |
| const std::string& prefix) { |
| return CreateCondition(URLMatcherCondition::PATH_PREFIX, |
| kEndOfDomain + prefix); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition( |
| const std::string& suffix) { |
| return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition( |
| const std::string& str) { |
| return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition( |
| const std::string& str) { |
| return CreateCondition(URLMatcherCondition::PATH_EQUALS, |
| kEndOfDomain + str + kEndOfPath); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition( |
| const std::string& prefix) { |
| std::string pattern; |
| if (!prefix.empty() && prefix[0] == '?') |
| pattern = kEndOfPath + CanonicalizeQuery(prefix.substr(1), true, false); |
| else |
| pattern = kEndOfPath + CanonicalizeQuery(prefix, true, false); |
| |
| return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition( |
| const std::string& suffix) { |
| if (!suffix.empty() && suffix[0] == '?') { |
| return CreateQueryEqualsCondition(suffix); |
| } else { |
| return CreateCondition(URLMatcherCondition::QUERY_SUFFIX, |
| CanonicalizeQuery(suffix, false, true) + kEndOfURL); |
| } |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition( |
| const std::string& str) { |
| if (!str.empty() && str[0] == '?') |
| return CreateQueryPrefixCondition(str); |
| else |
| return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition( |
| const std::string& str) { |
| std::string pattern; |
| if (!str.empty() && str[0] == '?') |
| pattern = |
| kEndOfPath + CanonicalizeQuery(str.substr(1), true, true) + kEndOfURL; |
| else |
| pattern = kEndOfPath + CanonicalizeQuery(str, true, true) + kEndOfURL; |
| |
| return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern); |
| } |
| |
| URLMatcherCondition |
| URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition( |
| const std::string& host_suffix, |
| const std::string& path_prefix) { |
| return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX, |
| CanonicalizeHostSuffix(host_suffix) + kEndOfDomain + path_prefix); |
| } |
| |
| URLMatcherCondition |
| URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition( |
| const std::string& host, |
| const std::string& path_prefix) { |
| return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX, |
| kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain + |
| path_prefix); |
| } |
| |
| std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches( |
| const GURL& url) const { |
| GURL::Replacements replacements; |
| replacements.ClearPassword(); |
| replacements.ClearUsername(); |
| replacements.ClearRef(); |
| // Clear port if it is implicit from scheme. |
| if (url.has_port()) { |
| const std::string& port = url.scheme(); |
| if (url::DefaultPortForScheme(port.c_str(), port.size()) == |
| url.EffectiveIntPort()) { |
| replacements.ClearPort(); |
| } |
| } |
| return kBeginningOfURL + url.ReplaceComponents(replacements).spec() + |
| kEndOfURL; |
| } |
| |
| static std::string CanonicalizeURLForRegexSearchesHelper( |
| const GURL& url, |
| bool clear_query) { |
| GURL::Replacements replacements; |
| replacements.ClearPassword(); |
| replacements.ClearUsername(); |
| replacements.ClearRef(); |
| if (clear_query) |
| replacements.ClearQuery(); |
| // Clear port if it is implicit from scheme. |
| if (url.has_port()) { |
| const std::string& port = url.scheme(); |
| if (url::DefaultPortForScheme(port.c_str(), port.size()) == |
| url.EffectiveIntPort()) { |
| replacements.ClearPort(); |
| } |
| } |
| return url.ReplaceComponents(replacements).spec(); |
| } |
| |
| std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches( |
| const GURL& url) const { |
| return CanonicalizeURLForRegexSearchesHelper(url, false); |
| } |
| |
| std::string |
| URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches( |
| const GURL& url) const { |
| return CanonicalizeURLForRegexSearchesHelper(url, true); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition( |
| const std::string& prefix) { |
| return CreateCondition(URLMatcherCondition::URL_PREFIX, |
| kBeginningOfURL + prefix); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition( |
| const std::string& suffix) { |
| return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition( |
| const std::string& str) { |
| return CreateCondition(URLMatcherCondition::URL_CONTAINS, str); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition( |
| const std::string& str) { |
| return CreateCondition(URLMatcherCondition::URL_EQUALS, |
| kBeginningOfURL + str + kEndOfURL); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition( |
| const std::string& regex) { |
| return CreateCondition(URLMatcherCondition::URL_MATCHES, regex); |
| } |
| |
| URLMatcherCondition |
| URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition( |
| const std::string& regex) { |
| return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex); |
| } |
| |
| void URLMatcherConditionFactory::ForgetUnusedPatterns( |
| const std::set<StringPattern::ID>& used_patterns) { |
| PatternSingletons::iterator i = substring_pattern_singletons_.begin(); |
| while (i != substring_pattern_singletons_.end()) { |
| if (ContainsKey(used_patterns, (*i)->id())) { |
| ++i; |
| } else { |
| delete *i; |
| substring_pattern_singletons_.erase(i++); |
| } |
| } |
| i = regex_pattern_singletons_.begin(); |
| while (i != regex_pattern_singletons_.end()) { |
| if (ContainsKey(used_patterns, (*i)->id())) { |
| ++i; |
| } else { |
| delete *i; |
| regex_pattern_singletons_.erase(i++); |
| } |
| } |
| i = origin_and_path_regex_pattern_singletons_.begin(); |
| while (i != origin_and_path_regex_pattern_singletons_.end()) { |
| if (ContainsKey(used_patterns, (*i)->id())) { |
| ++i; |
| } else { |
| delete *i; |
| origin_and_path_regex_pattern_singletons_.erase(i++); |
| } |
| } |
| } |
| |
| bool URLMatcherConditionFactory::IsEmpty() const { |
| return substring_pattern_singletons_.empty() && |
| regex_pattern_singletons_.empty() && |
| origin_and_path_regex_pattern_singletons_.empty(); |
| } |
| |
| URLMatcherCondition URLMatcherConditionFactory::CreateCondition( |
| URLMatcherCondition::Criterion criterion, |
| const std::string& pattern) { |
| StringPattern search_pattern(pattern, 0); |
| PatternSingletons* pattern_singletons = NULL; |
| if (IsRegexCriterion(criterion)) |
| pattern_singletons = ®ex_pattern_singletons_; |
| else if (IsOriginAndPathRegexCriterion(criterion)) |
| pattern_singletons = &origin_and_path_regex_pattern_singletons_; |
| else |
| pattern_singletons = &substring_pattern_singletons_; |
| |
| PatternSingletons::const_iterator iter = |
| pattern_singletons->find(&search_pattern); |
| |
| if (iter != pattern_singletons->end()) { |
| return URLMatcherCondition(criterion, *iter); |
| } else { |
| StringPattern* new_pattern = |
| new StringPattern(pattern, id_counter_++); |
| pattern_singletons->insert(new_pattern); |
| return URLMatcherCondition(criterion, new_pattern); |
| } |
| } |
| |
| std::string URLMatcherConditionFactory::CanonicalizeHostSuffix( |
| const std::string& suffix) const { |
| if (!suffix.empty() && suffix[suffix.size() - 1] == '.') |
| return suffix; |
| else |
| return suffix + "."; |
| } |
| |
| std::string URLMatcherConditionFactory::CanonicalizeHostPrefix( |
| const std::string& prefix) const { |
| if (!prefix.empty() && prefix[0] == '.') |
| return prefix; |
| else |
| return "." + prefix; |
| } |
| |
| std::string URLMatcherConditionFactory::CanonicalizeHostname( |
| const std::string& hostname) const { |
| return CanonicalizeHostPrefix(CanonicalizeHostSuffix(hostname)); |
| } |
| |
| // This function prepares the query string by replacing query separator with a |
| // magic value (|kQueryComponentDelimiter|). When the boolean |
| // |prepend_beginning_of_query_component| is true the function prepends the |
| // query with the same magic. This is done to locate the start of a key value |
| // pair in the query string. The parameter |query| is passed by value |
| // intentionally, since it is locally modified. |
| std::string URLMatcherConditionFactory::CanonicalizeQuery( |
| std::string query, |
| bool prepend_beginning_of_query_component, |
| bool append_end_of_query_component) const { |
| for (std::string::iterator it = query.begin(); it != query.end(); ++it) { |
| if (*it == kQuerySeparator) |
| *it = kQueryComponentDelimiter[0]; |
| } |
| if (prepend_beginning_of_query_component) |
| query = kQueryComponentDelimiter + query; |
| if (append_end_of_query_component) |
| query += kQueryComponentDelimiter; |
| return query; |
| } |
| |
| bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()( |
| StringPattern* lhs, |
| StringPattern* rhs) const { |
| if (lhs == NULL && rhs != NULL) return true; |
| if (lhs != NULL && rhs != NULL) |
| return lhs->pattern() < rhs->pattern(); |
| // Either both are NULL or only rhs is NULL. |
| return false; |
| } |
| |
| // |
| // URLQueryElementMatcherCondition |
| // |
| |
| URLQueryElementMatcherCondition::URLQueryElementMatcherCondition( |
| const std::string& key, |
| const std::string& value, |
| QueryValueMatchType query_value_match_type, |
| QueryElementType query_element_type, |
| Type match_type, |
| URLMatcherConditionFactory* factory) { |
| match_type_ = match_type; |
| |
| if (query_element_type == ELEMENT_TYPE_KEY_VALUE) { |
| key_ = kQueryComponentDelimiter + key + "="; |
| value_ = value; |
| } else { |
| key_ = kQueryComponentDelimiter + key; |
| value_ = std::string(); |
| } |
| |
| if (query_value_match_type == QUERY_VALUE_MATCH_EXACT) |
| value_ += kQueryComponentDelimiter; |
| |
| // If |value_| is empty no need to find the |key_| and verify if the value |
| // matches. Simply checking the presence of key is sufficient, which is done |
| // by MATCH_ANY |
| if (value_.empty()) |
| match_type_ = MATCH_ANY; |
| |
| URLMatcherCondition condition; |
| // If |match_type_| is MATCH_ANY, then we could simply look for the |
| // combination of |key_| + |value_|, which can be efficiently done by |
| // SubstringMatcher |
| if (match_type_ == MATCH_ANY) |
| condition = factory->CreateQueryContainsCondition(key_ + value_); |
| else |
| condition = factory->CreateQueryContainsCondition(key_); |
| string_pattern_ = condition.string_pattern(); |
| |
| key_length_ = key_.length(); |
| value_length_ = value_.length(); |
| } |
| |
| URLQueryElementMatcherCondition::URLQueryElementMatcherCondition( |
| const URLQueryElementMatcherCondition& other) = default; |
| |
| URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() {} |
| |
| bool URLQueryElementMatcherCondition::operator<( |
| const URLQueryElementMatcherCondition& rhs) const { |
| if (match_type_ != rhs.match_type_) |
| return match_type_ < rhs.match_type_; |
| if (string_pattern_ != NULL && rhs.string_pattern_ != NULL) |
| return *string_pattern_ < *rhs.string_pattern_; |
| if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) |
| return true; |
| // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL, |
| // or both are NULL. |
| return false; |
| } |
| |
| bool URLQueryElementMatcherCondition::IsMatch( |
| const std::string& url_for_component_searches) const { |
| switch (match_type_) { |
| case MATCH_ANY: { |
| // For MATCH_ANY, no additional verification step is needed. We can trust |
| // the SubstringMatcher to do the verification. |
| return true; |
| } |
| case MATCH_ALL: { |
| size_t start = 0; |
| int found = 0; |
| size_t offset; |
| while ((offset = url_for_component_searches.find(key_, start)) != |
| std::string::npos) { |
| if (url_for_component_searches.compare( |
| offset + key_length_, value_length_, value_) != 0) { |
| return false; |
| } else { |
| ++found; |
| } |
| start = offset + key_length_ + value_length_ - 1; |
| } |
| return !!found; |
| } |
| case MATCH_FIRST: { |
| size_t offset = url_for_component_searches.find(key_); |
| return url_for_component_searches.compare( |
| offset + key_length_, value_length_, value_) == 0; |
| } |
| case MATCH_LAST: { |
| size_t offset = url_for_component_searches.rfind(key_); |
| return url_for_component_searches.compare( |
| offset + key_length_, value_length_, value_) == 0; |
| } |
| } |
| NOTREACHED(); |
| return false; |
| } |
| |
| // |
| // URLMatcherSchemeFilter |
| // |
| |
| URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter) |
| : filters_(1) { |
| filters_.push_back(filter); |
| } |
| |
| URLMatcherSchemeFilter::URLMatcherSchemeFilter( |
| const std::vector<std::string>& filters) |
| : filters_(filters) {} |
| |
| URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {} |
| |
| bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const { |
| return std::find(filters_.begin(), filters_.end(), url.scheme()) != |
| filters_.end(); |
| } |
| |
| // |
| // URLMatcherPortFilter |
| // |
| |
| URLMatcherPortFilter::URLMatcherPortFilter( |
| const std::vector<URLMatcherPortFilter::Range>& ranges) |
| : ranges_(ranges) {} |
| |
| URLMatcherPortFilter::~URLMatcherPortFilter() {} |
| |
| bool URLMatcherPortFilter::IsMatch(const GURL& url) const { |
| int port = url.EffectiveIntPort(); |
| for (std::vector<Range>::const_iterator i = ranges_.begin(); |
| i != ranges_.end(); ++i) { |
| if (i->first <= port && port <= i->second) |
| return true; |
| } |
| return false; |
| } |
| |
| // static |
| URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from, |
| int to) { |
| return Range(from, to); |
| } |
| |
| // static |
| URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) { |
| return Range(port, port); |
| } |
| |
| // |
| // URLMatcherConditionSet |
| // |
| |
| URLMatcherConditionSet::~URLMatcherConditionSet() {} |
| |
| URLMatcherConditionSet::URLMatcherConditionSet( |
| ID id, |
| const Conditions& conditions) |
| : id_(id), |
| conditions_(conditions) {} |
| |
| URLMatcherConditionSet::URLMatcherConditionSet( |
| ID id, |
| const Conditions& conditions, |
| scoped_ptr<URLMatcherSchemeFilter> scheme_filter, |
| scoped_ptr<URLMatcherPortFilter> port_filter) |
| : id_(id), |
| conditions_(conditions), |
| scheme_filter_(std::move(scheme_filter)), |
| port_filter_(std::move(port_filter)) {} |
| |
| URLMatcherConditionSet::URLMatcherConditionSet( |
| ID id, |
| const Conditions& conditions, |
| const QueryConditions& query_conditions, |
| scoped_ptr<URLMatcherSchemeFilter> scheme_filter, |
| scoped_ptr<URLMatcherPortFilter> port_filter) |
| : id_(id), |
| conditions_(conditions), |
| query_conditions_(query_conditions), |
| scheme_filter_(std::move(scheme_filter)), |
| port_filter_(std::move(port_filter)) {} |
| |
| bool URLMatcherConditionSet::IsMatch( |
| const std::set<StringPattern::ID>& matching_patterns, |
| const GURL& url) const { |
| return IsMatch(matching_patterns, url, std::string()); |
| } |
| |
| bool URLMatcherConditionSet::IsMatch( |
| const std::set<StringPattern::ID>& matching_patterns, |
| const GURL& url, |
| const std::string& url_for_component_searches) const { |
| for (Conditions::const_iterator i = conditions_.begin(); |
| i != conditions_.end(); ++i) { |
| if (!i->IsMatch(matching_patterns, url)) |
| return false; |
| } |
| if (scheme_filter_.get() && !scheme_filter_->IsMatch(url)) |
| return false; |
| if (port_filter_.get() && !port_filter_->IsMatch(url)) |
| return false; |
| if (query_conditions_.empty()) |
| return true; |
| // The loop is duplicated below for performance reasons. If not all query |
| // elements are found, no need to verify match that is expected to take more |
| // cycles. |
| for (QueryConditions::const_iterator i = query_conditions_.begin(); |
| i != query_conditions_.end(); |
| ++i) { |
| if (!ContainsKey(matching_patterns, i->string_pattern()->id())) |
| return false; |
| } |
| for (QueryConditions::const_iterator i = query_conditions_.begin(); |
| i != query_conditions_.end(); |
| ++i) { |
| if (!i->IsMatch(url_for_component_searches)) |
| return false; |
| } |
| return true; |
| } |
| |
| // |
| // URLMatcher |
| // |
| |
| URLMatcher::URLMatcher() {} |
| |
| URLMatcher::~URLMatcher() {} |
| |
| void URLMatcher::AddConditionSets( |
| const URLMatcherConditionSet::Vector& condition_sets) { |
| for (URLMatcherConditionSet::Vector::const_iterator i = |
| condition_sets.begin(); i != condition_sets.end(); ++i) { |
| DCHECK(url_matcher_condition_sets_.find((*i)->id()) == |
| url_matcher_condition_sets_.end()); |
| url_matcher_condition_sets_[(*i)->id()] = *i; |
| } |
| UpdateInternalDatastructures(); |
| } |
| |
| void URLMatcher::RemoveConditionSets( |
| const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) { |
| for (std::vector<URLMatcherConditionSet::ID>::const_iterator i = |
| condition_set_ids.begin(); i != condition_set_ids.end(); ++i) { |
| DCHECK(url_matcher_condition_sets_.find(*i) != |
| url_matcher_condition_sets_.end()); |
| url_matcher_condition_sets_.erase(*i); |
| } |
| UpdateInternalDatastructures(); |
| } |
| |
| void URLMatcher::ClearUnusedConditionSets() { |
| UpdateConditionFactory(); |
| } |
| |
| std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL( |
| const GURL& url) const { |
| // Find all IDs of StringPatterns that match |url|. |
| // See URLMatcherConditionFactory for the canonicalization of URLs and the |
| // distinction between full url searches and url component searches. |
| std::set<StringPattern::ID> matches; |
| std::string url_for_component_searches; |
| |
| if (!full_url_matcher_.IsEmpty()) { |
| full_url_matcher_.Match( |
| condition_factory_.CanonicalizeURLForFullSearches(url), &matches); |
| } |
| if (!url_component_matcher_.IsEmpty()) { |
| url_for_component_searches = |
| condition_factory_.CanonicalizeURLForComponentSearches(url); |
| url_component_matcher_.Match(url_for_component_searches, &matches); |
| } |
| if (!regex_set_matcher_.IsEmpty()) { |
| regex_set_matcher_.Match( |
| condition_factory_.CanonicalizeURLForRegexSearches(url), &matches); |
| } |
| if (!origin_and_path_regex_set_matcher_.IsEmpty()) { |
| origin_and_path_regex_set_matcher_.Match( |
| condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url), |
| &matches); |
| } |
| |
| // Calculate all URLMatcherConditionSets for which all URLMatcherConditions |
| // were fulfilled. |
| std::set<URLMatcherConditionSet::ID> result; |
| for (std::set<StringPattern::ID>::const_iterator i = matches.begin(); |
| i != matches.end(); ++i) { |
| // For each URLMatcherConditionSet there is exactly one condition |
| // registered in substring_match_triggers_. This means that the following |
| // logic tests each URLMatcherConditionSet exactly once if it can be |
| // completely fulfilled. |
| StringPatternTriggers::const_iterator triggered_condition_sets_iter = |
| substring_match_triggers_.find(*i); |
| if (triggered_condition_sets_iter == substring_match_triggers_.end()) |
| continue; // Not all substring matches are triggers for a condition set. |
| const std::set<URLMatcherConditionSet::ID>& condition_sets = |
| triggered_condition_sets_iter->second; |
| for (std::set<URLMatcherConditionSet::ID>::const_iterator j = |
| condition_sets.begin(); j != condition_sets.end(); ++j) { |
| URLMatcherConditionSets::const_iterator condition_set_iter = |
| url_matcher_condition_sets_.find(*j); |
| DCHECK(condition_set_iter != url_matcher_condition_sets_.end()); |
| if (condition_set_iter->second->IsMatch( |
| matches, url, url_for_component_searches)) |
| result.insert(*j); |
| } |
| } |
| |
| return result; |
| } |
| |
| bool URLMatcher::IsEmpty() const { |
| return condition_factory_.IsEmpty() && |
| url_matcher_condition_sets_.empty() && |
| substring_match_triggers_.empty() && |
| full_url_matcher_.IsEmpty() && |
| url_component_matcher_.IsEmpty() && |
| regex_set_matcher_.IsEmpty() && |
| origin_and_path_regex_set_matcher_.IsEmpty() && |
| registered_full_url_patterns_.empty() && |
| registered_url_component_patterns_.empty(); |
| } |
| |
| void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) { |
| // The purpose of |full_url_conditions| is just that we need to execute |
| // the same logic once for Full URL searches and once for URL Component |
| // searches (see URLMatcherConditionFactory). |
| |
| // Determine which patterns need to be registered when this function |
| // terminates. |
| std::set<const StringPattern*> new_patterns; |
| for (URLMatcherConditionSets::const_iterator condition_set_iter = |
| url_matcher_condition_sets_.begin(); |
| condition_set_iter != url_matcher_condition_sets_.end(); |
| ++condition_set_iter) { |
| const URLMatcherConditionSet::Conditions& conditions = |
| condition_set_iter->second->conditions(); |
| for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = |
| conditions.begin(); condition_iter != conditions.end(); |
| ++condition_iter) { |
| // If we are called to process Full URL searches, ignore others, and |
| // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.) |
| if (!condition_iter->IsRegexCondition() && |
| !condition_iter->IsOriginAndPathRegexCondition() && |
| full_url_conditions == condition_iter->IsFullURLCondition()) |
| new_patterns.insert(condition_iter->string_pattern()); |
| } |
| |
| if (full_url_conditions) |
| continue; |
| |
| const URLMatcherConditionSet::QueryConditions& query_conditions = |
| condition_set_iter->second->query_conditions(); |
| for (URLMatcherConditionSet::QueryConditions::const_iterator |
| query_condition_iter = query_conditions.begin(); |
| query_condition_iter != query_conditions.end(); |
| ++query_condition_iter) { |
| new_patterns.insert(query_condition_iter->string_pattern()); |
| } |
| } |
| |
| // This is the set of patterns that were registered before this function |
| // is called. |
| std::set<const StringPattern*>& registered_patterns = |
| full_url_conditions ? registered_full_url_patterns_ |
| : registered_url_component_patterns_; |
| |
| // Add all patterns that are in new_patterns but not in registered_patterns. |
| std::vector<const StringPattern*> patterns_to_register = |
| base::STLSetDifference<std::vector<const StringPattern*> >( |
| new_patterns, registered_patterns); |
| |
| // Remove all patterns that are in registered_patterns but not in |
| // new_patterns. |
| std::vector<const StringPattern*> patterns_to_unregister = |
| base::STLSetDifference<std::vector<const StringPattern*> >( |
| registered_patterns, new_patterns); |
| |
| // Update the SubstringSetMatcher. |
| SubstringSetMatcher& url_matcher = |
| full_url_conditions ? full_url_matcher_ : url_component_matcher_; |
| url_matcher.RegisterAndUnregisterPatterns(patterns_to_register, |
| patterns_to_unregister); |
| |
| // Update the set of registered_patterns for the next time this function |
| // is being called. |
| registered_patterns.swap(new_patterns); |
| } |
| |
| void URLMatcher::UpdateRegexSetMatcher() { |
| std::vector<const StringPattern*> new_patterns; |
| std::vector<const StringPattern*> new_origin_and_path_patterns; |
| |
| for (URLMatcherConditionSets::const_iterator condition_set_iter = |
| url_matcher_condition_sets_.begin(); |
| condition_set_iter != url_matcher_condition_sets_.end(); |
| ++condition_set_iter) { |
| const URLMatcherConditionSet::Conditions& conditions = |
| condition_set_iter->second->conditions(); |
| for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = |
| conditions.begin(); condition_iter != conditions.end(); |
| ++condition_iter) { |
| if (condition_iter->IsRegexCondition()) { |
| new_patterns.push_back(condition_iter->string_pattern()); |
| } else if (condition_iter->IsOriginAndPathRegexCondition()) { |
| new_origin_and_path_patterns.push_back( |
| condition_iter->string_pattern()); |
| } |
| } |
| } |
| |
| // Start over from scratch. We can't really do better than this, since the |
| // FilteredRE2 backend doesn't support incremental updates. |
| regex_set_matcher_.ClearPatterns(); |
| regex_set_matcher_.AddPatterns(new_patterns); |
| origin_and_path_regex_set_matcher_.ClearPatterns(); |
| origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns); |
| } |
| |
| void URLMatcher::UpdateTriggers() { |
| // Count substring pattern frequencies. |
| std::map<StringPattern::ID, size_t> substring_pattern_frequencies; |
| for (URLMatcherConditionSets::const_iterator condition_set_iter = |
| url_matcher_condition_sets_.begin(); |
| condition_set_iter != url_matcher_condition_sets_.end(); |
| ++condition_set_iter) { |
| const URLMatcherConditionSet::Conditions& conditions = |
| condition_set_iter->second->conditions(); |
| for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = |
| conditions.begin(); condition_iter != conditions.end(); |
| ++condition_iter) { |
| const StringPattern* pattern = condition_iter->string_pattern(); |
| substring_pattern_frequencies[pattern->id()]++; |
| } |
| |
| const URLMatcherConditionSet::QueryConditions& query_conditions = |
| condition_set_iter->second->query_conditions(); |
| for (URLMatcherConditionSet::QueryConditions::const_iterator |
| query_condition_iter = query_conditions.begin(); |
| query_condition_iter != query_conditions.end(); |
| ++query_condition_iter) { |
| const StringPattern* pattern = query_condition_iter->string_pattern(); |
| substring_pattern_frequencies[pattern->id()]++; |
| } |
| } |
| |
| // Update trigger conditions: Determine for each URLMatcherConditionSet which |
| // URLMatcherCondition contains a StringPattern that occurs least |
| // frequently in this URLMatcher. We assume that this condition is very |
| // specific and occurs rarely in URLs. If a match occurs for this |
| // URLMatcherCondition, we want to test all other URLMatcherCondition in the |
| // respective URLMatcherConditionSet as well to see whether the entire |
| // URLMatcherConditionSet is considered matching. |
| substring_match_triggers_.clear(); |
| for (URLMatcherConditionSets::const_iterator condition_set_iter = |
| url_matcher_condition_sets_.begin(); |
| condition_set_iter != url_matcher_condition_sets_.end(); |
| ++condition_set_iter) { |
| const URLMatcherConditionSet::Conditions& conditions = |
| condition_set_iter->second->conditions(); |
| if (conditions.empty()) |
| continue; |
| URLMatcherConditionSet::Conditions::const_iterator condition_iter = |
| conditions.begin(); |
| StringPattern::ID trigger = condition_iter->string_pattern()->id(); |
| // We skip the first element in the following loop. |
| ++condition_iter; |
| for (; condition_iter != conditions.end(); ++condition_iter) { |
| StringPattern::ID current_id = |
| condition_iter->string_pattern()->id(); |
| if (substring_pattern_frequencies[trigger] > |
| substring_pattern_frequencies[current_id]) { |
| trigger = current_id; |
| } |
| } |
| |
| const URLMatcherConditionSet::QueryConditions& query_conditions = |
| condition_set_iter->second->query_conditions(); |
| for (URLMatcherConditionSet::QueryConditions::const_iterator |
| query_condition_iter = query_conditions.begin(); |
| query_condition_iter != query_conditions.end(); |
| ++query_condition_iter) { |
| StringPattern::ID current_id = |
| query_condition_iter->string_pattern()->id(); |
| if (substring_pattern_frequencies[trigger] > |
| substring_pattern_frequencies[current_id]) { |
| trigger = current_id; |
| } |
| } |
| |
| substring_match_triggers_[trigger].insert(condition_set_iter->second->id()); |
| } |
| } |
| |
| void URLMatcher::UpdateConditionFactory() { |
| std::set<StringPattern::ID> used_patterns; |
| for (URLMatcherConditionSets::const_iterator condition_set_iter = |
| url_matcher_condition_sets_.begin(); |
| condition_set_iter != url_matcher_condition_sets_.end(); |
| ++condition_set_iter) { |
| const URLMatcherConditionSet::Conditions& conditions = |
| condition_set_iter->second->conditions(); |
| for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = |
| conditions.begin(); condition_iter != conditions.end(); |
| ++condition_iter) { |
| used_patterns.insert(condition_iter->string_pattern()->id()); |
| } |
| const URLMatcherConditionSet::QueryConditions& query_conditions = |
| condition_set_iter->second->query_conditions(); |
| for (URLMatcherConditionSet::QueryConditions::const_iterator |
| query_condition_iter = query_conditions.begin(); |
| query_condition_iter != query_conditions.end(); |
| ++query_condition_iter) { |
| used_patterns.insert(query_condition_iter->string_pattern()->id()); |
| } |
| } |
| condition_factory_.ForgetUnusedPatterns(used_patterns); |
| } |
| |
| void URLMatcher::UpdateInternalDatastructures() { |
| UpdateSubstringSetMatcher(false); |
| UpdateSubstringSetMatcher(true); |
| UpdateRegexSetMatcher(); |
| UpdateTriggers(); |
| UpdateConditionFactory(); |
| } |
| |
| } // namespace url_matcher |