|  | // Copyright 2019 The Chromium Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | #include "extensions/browser/api/declarative_net_request/regex_rules_matcher.h" | 
|  |  | 
|  | #include <algorithm> | 
|  |  | 
|  | #include "base/logging.h" | 
|  | #include "base/strings/string_util.h" | 
|  | #include "base/strings/stringprintf.h" | 
|  | #include "components/url_pattern_index/url_pattern_index.h" | 
|  | #include "extensions/browser/api/declarative_net_request/request_action.h" | 
|  | #include "extensions/browser/api/declarative_net_request/request_params.h" | 
|  | #include "extensions/browser/api/declarative_net_request/utils.h" | 
|  |  | 
|  | namespace extensions { | 
|  | namespace declarative_net_request { | 
|  | namespace flat_rule = url_pattern_index::flat; | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | bool IsExtraHeadersMatcherInternal( | 
|  | const RegexRulesMatcher::RegexRulesList* regex_list) { | 
|  | DCHECK(regex_list); | 
|  |  | 
|  | // We only support removing a subset of extra headers currently. If that | 
|  | // changes, the implementation here should change as well. | 
|  | static_assert(flat::ActionType_count == 6, | 
|  | "Modify this method to ensure IsExtraHeadersMatcherInternal is " | 
|  | "updated as new actions are added."); | 
|  |  | 
|  | return std::any_of(regex_list->begin(), regex_list->end(), | 
|  | [](const flat::RegexRule* regex_rule) { | 
|  | return regex_rule->action_type() == | 
|  | flat::ActionType_modify_headers; | 
|  | }); | 
|  | } | 
|  |  | 
|  | re2::StringPiece ToRE2StringPiece(const ::flatbuffers::String& str) { | 
|  | return re2::StringPiece(str.c_str(), str.size()); | 
|  | } | 
|  |  | 
|  | // Helper to check if the |rule| metadata matches the given request |params|. | 
|  | bool DoesRuleMetadataMatchRequest(const flat_rule::UrlRule& rule, | 
|  | const RequestParams& params) { | 
|  | // Evaluates `element_type`, `method`, `is_third_party` and | 
|  | // `embedder_conditions_matcher`. | 
|  | if (!url_pattern_index::DoesRuleFlagsMatch( | 
|  | rule, params.element_type, flat_rule::ActivationType_NONE, | 
|  | params.method, params.is_third_party, | 
|  | params.embedder_conditions_matcher)) { | 
|  | return false; | 
|  | } | 
|  |  | 
|  | // Compares included and excluded request domains. | 
|  | if (!url_pattern_index::DoesURLMatchRequestDomainList(*params.url, rule)) | 
|  | return false; | 
|  |  | 
|  | // Compares included and excluded initiator domains. | 
|  | return url_pattern_index::DoesOriginMatchInitiatorDomainList( | 
|  | params.first_party_origin, rule); | 
|  | } | 
|  |  | 
|  | bool IsBeforeRequestAction(flat::ActionType action_type) { | 
|  | switch (action_type) { | 
|  | case flat::ActionType_block: | 
|  | case flat::ActionType_allow: | 
|  | case flat::ActionType_redirect: | 
|  | case flat::ActionType_upgrade_scheme: | 
|  | case flat::ActionType_allow_all_requests: | 
|  | return true; | 
|  | case flat::ActionType_modify_headers: | 
|  | return false; | 
|  | case flat::ActionType_count: | 
|  | NOTREACHED(); | 
|  | } | 
|  | return false; | 
|  | } | 
|  |  | 
|  | }  // namespace | 
|  |  | 
|  | RegexRuleInfo::RegexRuleInfo(const flat::RegexRule* regex_rule, | 
|  | const re2::RE2* regex) | 
|  | : regex_rule(regex_rule), regex(regex) { | 
|  | DCHECK(regex_rule); | 
|  | DCHECK(regex); | 
|  | } | 
|  | RegexRuleInfo::RegexRuleInfo(const RegexRuleInfo& info) = default; | 
|  | RegexRuleInfo& RegexRuleInfo::operator=(const RegexRuleInfo& info) = default; | 
|  |  | 
|  | RegexRulesMatcher::RegexRulesMatcher(const ExtensionId& extension_id, | 
|  | RulesetID ruleset_id, | 
|  | const RegexRulesList* regex_list, | 
|  | const ExtensionMetadataList* metadata_list) | 
|  | : RulesetMatcherBase(extension_id, ruleset_id), | 
|  | regex_list_(regex_list), | 
|  | metadata_list_(metadata_list), | 
|  | is_extra_headers_matcher_(IsExtraHeadersMatcherInternal(regex_list)) { | 
|  | InitializeMatcher(); | 
|  | } | 
|  |  | 
|  | RegexRulesMatcher::~RegexRulesMatcher() = default; | 
|  |  | 
|  | std::vector<RequestAction> RegexRulesMatcher::GetModifyHeadersActions( | 
|  | const RequestParams& params, | 
|  | absl::optional<uint64_t> min_priority) const { | 
|  | const std::vector<RegexRuleInfo>& potential_matches = | 
|  | GetPotentialMatches(params); | 
|  |  | 
|  | std::vector<const flat_rule::UrlRule*> rules; | 
|  | for (const RegexRuleInfo& info : potential_matches) { | 
|  | // Check for the rule's priority iff |min_priority| is specified. | 
|  | bool has_sufficient_priority = | 
|  | !min_priority || | 
|  | info.regex_rule->url_rule()->priority() > *min_priority; | 
|  |  | 
|  | if (has_sufficient_priority && | 
|  | info.regex_rule->action_type() == flat::ActionType_modify_headers && | 
|  | re2::RE2::PartialMatch(params.url->spec(), *info.regex)) { | 
|  | rules.push_back(info.regex_rule->url_rule()); | 
|  | } | 
|  | } | 
|  |  | 
|  | return GetModifyHeadersActionsFromMetadata(params, rules, *metadata_list_); | 
|  | } | 
|  |  | 
|  | absl::optional<RequestAction> RegexRulesMatcher::GetAllowAllRequestsAction( | 
|  | const RequestParams& params) const { | 
|  | const std::vector<RegexRuleInfo>& potential_matches = | 
|  | GetPotentialMatches(params); | 
|  | auto info = std::find_if(potential_matches.begin(), potential_matches.end(), | 
|  | [¶ms](const RegexRuleInfo& info) { | 
|  | return info.regex_rule->action_type() == | 
|  | flat::ActionType_allow_all_requests && | 
|  | re2::RE2::PartialMatch(params.url->spec(), | 
|  | *info.regex); | 
|  | }); | 
|  | if (info == potential_matches.end()) | 
|  | return absl::nullopt; | 
|  |  | 
|  | return CreateAllowAllRequestsAction(params, *info->regex_rule->url_rule()); | 
|  | } | 
|  |  | 
|  | absl::optional<RequestAction> | 
|  | RegexRulesMatcher::GetBeforeRequestActionIgnoringAncestors( | 
|  | const RequestParams& params) const { | 
|  | const std::vector<RegexRuleInfo>& potential_matches = | 
|  | GetPotentialMatches(params); | 
|  | auto info = std::find_if( | 
|  | potential_matches.begin(), potential_matches.end(), | 
|  | [¶ms](const RegexRuleInfo& info) { | 
|  | return IsBeforeRequestAction(info.regex_rule->action_type()) && | 
|  | re2::RE2::PartialMatch(params.url->spec(), *info.regex); | 
|  | }); | 
|  | if (info == potential_matches.end()) | 
|  | return absl::nullopt; | 
|  |  | 
|  | const flat_rule::UrlRule& rule = *info->regex_rule->url_rule(); | 
|  | switch (info->regex_rule->action_type()) { | 
|  | case flat::ActionType_block: | 
|  | return CreateBlockOrCollapseRequestAction(params, rule); | 
|  | case flat::ActionType_allow: | 
|  | return CreateAllowAction(params, rule); | 
|  | case flat::ActionType_redirect: | 
|  | // If this is a regex substitution rule, handle the substitution. Else | 
|  | // create the redirect action from the information in |metadata_list_| | 
|  | // below. | 
|  | return info->regex_rule->regex_substitution() | 
|  | ? CreateRegexSubstitutionRedirectAction(params, *info) | 
|  | : CreateRedirectActionFromMetadata(params, rule, | 
|  | *metadata_list_); | 
|  | case flat::ActionType_upgrade_scheme: | 
|  | return CreateUpgradeAction(params, rule); | 
|  | case flat::ActionType_allow_all_requests: | 
|  | return CreateAllowAllRequestsAction(params, rule); | 
|  | case flat::ActionType_modify_headers: | 
|  | case flat::ActionType_count: | 
|  | NOTREACHED(); | 
|  | break; | 
|  | } | 
|  |  | 
|  | return absl::nullopt; | 
|  | } | 
|  |  | 
|  | void RegexRulesMatcher::InitializeMatcher() { | 
|  | if (IsEmpty()) | 
|  | return; | 
|  |  | 
|  | for (const auto* regex_rule : *regex_list_) { | 
|  | const flat_rule::UrlRule* rule = regex_rule->url_rule(); | 
|  |  | 
|  | const bool is_case_sensitive = | 
|  | !(rule->options() & flat_rule::OptionFlag_IS_CASE_INSENSITIVE); | 
|  |  | 
|  | const bool require_capturing = !!regex_rule->regex_substitution(); | 
|  |  | 
|  | // TODO(karandeepb): Regex compilation can be expensive and sometimes we are | 
|  | // compiling the same regex twice, once during rule indexing and now during | 
|  | // ruleset loading. We should try maintaining a global cache of compiled | 
|  | // regexes and modify FilteredRE2 to take a regex object directly. | 
|  | int re2_id; | 
|  | re2::RE2::ErrorCode error_code = filtered_re2_.Add( | 
|  | ToRE2StringPiece(*rule->url_pattern()), | 
|  | CreateRE2Options(is_case_sensitive, require_capturing), &re2_id); | 
|  |  | 
|  | // Ideally there shouldn't be any error, since we had already validated the | 
|  | // regular expression while indexing the ruleset. That said, there are cases | 
|  | // possible where this may happen, for example, the library's implementation | 
|  | // may change etc. | 
|  | // TODO(crbug.com/1050780): Notify the extension about the same. | 
|  | if (error_code != re2::RE2::NoError) | 
|  | continue; | 
|  |  | 
|  | const bool did_insert = | 
|  | re2_id_to_rules_map_.insert({re2_id, regex_rule}).second; | 
|  | DCHECK(did_insert) << "Duplicate |re2_id| seen."; | 
|  | } | 
|  |  | 
|  | // FilteredRE2 on compilation yields a set of candidate strings. These aid in | 
|  | // pre-filtering and obtaining the set of potential matches for a request. | 
|  | std::vector<std::string> strings_to_match; | 
|  | filtered_re2_.Compile(&strings_to_match); | 
|  |  | 
|  | // FilteredRE2 guarantees that the returned set of candidate strings is | 
|  | // lower-cased. | 
|  | DCHECK(std::all_of(strings_to_match.begin(), strings_to_match.end(), | 
|  | [](const std::string& s) { | 
|  | return std::all_of(s.begin(), s.end(), [](const char c) { | 
|  | return !base::IsAsciiUpper(c); | 
|  | }); | 
|  | })); | 
|  |  | 
|  | // Convert |strings_to_match| to MatcherStringPatterns. This is necessary to | 
|  | // use url_matcher::SubstringSetMatcher. | 
|  | std::vector<base::MatcherStringPattern> patterns; | 
|  | patterns.reserve(strings_to_match.size()); | 
|  |  | 
|  | for (size_t i = 0; i < strings_to_match.size(); ++i) | 
|  | patterns.emplace_back(std::move(strings_to_match[i]), i); | 
|  |  | 
|  | substring_matcher_ = std::make_unique<base::SubstringSetMatcher>(); | 
|  |  | 
|  | // This is only used for regex rules, which are limited to 1000, | 
|  | // so hitting the 8MB limit should be all but impossible. | 
|  | bool success = substring_matcher_->Build(patterns); | 
|  | CHECK(success); | 
|  | } | 
|  |  | 
|  | bool RegexRulesMatcher::IsEmpty() const { | 
|  | return regex_list_->Length() == 0; | 
|  | } | 
|  |  | 
|  | const std::vector<RegexRuleInfo>& RegexRulesMatcher::GetPotentialMatches( | 
|  | const RequestParams& params) const { | 
|  | auto iter = params.potential_regex_matches.find(this); | 
|  | if (iter != params.potential_regex_matches.end()) | 
|  | return iter->second; | 
|  |  | 
|  | // Early out if this is an empty matcher. | 
|  | if (IsEmpty()) { | 
|  | auto result = params.potential_regex_matches.insert( | 
|  | std::make_pair(this, std::vector<RegexRuleInfo>())); | 
|  | return result.first->second; | 
|  | } | 
|  |  | 
|  | // Compute the potential matches. FilteredRE2 requires the text to be lower | 
|  | // cased first. | 
|  | if (!params.lower_cased_url_spec) | 
|  | params.lower_cased_url_spec = base::ToLowerASCII(params.url->spec()); | 
|  |  | 
|  | // To pre-filter the set of regexes to match against |params|, we first need | 
|  | // to compute the set of candidate strings tracked by |substring_matcher_| | 
|  | // within |params.lower_cased_url_spec|. | 
|  | std::set<base::MatcherStringPattern::ID> candidate_ids_set; | 
|  | DCHECK(substring_matcher_); | 
|  | substring_matcher_->Match(*params.lower_cased_url_spec, &candidate_ids_set); | 
|  | std::vector<int> candidate_ids_list(candidate_ids_set.begin(), | 
|  | candidate_ids_set.end()); | 
|  |  | 
|  | // FilteredRE2 then yields the set of potential regex matches. | 
|  | std::vector<int> potential_re2_ids; | 
|  | filtered_re2_.AllPotentials(candidate_ids_list, &potential_re2_ids); | 
|  |  | 
|  | // We prune the set of potential matches even further by matching request | 
|  | // metadata. | 
|  | std::vector<RegexRuleInfo> potential_matches; | 
|  | for (int re2_id : potential_re2_ids) { | 
|  | auto it = re2_id_to_rules_map_.find(re2_id); | 
|  | DCHECK(it != re2_id_to_rules_map_.end()); | 
|  |  | 
|  | const flat::RegexRule* rule = it->second; | 
|  | if (!DoesRuleMetadataMatchRequest(*rule->url_rule(), params)) | 
|  | continue; | 
|  |  | 
|  | const RE2& regex = filtered_re2_.GetRE2(re2_id); | 
|  | potential_matches.emplace_back(rule, ®ex); | 
|  | } | 
|  |  | 
|  | // Sort potential matches in descending order of priority. | 
|  | std::sort(potential_matches.begin(), potential_matches.end(), | 
|  | [](const RegexRuleInfo& lhs, const RegexRuleInfo& rhs) { | 
|  | return lhs.regex_rule->url_rule()->priority() > | 
|  | rhs.regex_rule->url_rule()->priority(); | 
|  | }); | 
|  |  | 
|  | // Cache |potential_matches|. | 
|  | auto result = params.potential_regex_matches.insert( | 
|  | std::make_pair(this, std::move(potential_matches))); | 
|  | return result.first->second; | 
|  | } | 
|  |  | 
|  | absl::optional<RequestAction> | 
|  | RegexRulesMatcher::CreateRegexSubstitutionRedirectAction( | 
|  | const RequestParams& params, | 
|  | const RegexRuleInfo& info) const { | 
|  | // We could have extracted the captured strings during the matching stage | 
|  | // and directly used RE2::Rewrite here (which doesn't need to match the | 
|  | // regex again). However we prefer to capture the strings only when | 
|  | // necessary. Not capturing the strings should allow re2 to perform | 
|  | // additional optimizations during the matching stage. | 
|  | std::string redirect_str = params.url->spec(); | 
|  | bool success = | 
|  | RE2::Replace(&redirect_str, *info.regex, | 
|  | ToRE2StringPiece(*info.regex_rule->regex_substitution())); | 
|  | if (!success) { | 
|  | // This should generally not happen since we had already checked for a | 
|  | // match and during indexing, had verified that the substitution pattern | 
|  | // is not ill-formed. However, the re2 library implementation might have | 
|  | // changed since indexing, causing this. | 
|  | LOG(ERROR) << base::StringPrintf( | 
|  | "Rewrite failed. Regex:%s Substitution:%s URL:%s\n", | 
|  | info.regex->pattern().c_str(), | 
|  | info.regex_rule->regex_substitution()->c_str(), | 
|  | params.url->spec().c_str()); | 
|  | return absl::nullopt; | 
|  | } | 
|  |  | 
|  | GURL redirect_url(redirect_str); | 
|  |  | 
|  | // Redirects to JavaScript urls are not allowed. | 
|  | // TODO(crbug.com/1033780): this results in counterintuitive behavior. | 
|  | if (redirect_url.SchemeIs(url::kJavaScriptScheme)) | 
|  | return absl::nullopt; | 
|  |  | 
|  | return CreateRedirectAction(params, *info.regex_rule->url_rule(), | 
|  | std::move(redirect_url)); | 
|  | } | 
|  |  | 
|  | }  // namespace declarative_net_request | 
|  | }  // namespace extensions |