| // Copyright 2019 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "extensions/browser/api/declarative_net_request/regex_rules_matcher.h" |
| |
| #include <algorithm> |
| #include <optional> |
| |
| #include "base/containers/contains.h" |
| #include "base/logging.h" |
| #include "base/notreached.h" |
| #include "base/strings/string_util.h" |
| #include "base/strings/stringprintf.h" |
| #include "components/url_pattern_index/url_pattern_index.h" |
| #include "extensions/browser/api/declarative_net_request/request_action.h" |
| #include "extensions/browser/api/declarative_net_request/request_params.h" |
| #include "extensions/browser/api/declarative_net_request/utils.h" |
| |
| namespace extensions::declarative_net_request { |
| namespace flat_rule = url_pattern_index::flat; |
| |
| namespace { |
| |
| bool IsExtraHeadersMatcherInternal( |
| const RegexRulesMatcher::RegexRulesList* regex_list) { |
| DCHECK(regex_list); |
| |
| // We only support removing a subset of extra headers currently. If that |
| // changes, the implementation here should change as well. |
| static_assert(flat::ActionType_count == 6, |
| "Modify this method to ensure IsExtraHeadersMatcherInternal is " |
| "updated as new actions are added."); |
| |
| return base::Contains(*regex_list, flat::ActionType_modify_headers, |
| &flat::RegexRule::action_type); |
| } |
| |
| // Helper to check if the `rule` metadata matches the given request `params`. |
| bool DoesRuleMetadataMatchRequest(const flat_rule::UrlRule& rule, |
| const RequestParams& params) { |
| // Evaluates `element_type`, `method`, `is_third_party` and |
| // `embedder_conditions_matcher`. |
| if (!url_pattern_index::DoesRuleFlagsMatch( |
| rule, params.element_type, flat_rule::ActivationType_NONE, |
| params.method, params.is_third_party, |
| params.embedder_conditions_matcher)) { |
| return false; |
| } |
| |
| // Compares included and excluded request domains. |
| if (!url_pattern_index::DoesURLMatchRequestDomainList(*params.url, rule)) { |
| return false; |
| } |
| |
| // Compares included and excluded initiator domains. |
| return url_pattern_index::DoesOriginMatchInitiatorDomainList( |
| params.first_party_origin, rule); |
| } |
| |
| // For the given `action_type`, returns: |
| // - true if multiple actions of this type can be matched for a request. |
| // - false if an action of this type that is matched to a request will exclude |
| // all other actions from matching to that request. |
| bool ActionTypeAllowsMultipleActions(flat::ActionType action_type) { |
| switch (action_type) { |
| case flat::ActionType_block: |
| case flat::ActionType_allow: |
| case flat::ActionType_redirect: |
| case flat::ActionType_upgrade_scheme: |
| case flat::ActionType_allow_all_requests: |
| return false; |
| case flat::ActionType_modify_headers: |
| return true; |
| case flat::ActionType_count: |
| NOTREACHED(); |
| } |
| return true; |
| } |
| |
| } // namespace |
| |
| RegexRuleInfo::RegexRuleInfo(const flat::RegexRule* regex_rule, |
| const re2::RE2* regex) |
| : regex_rule(regex_rule), regex(regex) { |
| DCHECK(regex_rule); |
| DCHECK(regex); |
| } |
| RegexRuleInfo::RegexRuleInfo(const RegexRuleInfo& info) = default; |
| RegexRuleInfo& RegexRuleInfo::operator=(const RegexRuleInfo& info) = default; |
| |
| RegexRulesMatcher::RegexRulesMatcher( |
| const ExtensionId& extension_id, |
| RulesetID ruleset_id, |
| const RegexRulesList* before_request_regex_list, |
| const RegexRulesList* headers_received_regex_list, |
| const ExtensionMetadataList* metadata_list) |
| : RulesetMatcherBase(extension_id, ruleset_id), |
| before_request_matcher_(before_request_regex_list, |
| this, |
| RulesetMatchingStage::kOnBeforeRequest), |
| headers_received_matcher_(headers_received_regex_list, |
| this, |
| RulesetMatchingStage::kOnHeadersReceived), |
| metadata_list_(metadata_list), |
| // See comments for this field in extension_url_pattern_index_matcher.cc |
| // for why different checks are used for `before_request_regex_list` and |
| // `headers_received_regex_list`. |
| is_extra_headers_matcher_( |
| IsExtraHeadersMatcherInternal(before_request_regex_list) || |
| headers_received_regex_list->size() > 0) {} |
| |
| RegexRulesMatcher::~RegexRulesMatcher() = default; |
| |
| bool RegexRulesMatcher::IsExtraHeadersMatcher() const { |
| return is_extra_headers_matcher_; |
| } |
| |
| size_t RegexRulesMatcher::GetRulesCount() const { |
| return GetBeforeRequestRulesCount() + GetHeadersReceivedRulesCount(); |
| } |
| |
| size_t RegexRulesMatcher::GetBeforeRequestRulesCount() const { |
| return before_request_matcher_.GetRulesCount(); |
| } |
| |
| size_t RegexRulesMatcher::GetHeadersReceivedRulesCount() const { |
| return headers_received_matcher_.GetRulesCount(); |
| } |
| |
| std::vector<RequestAction> RegexRulesMatcher::GetModifyHeadersActions( |
| const RequestParams& params, |
| RulesetMatchingStage stage, |
| std::optional<uint64_t> min_priority) const { |
| const std::vector<RegexRuleInfo>& potential_matches = |
| GetMatcherForStage(stage).GetPotentialMatches(params); |
| |
| std::vector<const flat_rule::UrlRule*> rules; |
| for (const RegexRuleInfo& info : potential_matches) { |
| // Check for the rule's priority iff `min_priority` is specified. |
| bool has_sufficient_priority = |
| !min_priority || |
| info.regex_rule->url_rule()->priority() > *min_priority; |
| |
| if (has_sufficient_priority && |
| info.regex_rule->action_type() == flat::ActionType_modify_headers && |
| re2::RE2::PartialMatch(params.url->spec(), *info.regex)) { |
| rules.push_back(info.regex_rule->url_rule()); |
| } |
| } |
| |
| return GetModifyHeadersActionsFromMetadata(params, rules, *metadata_list_); |
| } |
| |
| std::optional<RequestAction> RegexRulesMatcher::GetAllowAllRequestsAction( |
| const RequestParams& params, |
| RulesetMatchingStage stage) const { |
| const std::vector<RegexRuleInfo>& potential_matches = |
| GetMatcherForStage(stage).GetPotentialMatches(params); |
| auto info = std::ranges::find_if( |
| potential_matches, [¶ms](const RegexRuleInfo& info) { |
| return info.regex_rule->action_type() == |
| flat::ActionType_allow_all_requests && |
| re2::RE2::PartialMatch(params.url->spec(), *info.regex); |
| }); |
| if (info == potential_matches.end()) { |
| return std::nullopt; |
| } |
| |
| return CreateAllowAllRequestsAction(params, *info->regex_rule->url_rule()); |
| } |
| |
| std::optional<RequestAction> RegexRulesMatcher::GetActionIgnoringAncestors( |
| const RequestParams& params, |
| RulesetMatchingStage stage) const { |
| const std::vector<RegexRuleInfo>& potential_matches = |
| GetMatcherForStage(stage).GetPotentialMatches(params); |
| auto info = std::ranges::find_if( |
| potential_matches, [¶ms](const RegexRuleInfo& info) { |
| return !ActionTypeAllowsMultipleActions( |
| info.regex_rule->action_type()) && |
| re2::RE2::PartialMatch(params.url->spec(), *info.regex); |
| }); |
| |
| return info == potential_matches.end() ? std::nullopt |
| : CreateActionFromInfo(params, *info); |
| } |
| |
| RegexRulesMatcher::MatchHelper::MatchHelper( |
| const raw_ptr<const RegexRulesList> regex_list, |
| const RegexRulesMatcher* parent_matcher, |
| RulesetMatchingStage stage) |
| : regex_list_(regex_list), regex_match_key_(parent_matcher, stage) { |
| InitializeMatcher(); |
| } |
| |
| RegexRulesMatcher::MatchHelper::~MatchHelper() = default; |
| |
| size_t RegexRulesMatcher::MatchHelper::GetRulesCount() const { |
| return regex_list_->size(); |
| } |
| |
| const std::vector<RegexRuleInfo>& |
| RegexRulesMatcher::MatchHelper::GetPotentialMatches( |
| const RequestParams& params) const { |
| auto iter = params.potential_regex_matches.find(regex_match_key_); |
| if (iter != params.potential_regex_matches.end()) { |
| return iter->second; |
| } |
| |
| // Early out if this is an empty matcher. |
| if (IsEmpty()) { |
| auto result = params.potential_regex_matches.insert( |
| std::make_pair(regex_match_key_, std::vector<RegexRuleInfo>())); |
| return result.first->second; |
| } |
| |
| // Compute the potential matches. FilteredRE2 requires the text to be lower |
| // cased first. |
| if (!params.lower_cased_url_spec) { |
| params.lower_cased_url_spec = base::ToLowerASCII(params.url->spec()); |
| } |
| |
| // To pre-filter the set of regexes to match against `params`, we first need |
| // to compute the set of candidate strings tracked by `substring_matcher_` |
| // within `params.lower_cased_url_spec`. |
| std::set<base::MatcherStringPattern::ID> candidate_ids_set; |
| DCHECK(substring_matcher_); |
| substring_matcher_->Match(*params.lower_cased_url_spec, &candidate_ids_set); |
| std::vector<int> candidate_ids_list(candidate_ids_set.begin(), |
| candidate_ids_set.end()); |
| |
| // FilteredRE2 then yields the set of potential regex matches. |
| std::vector<int> potential_re2_ids; |
| filtered_re2_.AllPotentials(candidate_ids_list, &potential_re2_ids); |
| |
| // We prune the set of potential matches even further by matching request |
| // metadata. |
| std::vector<RegexRuleInfo> potential_matches; |
| for (int re2_id : potential_re2_ids) { |
| auto it = re2_id_to_rules_map_.find(re2_id); |
| CHECK(it != re2_id_to_rules_map_.end()); |
| |
| const flat::RegexRule* rule = it->second; |
| if (!DoesRuleMetadataMatchRequest(*rule->url_rule(), params)) { |
| continue; |
| } |
| |
| const RE2& regex = filtered_re2_.GetRE2(re2_id); |
| potential_matches.emplace_back(rule, ®ex); |
| } |
| |
| // Sort potential matches in descending order of priority. |
| std::sort(potential_matches.begin(), potential_matches.end(), |
| [](const RegexRuleInfo& lhs, const RegexRuleInfo& rhs) { |
| return lhs.regex_rule->url_rule()->priority() > |
| rhs.regex_rule->url_rule()->priority(); |
| }); |
| |
| // Cache `potential_matches`. |
| auto result = params.potential_regex_matches.insert( |
| std::make_pair(regex_match_key_, std::move(potential_matches))); |
| return result.first->second; |
| } |
| |
| bool RegexRulesMatcher::MatchHelper::IsEmpty() const { |
| return regex_list_->size() == 0; |
| } |
| |
| void RegexRulesMatcher::MatchHelper::InitializeMatcher() { |
| if (IsEmpty()) { |
| return; |
| } |
| |
| for (const auto* regex_rule : *regex_list_) { |
| const flat_rule::UrlRule* rule = regex_rule->url_rule(); |
| |
| const bool is_case_sensitive = |
| rule->options() & flat_rule::OptionFlag_IS_MATCH_CASE; |
| |
| const bool require_capturing = !!regex_rule->regex_substitution(); |
| |
| // TODO(karandeepb): Regex compilation can be expensive and sometimes we are |
| // compiling the same regex twice, once during rule indexing and now during |
| // ruleset loading. We should try maintaining a global cache of compiled |
| // regexes and modify FilteredRE2 to take a regex object directly. |
| int re2_id; |
| re2::RE2::ErrorCode error_code = filtered_re2_.Add( |
| rule->url_pattern()->string_view(), |
| CreateRE2Options(is_case_sensitive, require_capturing), &re2_id); |
| |
| // Ideally there shouldn't be any error, since we had already validated the |
| // regular expression while indexing the ruleset. That said, there are cases |
| // possible where this may happen, for example, the library's implementation |
| // may change etc. |
| // TODO(crbug.com/40118204): Notify the extension about the same. |
| if (error_code != re2::RE2::NoError) { |
| continue; |
| } |
| |
| const bool did_insert = |
| re2_id_to_rules_map_.insert({re2_id, regex_rule}).second; |
| DCHECK(did_insert) << "Duplicate |re2_id| seen."; |
| } |
| |
| // FilteredRE2 on compilation yields a set of candidate strings. These aid in |
| // pre-filtering and obtaining the set of potential matches for a request. |
| std::vector<std::string> strings_to_match; |
| filtered_re2_.Compile(&strings_to_match); |
| |
| // FilteredRE2 guarantees that the returned set of candidate strings is |
| // lower-cased. |
| DCHECK(std::ranges::all_of(strings_to_match, [](const std::string& s) { |
| return std::ranges::all_of( |
| s, [](const char c) { return !base::IsAsciiUpper(c); }); |
| })); |
| |
| // Convert `strings_to_match` to MatcherStringPatterns. This is necessary to |
| // use url_matcher::SubstringSetMatcher. |
| std::vector<base::MatcherStringPattern> patterns; |
| patterns.reserve(strings_to_match.size()); |
| |
| for (size_t i = 0; i < strings_to_match.size(); ++i) { |
| patterns.emplace_back(std::move(strings_to_match[i]), i); |
| } |
| |
| substring_matcher_ = std::make_unique<base::SubstringSetMatcher>(); |
| |
| // This is only used for regex rules, which are limited to 1000, |
| // so hitting the 8MB limit should be all but impossible. |
| bool success = substring_matcher_->Build(patterns); |
| CHECK(success); |
| } |
| |
| std::optional<RequestAction> RegexRulesMatcher::CreateActionFromInfo( |
| const RequestParams& params, |
| const RegexRuleInfo& info) const { |
| const flat_rule::UrlRule& rule = *info.regex_rule->url_rule(); |
| switch (info.regex_rule->action_type()) { |
| case flat::ActionType_block: |
| return CreateBlockOrCollapseRequestAction(params, rule); |
| case flat::ActionType_allow: |
| return CreateAllowAction(params, rule); |
| case flat::ActionType_redirect: |
| // If this is a regex substitution rule, handle the substitution. Else |
| // create the redirect action from the information in `metadata_list_` |
| // below. |
| return info.regex_rule->regex_substitution() |
| ? CreateRegexSubstitutionRedirectAction(params, info) |
| : CreateRedirectActionFromMetadata(params, rule, |
| *metadata_list_); |
| case flat::ActionType_upgrade_scheme: |
| return CreateUpgradeAction(params, rule); |
| case flat::ActionType_allow_all_requests: |
| return CreateAllowAllRequestsAction(params, rule); |
| case flat::ActionType_modify_headers: |
| case flat::ActionType_count: |
| NOTREACHED(); |
| } |
| |
| return std::nullopt; |
| } |
| |
| std::optional<RequestAction> |
| RegexRulesMatcher::CreateRegexSubstitutionRedirectAction( |
| const RequestParams& params, |
| const RegexRuleInfo& info) const { |
| // We could have extracted the captured strings during the matching stage |
| // and directly used RE2::Rewrite here (which doesn't need to match the |
| // regex again). However we prefer to capture the strings only when |
| // necessary. Not capturing the strings should allow re2 to perform |
| // additional optimizations during the matching stage. |
| std::string redirect_str = params.url->spec(); |
| bool success = |
| RE2::Replace(&redirect_str, *info.regex, |
| info.regex_rule->regex_substitution()->string_view()); |
| if (!success) { |
| // This should generally not happen since we had already checked for a |
| // match and during indexing, had verified that the substitution pattern |
| // is not ill-formed. However, the re2 library implementation might have |
| // changed since indexing, causing this. |
| LOG(ERROR) << base::StringPrintf( |
| "Rewrite failed. Regex:%s Substitution:%s URL:%s\n", |
| info.regex->pattern().c_str(), |
| info.regex_rule->regex_substitution()->c_str(), |
| params.url->spec().c_str()); |
| return std::nullopt; |
| } |
| |
| GURL redirect_url(redirect_str); |
| |
| // Redirects to JavaScript urls are not allowed. |
| // TODO(crbug.com/40111509): this results in counterintuitive behavior. |
| if (redirect_url.SchemeIs(url::kJavaScriptScheme)) { |
| return std::nullopt; |
| } |
| |
| return CreateRedirectAction(params, *info.regex_rule->url_rule(), |
| std::move(redirect_url)); |
| } |
| |
| const RegexRulesMatcher::MatchHelper& RegexRulesMatcher::GetMatcherForStage( |
| RulesetMatchingStage stage) const { |
| switch (stage) { |
| case RulesetMatchingStage::kOnBeforeRequest: |
| return before_request_matcher_; |
| case RulesetMatchingStage::kOnHeadersReceived: |
| return headers_received_matcher_; |
| } |
| |
| NOTREACHED(); |
| } |
| |
| } // namespace extensions::declarative_net_request |