| // Copyright 2018 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "chrome/browser/browser_switcher/browser_switcher_sitelist.h" |
| |
| #include <string.h> |
| |
| #include <algorithm> |
| #include <optional> |
| #include <sstream> |
| #include <string> |
| #include <string_view> |
| #include <utility> |
| #include <vector> |
| |
| #include "base/functional/bind.h" |
| #include "base/metrics/histogram_macros.h" |
| #include "base/strings/strcat.h" |
| #include "base/strings/string_util.h" |
| #include "base/values.h" |
| #include "chrome/browser/browser_switcher/browser_switcher_prefs.h" |
| #include "chrome/browser/browser_switcher/ieem_sitelist_parser.h" |
| #include "components/prefs/pref_service.h" |
| #include "components/url_formatter/url_fixer.h" |
| #include "third_party/re2/src/re2/re2.h" |
| #include "url/gurl.h" |
| #include "url/url_util.h" |
| |
| namespace browser_switcher { |
| |
| namespace { |
| |
| // Find the position of |token| inside |input|, if present. Ignore case for |
| // ASCII characters. |
| // |
| // If |token| is not in |input|, return a pointer to the null-byte at the end |
| // of |input|. |
| auto StringFindInsensitiveASCII(std::string_view input, |
| std::string_view token) { |
| return std::ranges::search(input, token, std::equal_to<>(), |
| &base::ToLowerASCII<char>, |
| &base::ToLowerASCII<char>); |
| } |
| |
| // Checks if the omitted prefix for a non-fully specific prefix is one of the |
| // expected parts that are allowed to be omitted (e.g. "https://"). |
| bool IsValidPrefix(std::string_view prefix) { |
| static re2::LazyRE2 re = {"(https?|file):(//)?"}; |
| return prefix.empty() || re2::RE2::FullMatch(prefix, *re); |
| } |
| |
| // Checks whether |patterns| contains a pattern that matches |url|, and returns |
| // the longest matching pattern. If there are no matches, an empty pattern is |
| // returned. |
| // |
| // If |contains_inverted_matches| is true, treat patterns that start with "!" as |
| // inverted matches. |
| const Rule* MatchUrlToList(const NoCopyUrl& url, |
| const std::vector<std::unique_ptr<Rule>>& rules, |
| bool contains_inverted_matches) { |
| const Rule* reason = nullptr; |
| for (const std::unique_ptr<Rule>& rule : rules) { |
| DCHECK(rule); |
| if (reason && rule->priority() <= reason->priority()) |
| continue; |
| if (rule->inverted() && !contains_inverted_matches) |
| continue; |
| if (rule->Matches(url)) |
| reason = rule.get(); |
| } |
| return reason; |
| } |
| |
| // Rules that are just an "*" are the most simple: they just return true all the |
| // time, regardless of ParsingMode. |
| class WildcardRule : public Rule { |
| public: |
| WildcardRule() : Rule("*") {} |
| ~WildcardRule() override = default; |
| |
| bool Matches(const NoCopyUrl& url) const override { return true; } |
| |
| bool IsValid() const override { return true; } |
| |
| std::string ToString() const override { return "*"; } |
| }; |
| |
| // Rules with ParsingMode::kDefault. They treat rules with/without a '/' |
| // separately. They do some pre-processing to come up with a |canonical_| rule |
| // string, then some simple string searches. |
| class DefaultModeRule : public Rule { |
| public: |
| explicit DefaultModeRule(std::string_view original_rule) |
| : Rule(original_rule) { |
| canonical_ = std::string(original_rule); |
| |
| // Drop the leading "!", if present. |
| if (inverted()) |
| canonical_ = canonical_.substr(1); |
| |
| if (canonical_.find("/") == std::string::npos) { |
| // No "/" in the string. It's a hostnmae or wildcard, so just convert to |
| // lowercase. |
| canonical_ = base::ToLowerASCII(canonical_); |
| return; |
| } |
| |
| // The string has a "/" in it. It could be: |
| // - "//example.com/abc", convert hostname to lowercase |
| // - "example.com/abc", treat same as "//example.com/abc" |
| // - "http://example.com/abc", convert hostname and scheme to lowercase |
| // - "/abc", keep capitalization |
| |
| if (base::StartsWith(canonical_, "/") && |
| !base::StartsWith(canonical_, "//")) { |
| // Rule starts with a single slash, e.g. "/abc". Don't change case. |
| return; |
| } |
| |
| if (canonical_.find("/") != 0 && |
| canonical_.find("://") == std::string::npos) { |
| // Transform "example.com/abc" => "//example.com/abc". |
| canonical_.insert(0, "//"); |
| } |
| |
| // For patterns that include a "/": parse the URL to get the proper |
| // capitalization (for scheme/hostname). |
| // |
| // To properly parse URLs with no scheme, we need a valid base URL. We use |
| // "ftp://XXX/", which is a valid URL with an unsupported scheme. That |
| // way, parsing still succeeds, and we can easily know when the scheme |
| // isn't part of the original pattern (and omit it from the output). |
| const char* placeholder_scheme = "ftp:"; |
| std::string placeholder = base::StrCat({placeholder_scheme, "//XXX/"}); |
| GURL base_url(placeholder); |
| |
| GURL relative_url = base_url.Resolve(canonical_); |
| std::string_view spec = relative_url.possibly_invalid_spec(); |
| |
| // The parsed URL might start with "ftp://XXX/" or "ftp://". Remove that |
| // prefix. |
| auto remainder = base::RemovePrefix(spec, placeholder, |
| base::CompareCase::INSENSITIVE_ASCII); |
| if (remainder) { |
| spec = *remainder; |
| } |
| remainder = base::RemovePrefix(spec, placeholder_scheme, |
| base::CompareCase::INSENSITIVE_ASCII); |
| if (remainder) { |
| spec = *remainder; |
| } |
| canonical_ = std::string(spec); |
| } |
| |
| ~DefaultModeRule() override = default; |
| |
| bool Matches(const NoCopyUrl& url) const override { |
| std::string_view pattern = canonical_; |
| |
| if (pattern.find('/') != std::string_view::npos) { |
| // Check that the prefix is valid. The URL's hostname/scheme have |
| // already been case-normalized, so that part of the URL is always |
| // case-insensitive. |
| size_t pos = url.spec().find(pattern); |
| if (pos != std::string_view::npos && |
| IsValidPrefix(std::string_view(url.spec().data(), pos))) { |
| return true; |
| } |
| if (!url.spec_without_port().empty()) { |
| pos = url.spec_without_port().find(pattern); |
| return pos != std::string_view::npos && |
| IsValidPrefix( |
| std::string_view(url.spec_without_port().data(), pos)); |
| } |
| return false; |
| } |
| |
| // Compare hosts and ports, case-insensitive. |
| auto result = StringFindInsensitiveASCII(url.host_and_port(), pattern); |
| return result.begin() != url.host_and_port().end(); |
| } |
| |
| bool IsValid() const override { return true; } |
| |
| std::string ToString() const override { |
| if (inverted()) |
| return "!" + canonical_; |
| return canonical_; |
| } |
| |
| private: |
| // The canonical version of the rule, with the leading "!" removed if it's |
| // inverted. |
| std::string canonical_; |
| }; |
| |
| // Rules with ParsingMode::kIESiteListMode. They treat rules the same regardless |
| // of whether a '/' is present. They parse the rule as a URL, then split it |
| // into scheme, host, port, and path parts. They compare each of these parts |
| // with the URL to be matched. |
| class IESiteListModeRule : public Rule { |
| public: |
| explicit IESiteListModeRule(std::string_view original_rule) |
| : Rule(original_rule) { |
| // Parse the string as a URL and extract its parts. |
| // |
| // Some parts of the URL will be dropped, to match IE/Edge behavior: |
| // - username |
| // - password |
| // - query |
| // - fragment |
| |
| // Drop the leading "!", if present. |
| if (inverted()) |
| original_rule = original_rule.substr(1); |
| |
| // Rules with leading slashes are interpreted as file:// URLs on POSIX |
| // systems. To make it more consistent with Windows, remove the leading |
| // slashes. |
| // |
| // Only remove the first leading slash, to be consistent with Edge (which |
| // *does* parse it as a file:// URL if there are 2 slashes). |
| if (base::StartsWith(original_rule, "/")) |
| original_rule = original_rule.substr(1); |
| |
| // Parse as a URL. This is more relaxed than GURL's constructor, e.g. it |
| // adds http:// if the scheme is missing. |
| // |
| // This lets us parse strings like "example.com", even though they're not |
| // fully-specified URLs (missing scheme and path). |
| GURL url = url_formatter::FixupURL(std::string(original_rule), ""); |
| |
| if (!url.is_valid() || |
| (!url.SchemeIsHTTPOrHTTPS() && !url.SchemeIsFile())) { |
| // The rule is invalid, so it won't match anything. Continue parsing it, |
| // in case we want to print it later for debugging/troubleshooting. |
| valid_ = false; |
| } |
| |
| // If it starts with http:// or https://, preserve the scheme. Otherwise, |
| // use a wildcard ("*") as the scheme. |
| // |
| // "http://" may have been added by FixupUrl(), so look for it in the |
| // original string instead. |
| if (valid_ && |
| (StringFindInsensitiveASCII(original_rule, "http://").begin() == |
| original_rule.begin() || |
| StringFindInsensitiveASCII(original_rule, "https://").begin() == |
| original_rule.begin() || |
| url.SchemeIsFile())) { |
| scheme_ = url.scheme(); |
| } |
| |
| if (url.has_host()) |
| host_ = url.host(); |
| |
| if (url.has_port()) |
| port_ = url.IntPort(); |
| |
| // Make sure |path_| always has at least the leading slash. |
| if (url.has_path() && !url.path_piece().empty()) |
| path_ = base::ToLowerASCII(url.path()); |
| else |
| path_ = "/"; |
| } |
| |
| ~IESiteListModeRule() override = default; |
| |
| bool Matches(const NoCopyUrl& no_copy_url) const override { |
| DCHECK(valid_); |
| |
| const GURL& url = no_copy_url.original(); |
| // Compare schemes, if present in the rule. |
| if (scheme_ && url.scheme_piece() != *scheme_) { |
| return false; |
| } |
| |
| // Compare hosts. |
| if (!url::DomainIs(url.host_piece(), host_)) |
| return false; |
| |
| // Compare ports, if present in the rule. |
| if (port_ && url.IntPort() != *port_) |
| return false; |
| |
| // Compare paths, case-insensitively. They must match at the beginning. |
| return StringFindInsensitiveASCII(url.path_piece(), path_).begin() == |
| url.path_piece().begin(); |
| } |
| |
| bool IsValid() const override { return valid_; } |
| |
| // Typical return value looks like "*://example.com:8000/path". |
| std::string ToString() const override { |
| DCHECK(valid_); |
| |
| std::ostringstream out; |
| |
| if (inverted()) |
| out << "!"; |
| |
| // <scheme>:// |
| if (scheme_) |
| out << *scheme_; |
| else |
| out << "*"; |
| out << "://"; |
| |
| // <host>:<port> |
| out << host_; |
| if (port_) |
| out << ":" << *port_; |
| |
| // <path> |
| out << path_; |
| |
| return out.str(); |
| } |
| |
| private: |
| std::optional<std::string> scheme_; |
| std::string host_; |
| std::optional<int> port_; |
| // Always at least a "/". |
| std::string path_; |
| |
| bool valid_ = true; |
| }; |
| |
| } // namespace |
| |
| std::unique_ptr<Rule> CanonicalizeRule(std::string_view original_rule, |
| ParsingMode parsing_mode) { |
| std::unique_ptr<Rule> rule; |
| |
| if (original_rule == "*") { |
| rule = std::make_unique<WildcardRule>(); |
| } else { |
| switch (parsing_mode) { |
| case ParsingMode::kDefault: |
| rule = std::make_unique<DefaultModeRule>(original_rule); |
| break; |
| case ParsingMode::kIESiteListMode: |
| rule = std::make_unique<IESiteListModeRule>(original_rule); |
| break; |
| default: |
| NOTREACHED(); |
| } |
| } |
| |
| if (!rule || !rule->IsValid()) |
| return nullptr; |
| else |
| return rule; |
| } |
| |
| Decision::Decision(Action action_, Reason reason_, const Rule* matching_rule_) |
| : action(action_), reason(reason_), matching_rule(matching_rule_) {} |
| |
| Decision::Decision() = default; |
| Decision::Decision(Decision&) = default; |
| Decision::Decision(Decision&&) = default; |
| |
| bool Decision::operator==(const Decision& that) const { |
| if (action != that.action || reason != that.reason) |
| return false; |
| if (matching_rule == that.matching_rule) |
| return true; |
| if (!matching_rule || !that.matching_rule) |
| return false; |
| return matching_rule->ToString() == that.matching_rule->ToString(); |
| } |
| |
| BrowserSwitcherSitelist::~BrowserSwitcherSitelist() = default; |
| |
| bool BrowserSwitcherSitelist::ShouldSwitch(const GURL& url) const { |
| return GetDecision(url).action == kGo; |
| } |
| |
| BrowserSwitcherSitelistImpl::BrowserSwitcherSitelistImpl( |
| BrowserSwitcherPrefs* prefs) |
| : prefs_(prefs) { |
| prefs_changed_subscription_ = prefs_->RegisterPrefsChangedCallback( |
| base::BindRepeating(&BrowserSwitcherSitelistImpl::OnPrefsChanged, |
| base::Unretained(this))); |
| } |
| |
| BrowserSwitcherSitelistImpl::~BrowserSwitcherSitelistImpl() = default; |
| |
| Decision BrowserSwitcherSitelistImpl::GetDecision(const GURL& url) const { |
| // Don't record metrics for LBS non-users. |
| if (!IsActive()) |
| return {kStay, kDisabled, nullptr}; |
| |
| Decision decision = GetDecisionImpl(url); |
| UMA_HISTOGRAM_BOOLEAN("BrowserSwitcher.Decision", decision.action == kGo); |
| return decision; |
| } |
| |
| Decision BrowserSwitcherSitelistImpl::GetDecisionImpl(const GURL& url) const { |
| SCOPED_UMA_HISTOGRAM_TIMER("BrowserSwitcher.DecisionTime"); |
| |
| if (!url.SchemeIsHTTPOrHTTPS() && !url.SchemeIsFile()) { |
| return {kStay, kProtocol, nullptr}; |
| } |
| |
| NoCopyUrl no_copy_url(url); |
| const RuleSet* rulesets[] = {&prefs_->GetRules(), &ieem_sitelist_, |
| &external_sitelist_, &external_greylist_}; |
| |
| const Rule* reason_to_go = nullptr; |
| for (const RuleSet* rules : rulesets) { |
| const Rule* match = MatchUrlToList(no_copy_url, rules->sitelist, |
| /*contains_inverted_matches=*/true); |
| if (!match) |
| continue; |
| if (!reason_to_go || match->priority() > reason_to_go->priority()) |
| reason_to_go = match; |
| } |
| |
| // If sitelists don't match, no need to check the greylists. |
| if (!reason_to_go) |
| return {kStay, kDefault, nullptr}; |
| if (reason_to_go->inverted()) |
| return {kStay, kSitelist, reason_to_go}; |
| |
| const Rule* reason_to_stay = nullptr; |
| for (const RuleSet* rules : rulesets) { |
| const Rule* match = MatchUrlToList(no_copy_url, rules->greylist, |
| /*contains_inverted_matches=*/false); |
| if (!match) |
| continue; |
| if (!reason_to_stay || match->priority() > reason_to_stay->priority()) |
| reason_to_stay = match; |
| } |
| |
| if (reason_to_go->priority() <= 1 && reason_to_stay) |
| return {kStay, kGreylist, reason_to_stay}; |
| |
| if (!reason_to_stay || reason_to_go->priority() >= reason_to_stay->priority()) |
| return {kGo, kSitelist, reason_to_go}; |
| else |
| return {kStay, kGreylist, reason_to_stay}; |
| } |
| |
| void BrowserSwitcherSitelistImpl::SetIeemSitelist(RawRuleSet&& rules) { |
| UMA_HISTOGRAM_COUNTS_100000("BrowserSwitcher.IeemSitelistSize", |
| rules.sitelist.size()); |
| StoreRules(ieem_sitelist_, rules); |
| original_ieem_sitelist_ = std::move(rules); |
| } |
| |
| void BrowserSwitcherSitelistImpl::SetExternalSitelist(RawRuleSet&& rules) { |
| UMA_HISTOGRAM_COUNTS_100000("BrowserSwitcher.ExternalSitelistSize", |
| rules.sitelist.size()); |
| StoreRules(external_sitelist_, rules); |
| original_external_sitelist_ = std::move(rules); |
| } |
| |
| void BrowserSwitcherSitelistImpl::SetExternalGreylist(RawRuleSet&& rules) { |
| UMA_HISTOGRAM_COUNTS_100000("BrowserSwitcher.ExternalGreylistSize", |
| rules.sitelist.size()); |
| DCHECK(rules.sitelist.empty()); |
| StoreRules(external_greylist_, rules); |
| original_external_greylist_ = std::move(rules); |
| } |
| |
| const RuleSet* BrowserSwitcherSitelistImpl::GetIeemSitelist() const { |
| return &ieem_sitelist_; |
| } |
| |
| const RuleSet* BrowserSwitcherSitelistImpl::GetExternalSitelist() const { |
| return &external_sitelist_; |
| } |
| |
| const RuleSet* BrowserSwitcherSitelistImpl::GetExternalGreylist() const { |
| return &external_greylist_; |
| } |
| |
| void BrowserSwitcherSitelistImpl::StoreRules(RuleSet& dst, |
| const RawRuleSet& src) { |
| dst.sitelist.clear(); |
| dst.greylist.clear(); |
| ParsingMode parsing_mode = prefs_->GetParsingMode(); |
| for (const std::string& original_rule : src.sitelist) { |
| std::unique_ptr<Rule> rule = CanonicalizeRule(original_rule, parsing_mode); |
| if (rule) |
| dst.sitelist.push_back(std::move(rule)); |
| } |
| for (const std::string& original_rule : src.greylist) { |
| std::unique_ptr<Rule> rule = CanonicalizeRule(original_rule, parsing_mode); |
| if (rule) |
| dst.greylist.push_back(std::move(rule)); |
| } |
| } |
| |
| void BrowserSwitcherSitelistImpl::OnPrefsChanged( |
| BrowserSwitcherPrefs* prefs, |
| const std::vector<std::string>& changed_prefs) { |
| auto it = std::ranges::find(changed_prefs, prefs::kParsingMode); |
| if (it != changed_prefs.end()) { |
| // ParsingMode changed, re-canonicalize rules. |
| StoreRules(ieem_sitelist_, original_ieem_sitelist_); |
| StoreRules(external_sitelist_, original_external_sitelist_); |
| StoreRules(external_greylist_, original_external_greylist_); |
| } |
| } |
| |
| bool BrowserSwitcherSitelistImpl::IsActive() const { |
| if (!prefs_->IsEnabled()) |
| return false; |
| |
| const RuleSet* rulesets[] = {&prefs_->GetRules(), &ieem_sitelist_, |
| &external_sitelist_, &external_greylist_}; |
| for (const RuleSet* rules : rulesets) { |
| if (!rules->sitelist.empty() || !rules->greylist.empty()) |
| return true; |
| } |
| return false; |
| } |
| |
| } // namespace browser_switcher |