blob: 9bce7d2a104cdcbb1e5948ded2e5b39c81df5d7e [file] [log] [blame]
// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/ui/lens/lens_url_matcher.h"

#include <utility>

#include "base/json/json_reader.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_split.h"
#include "third_party/farmhash/src/src/farmhash.h"
namespace lens {
namespace {
// Converts a JSON string array to a vector.
std::vector<std::string> JSONArrayToVector(const std::string& json_array) {
std::optional<base::Value> json_value =
base::JSONReader::Read(json_array, base::JSON_PARSE_CHROMIUM_EXTENSIONS);
if (!json_value) {
return {};
}
base::Value::List* entries = json_value->GetIfList();
if (!entries) {
return {};
}
std::vector<std::string> result;
result.reserve(entries->size());
for (const base::Value& entry : *entries) {
const std::string* filter = entry.GetIfString();
if (filter) {
result.emplace_back(*filter);
}
}
return result;
}
} // namespace
// Builds every matcher consulted by IsMatch() from the supplied filter lists.
// All arguments except `hashed_domain_block_filters_list` are parsed as JSON
// string arrays; the hashed list is a comma-separated list of numbers.
//
// The arguments are taken by value and are not needed here once forwarded, so
// they are moved into the initializers instead of being copied a second time.
LensUrlMatcher::LensUrlMatcher(std::string url_allow_filters,
                               std::string url_block_filters,
                               std::string path_match_allow_filters,
                               std::string path_match_block_filters,
                               std::string url_forced_allowed_match_patterns,
                               std::string hashed_domain_block_filters_list) {
  // A single ID counter is threaded through every ID-assigning initializer.
  // IsMatch() collects results from all of these matchers into one ID set, so
  // the shared counter is intended to keep their IDs from colliding.
  base::MatcherStringPattern::ID id(0);
  InitializeUrlMatcher(std::move(url_allow_filters),
                       std::move(url_block_filters), &id);
  InitializeForceAllowUrlPatterns(std::move(url_forced_allowed_match_patterns),
                                  &id);
  InitializePathAllowMatcher(std::move(path_match_allow_filters), &id);
  InitializePathBlockMatcher(std::move(path_match_block_filters), &id);
  InitializeHashedDomainBlockFilters(
      std::move(hashed_domain_block_filters_list));
}
// Defaulted: no explicit teardown logic is written here.
LensUrlMatcher::~LensUrlMatcher() = default;
void LensUrlMatcher::InitializeUrlMatcher(std::string url_allow_filters,
std::string url_block_filters,
base::MatcherStringPattern::ID* id) {
url_matcher_ = std::make_unique<url_matcher::URLMatcher>();
url_matcher::util::AddFiltersWithLimit(url_matcher_.get(), true, id,
JSONArrayToVector(url_allow_filters),
&url_filters_);
url_matcher::util::AddFiltersWithLimit(url_matcher_.get(), false, id,
JSONArrayToVector(url_block_filters),
&url_filters_);
}
void LensUrlMatcher::InitializeForceAllowUrlPatterns(
std::string url_path_forced_allowed_match_patterns,
base::MatcherStringPattern::ID* id) {
auto force_allow_url_strings =
JSONArrayToVector(url_path_forced_allowed_match_patterns);
std::vector<base::MatcherStringPattern> force_allow_url_patterns;
std::vector<const base::MatcherStringPattern*> force_allow_url_pointers;
force_allow_url_patterns.reserve(force_allow_url_strings.size());
force_allow_url_pointers.reserve(force_allow_url_strings.size());
for (const std::string& entry : force_allow_url_strings) {
(*id)++;
force_allow_url_patterns.emplace_back(entry, *id);
force_allow_url_pointers.push_back(&force_allow_url_patterns.back());
}
url_forced_allow_matcher = std::make_unique<url_matcher::RegexSetMatcher>();
// Pointers will not be referenced after AddPatterns() completes.
url_forced_allow_matcher->AddPatterns(force_allow_url_pointers);
}
void LensUrlMatcher::InitializePathAllowMatcher(
std::string path_match_allow_filters,
base::MatcherStringPattern::ID* id) {
const auto allow_strings = JSONArrayToVector(path_match_allow_filters);
std::vector<base::MatcherStringPattern> allow_patterns;
std::vector<const base::MatcherStringPattern*> allow_pointers;
allow_patterns.reserve(allow_strings.size());
allow_pointers.reserve(allow_strings.size());
for (const std::string& entry : allow_strings) {
(*id)++;
allow_patterns.emplace_back(entry, *id);
allow_pointers.push_back(&allow_patterns.back());
}
path_allow_matcher_ = std::make_unique<url_matcher::RegexSetMatcher>();
// Pointers will not be referenced after AddPatterns() completes.
path_allow_matcher_->AddPatterns(allow_pointers);
}
void LensUrlMatcher::InitializePathBlockMatcher(
std::string path_match_block_filters,
base::MatcherStringPattern::ID* id) {
const auto block_strings = JSONArrayToVector(path_match_block_filters);
std::vector<base::MatcherStringPattern> block_patterns;
std::vector<const base::MatcherStringPattern*> block_pointers;
block_patterns.reserve(block_strings.size());
block_pointers.reserve(block_strings.size());
for (const std::string& entry : block_strings) {
(*id)++;
block_patterns.emplace_back(entry, *id);
block_pointers.push_back(&block_patterns.back());
}
path_block_matcher_ = std::make_unique<url_matcher::RegexSetMatcher>();
// Pointers will not be referenced after AddPatterns() completes.
path_block_matcher_->AddPatterns(block_pointers);
}
// Fills `hashed_domain_block_filters_` from a comma-separated list of numbers.
// Tokens are whitespace-trimmed and empty tokens are dropped; any token that
// base::StringToUint() rejects is silently skipped.
void LensUrlMatcher::InitializeHashedDomainBlockFilters(
    std::string hashed_domain_block_filters_list) {
  const auto tokens = base::SplitStringPiece(
      hashed_domain_block_filters_list, ",",
      base::WhitespaceHandling::TRIM_WHITESPACE,
      base::SplitResult::SPLIT_WANT_NONEMPTY);

  for (const std::string_view token : tokens) {
    uint32_t parsed_hash = 0;
    if (!base::StringToUint(token, &parsed_hash)) {
      continue;
    }
    hashed_domain_block_filters_.insert(parsed_hash);
  }
}
// Returns true when `url` passes every filter stage, evaluated in this order:
//   1. The URL matches at least one URL filter.
//   2. None of the URL filters it matched is a block filter.
//   3. Neither the host nor any of its parent domains is in the hashed block
//      list.
//   4. The path does not match a block path pattern.
//   5. The full URL matches a force-allowed pattern, or the path matches an
//      allowed path pattern (or no allowed path patterns are configured).
bool LensUrlMatcher::IsMatch(const GURL& url) {
  // Stage 1: with no URL filter hit at all there is nothing allowing this URL.
  auto matched_ids = url_matcher_->MatchURL(url);
  if (matched_ids.empty()) {
    return false;
  }

  // Stage 2: blocks take precedence over allows, so one block hit is enough.
  for (const auto matched_id : matched_ids) {
    if (!url_filters_[matched_id].allow) {
      return false;
    }
  }

  // Stage 3: hashed domain block list.
  if (SubdomainsMatchHash(url.GetHost())) {
    return false;
  }

  // Stage 4: block path patterns. `matched_ids` is reused as the output set
  // for the regex matchers below.
  const bool has_path_block_rules =
      path_block_matcher_ && !path_block_matcher_->IsEmpty();
  if (has_path_block_rules &&
      path_block_matcher_->Match(url.GetPath(), &matched_ids)) {
    return false;
  }

  // Stage 5a: a force-allowed URL is shown even when its path would not match
  // any allowlisted path pattern.
  const bool has_forced_allow_rules =
      url_forced_allow_matcher && !url_forced_allow_matcher->IsEmpty();
  if (has_forced_allow_rules &&
      url_forced_allow_matcher->Match(url.spec(), &matched_ids)) {
    return true;
  }

  // Stage 5b: when allowed path patterns exist the path has to match one of
  // them; without any such patterns the URL is accepted as is.
  const bool has_path_allow_rules =
      path_allow_matcher_ && !path_allow_matcher_->IsEmpty();
  return !has_path_allow_rules ||
         path_allow_matcher_->Match(url.GetPath(), &matched_ids);
}
// Returns true if `str`, or any suffix of it obtained by repeatedly dropping
// the leftmost dot-separated label, satisfies MatchesHash(). Leading and
// trailing periods are ignored. Returns false for input that is empty or made
// up of periods only.
bool LensUrlMatcher::SubdomainsMatchHash(std::string_view str) {
  const size_t first = str.find_first_not_of('.');
  if (first == std::string_view::npos) {
    return false;
  }
  const size_t last = str.find_last_not_of('.');

  // Hostname with surrounding periods stripped.
  std::string_view candidate = str.substr(first, last - first + 1);
  for (;;) {
    if (MatchesHash(candidate)) {
      return true;
    }
    const size_t dot = candidate.find('.');
    if (dot == std::string_view::npos) {
      // Reached the top-level domain without a hit.
      return false;
    }
    // Drop the leftmost label and its separator, then try the parent domain.
    candidate.remove_prefix(dot + 1);
  }
}
// Returns true if the 32-bit fingerprint of `str` is present in
// `hashed_domain_block_filters_`.
bool LensUrlMatcher::MatchesHash(std::string_view str) {
  const uint32_t fingerprint = util::Fingerprint32(str);
  return hashed_domain_block_filters_.contains(fingerprint);
}
} // namespace lens