blob: 0b9fa918748b8ec79900bfc53e0eaa8bbffa9216 [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/subresource_filter/tools/rule_parser/rule.h"
#include <stddef.h>
#include <algorithm>
#include <utility>
#include "base/logging.h"
#include "base/macros.h"
#include "base/strings/string_util.h"
namespace subresource_filter {
namespace {
// Returns the simplest matching strategy that can be used for the |rule|.
UrlPatternType GetUrlPatternType(const UrlRule& rule) {
const std::string& url_pattern = rule.url_pattern;
if (url_pattern.size() >= 2 && url_pattern.front() == '/' &&
url_pattern.back() == '/') {
return url_pattern_index::proto::URL_PATTERN_TYPE_REGEXP;
}
if (url_pattern.find('^') != std::string::npos ||
rule.anchor_right != url_pattern_index::proto::ANCHOR_TYPE_NONE ||
rule.anchor_left != url_pattern_index::proto::ANCHOR_TYPE_NONE) {
return url_pattern_index::proto::URL_PATTERN_TYPE_WILDCARDED;
}
size_t wildcard_pos = url_pattern.find('*');
if (wildcard_pos == std::string::npos ||
wildcard_pos == url_pattern.size() - 1) {
return url_pattern_index::proto::URL_PATTERN_TYPE_SUBSTRING;
}
if (wildcard_pos == 0) {
size_t next_wildcard = url_pattern.find('*', 1);
if (next_wildcard == std::string::npos ||
next_wildcard == url_pattern.size() - 1) {
return url_pattern_index::proto::URL_PATTERN_TYPE_SUBSTRING;
}
}
return url_pattern_index::proto::URL_PATTERN_TYPE_WILDCARDED;
}
void DomainListJoin(const google::protobuf::RepeatedPtrField<
url_pattern_index::proto::DomainListItem>& domain_list,
char separator,
std::string* output) {
for (const url_pattern_index::proto::DomainListItem& item : domain_list) {
if (item.exclude())
*output += '~';
*output += item.domain();
*output += separator;
}
output->pop_back(); // Remove the last separator.
}
} // namespace
UrlRule::UrlRule() = default;
UrlRule::UrlRule(const UrlRule&) = default;
UrlRule::~UrlRule() = default;
UrlRule& UrlRule::operator=(const UrlRule&) = default;
bool UrlRule::operator==(const UrlRule& other) const {
return is_allowlist == other.is_allowlist &&
is_third_party == other.is_third_party &&
match_case == other.match_case && type_mask == other.type_mask &&
domains == other.domains &&
url_pattern_type == other.url_pattern_type &&
url_pattern == other.url_pattern;
}
url_pattern_index::proto::UrlRule UrlRule::ToProtobuf() const {
url_pattern_index::proto::UrlRule result;
result.set_semantics(
is_allowlist ? url_pattern_index::proto::RULE_SEMANTICS_WHITELIST
: url_pattern_index::proto::RULE_SEMANTICS_BLACKLIST);
switch (is_third_party) {
case TriState::DONT_CARE:
result.set_source_type(url_pattern_index::proto::SOURCE_TYPE_ANY);
break;
case TriState::YES:
result.set_source_type(url_pattern_index::proto::SOURCE_TYPE_THIRD_PARTY);
break;
case TriState::NO:
result.set_source_type(url_pattern_index::proto::SOURCE_TYPE_FIRST_PARTY);
break;
default:
LOG(FATAL);
break;
}
if (type_mask & kAllElementTypes)
result.set_element_types(type_mask & kAllElementTypes);
if (type_mask & kAllActivationTypes) {
result.set_activation_types((type_mask & kAllActivationTypes) >>
kActivationTypesShift);
}
for (const std::string& domain : domains) {
url_pattern_index::proto::DomainListItem* list_item = result.add_domains();
if (domain.empty())
continue;
if (domain[0] == '~') {
list_item->set_domain(domain.substr(1));
list_item->set_exclude(true);
} else {
list_item->set_domain(domain);
}
}
result.set_url_pattern_type(url_pattern_type);
result.set_anchor_left(anchor_left);
result.set_anchor_right(anchor_right);
if (match_case != result.match_case())
result.set_match_case(match_case);
if (!url_pattern.empty())
result.set_url_pattern(url_pattern);
return result;
}
void UrlRule::Canonicalize() {
url_pattern_type = GetUrlPatternType(*this);
CanonicalizeUrlPattern();
CanonicalizeDomainList(&domains);
}
CssRule::CssRule() = default;
CssRule::CssRule(const CssRule&) = default;
CssRule::~CssRule() = default;
CssRule& CssRule::operator=(const CssRule&) = default;
bool CssRule::operator==(const CssRule& other) const {
return is_allowlist == other.is_allowlist && domains == other.domains &&
css_selector == other.css_selector;
}
url_pattern_index::proto::CssRule CssRule::ToProtobuf() const {
url_pattern_index::proto::CssRule result;
result.set_semantics(
is_allowlist ? url_pattern_index::proto::RULE_SEMANTICS_WHITELIST
: url_pattern_index::proto::RULE_SEMANTICS_BLACKLIST);
for (const std::string& domain : domains) {
url_pattern_index::proto::DomainListItem* list_item = result.add_domains();
if (domain.empty())
continue;
if (domain[0] == '~') {
list_item->set_domain(domain.substr(1));
list_item->set_exclude(true);
} else {
list_item->set_domain(domain);
}
}
if (!css_selector.empty())
result.set_css_selector(css_selector);
return result;
}
void CssRule::Canonicalize() {
CanonicalizeDomainList(&domains);
}
void UrlRule::CanonicalizeUrlPattern() {
if (url_pattern_type == url_pattern_index::proto::URL_PATTERN_TYPE_REGEXP)
return;
// TODO(melandory): Canonicalize more, e.g. squeeze '/\*\*+/' sequences
// down to one '*'.
if (anchor_left != url_pattern_index::proto::ANCHOR_TYPE_SUBDOMAIN &&
!url_pattern.empty() && url_pattern.front() == '*') {
url_pattern.erase(0, 1);
anchor_left = url_pattern_index::proto::ANCHOR_TYPE_NONE;
}
if (!url_pattern.empty() && url_pattern.back() == '*') {
url_pattern.erase(url_pattern.size() - 1, 1);
anchor_right = url_pattern_index::proto::ANCHOR_TYPE_NONE;
}
}
void CanonicalizeDomainList(std::vector<std::string>* domains) {
if (!domains->empty()) {
std::sort(domains->begin(), domains->end(),
[](const std::string& lhs, const std::string& rhs) {
return lhs.size() > rhs.size() ||
(lhs.size() == rhs.size() && lhs < rhs);
});
}
}
std::string ToString(const url_pattern_index::proto::UrlRule& rule) {
std::string result;
switch (rule.semantics()) {
case url_pattern_index::proto::RULE_SEMANTICS_BLACKLIST:
break;
case url_pattern_index::proto::RULE_SEMANTICS_WHITELIST:
result += "@@";
break;
default:
LOG(FATAL);
}
switch (rule.anchor_left()) {
case url_pattern_index::proto::ANCHOR_TYPE_NONE:
break;
case url_pattern_index::proto::ANCHOR_TYPE_BOUNDARY:
result += '|';
break;
case url_pattern_index::proto::ANCHOR_TYPE_SUBDOMAIN:
result += "||";
break;
default:
LOG(FATAL);
}
result += rule.url_pattern();
switch (rule.anchor_right()) {
case url_pattern_index::proto::ANCHOR_TYPE_NONE:
break;
case url_pattern_index::proto::ANCHOR_TYPE_BOUNDARY:
result += '|';
break;
default:
LOG(FATAL);
}
std::vector<std::string> options;
if (rule.element_types() != kDefaultElementTypes) {
// Try to print as few element types as possible.
int balance = 0;
for (const auto& element_type : kElementTypes) {
if (rule.element_types() & type_mask_for(element_type.type))
++balance;
else
--balance;
}
const bool print_positives = (balance <= 0);
for (const auto& element_type : kElementTypes) {
if (element_type.type == url_pattern_index::proto::ELEMENT_TYPE_POPUP)
continue;
const bool is_positive =
rule.element_types() & type_mask_for(element_type.type);
if (is_positive == print_positives) {
if (is_positive)
options.push_back(element_type.name);
else
options.push_back(std::string("~") + element_type.name);
}
}
// The POPUP type is excluded by default, therefore should be printed last.
if (rule.element_types() &
type_mask_for(url_pattern_index::proto::ELEMENT_TYPE_POPUP)) {
const auto& popup_type = kElementTypes[11];
DCHECK_EQ(url_pattern_index::proto::ELEMENT_TYPE_POPUP, popup_type.type);
options.push_back(popup_type.name);
}
}
for (const auto& activation_type : kActivationTypes) {
if (rule.activation_types() & activation_type.type)
options.push_back(activation_type.name);
}
// This workaround produces a text rule which, when parsed back from text,
// results in the same URL rule. Otherwise the parsed rule would have
// element_types == kDefaultElementTypes instead of 0.
if (!rule.element_types() && !rule.activation_types()) {
const auto& image_type = kElementTypes[2];
DCHECK_EQ(url_pattern_index::proto::ELEMENT_TYPE_IMAGE, image_type.type);
options.push_back(image_type.name);
options.push_back(std::string("~") + image_type.name);
}
std::string source_type_string;
switch (rule.source_type()) {
case url_pattern_index::proto::SOURCE_TYPE_ANY:
break;
case url_pattern_index::proto::SOURCE_TYPE_FIRST_PARTY:
source_type_string = "~";
FALLTHROUGH;
case url_pattern_index::proto::SOURCE_TYPE_THIRD_PARTY:
source_type_string += "third-party";
options.push_back(std::move(source_type_string));
break;
default:
LOG(FATAL);
}
if (rule.match_case())
options.push_back("match-case");
if (rule.domains_size()) {
std::string domains = "domain=";
DomainListJoin(rule.domains(), '|', &domains);
options.push_back(std::move(domains));
}
if (!options.empty()) {
result += '$';
result += base::JoinString(options, ",");
}
return result;
}
std::string ToString(const url_pattern_index::proto::CssRule& rule) {
std::string result;
if (rule.domains_size())
DomainListJoin(rule.domains(), ',', &result);
switch (rule.semantics()) {
case url_pattern_index::proto::RULE_SEMANTICS_BLACKLIST:
result += "##";
break;
case url_pattern_index::proto::RULE_SEMANTICS_WHITELIST:
result += "#@#";
break;
default:
LOG(FATAL);
}
return result += rule.css_selector();
}
std::ostream& operator<<(std::ostream& os, const UrlRule& rule) {
return os << "UrlRule(" << ToString(rule.ToProtobuf()) << ")";
}
std::ostream& operator<<(std::ostream& os, const CssRule& rule) {
return os << "CssRule(" << ToString(rule.ToProtobuf()) << ")";
}
} // namespace subresource_filter