blob: a9762fa15607286d6ed72a43b3457195d6754e2a [file] [log] [blame]
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// The matching logic distinguishes between the terms URL pattern and
// subpattern. A URL pattern usually stands for the full thing, e.g.
// "example.com^*path*par=val^", whereas subpattern denotes a maximal substring
// of a pattern not containing the wildcard '*' character. For the example above
// the subpatterns are: "example.com^", "path" and "par=val^".
//
// The separator placeholder '^' symbol is used in subpatterns to match any
// separator character, which is any ASCII symbol except letters, digits, and
// the following: '_', '-', '.', '%'. Note that the separator placeholder
// character '^' is itself a separator, as well as '\0'.
#include "components/url_pattern_index/url_pattern.h"
#include <stddef.h>
#include <algorithm>
#include <ostream>
#include "base/logging.h"
#include "base/numerics/checked_math.h"
#include "base/strings/string_util.h"
#include "components/url_pattern_index/flat/url_pattern_index_generated.h"
#include "components/url_pattern_index/fuzzy_pattern_matching.h"
#include "components/url_pattern_index/string_splitter.h"
#include "url/gurl.h"
#include "url/third_party/mozilla/url_parse.h"
namespace url_pattern_index {
namespace {
class IsWildcard {
public:
bool operator()(char c) const { return c == '*'; }
};
proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) {
switch (type) {
case flat::UrlPatternType_SUBSTRING:
return proto::URL_PATTERN_TYPE_SUBSTRING;
case flat::UrlPatternType_WILDCARDED:
return proto::URL_PATTERN_TYPE_WILDCARDED;
case flat::UrlPatternType_REGEXP:
return proto::URL_PATTERN_TYPE_REGEXP;
default:
return proto::URL_PATTERN_TYPE_UNSPECIFIED;
}
}
proto::AnchorType ConvertAnchorType(flat::AnchorType type) {
switch (type) {
case flat::AnchorType_NONE:
return proto::ANCHOR_TYPE_NONE;
case flat::AnchorType_BOUNDARY:
return proto::ANCHOR_TYPE_BOUNDARY;
case flat::AnchorType_SUBDOMAIN:
return proto::ANCHOR_TYPE_SUBDOMAIN;
default:
return proto::ANCHOR_TYPE_UNSPECIFIED;
}
}
base::StringPiece ConvertString(const flatbuffers::String* string) {
return string ? base::StringPiece(string->data(), string->size())
: base::StringPiece();
}
bool HasAnyUpperAscii(base::StringPiece string) {
return std::any_of(string.begin(), string.end(), base::IsAsciiUpper<char>);
}
// Returns whether |position| within the |url| belongs to its |host| component
// and corresponds to the beginning of a (sub-)domain.
inline bool IsSubdomainAnchored(base::StringPiece url,
url::Component host,
size_t position) {
DCHECK_LE(position, url.size());
const size_t host_begin = static_cast<size_t>(host.begin);
const size_t host_end = static_cast<size_t>(host.end());
DCHECK_LE(host_end, url.size());
return position == host_begin ||
(position > host_begin && position <= host_end &&
url[position - 1] == '.');
}
// Returns the position of the leftmost occurrence of a |subpattern| in the
// |text| starting no earlier than |from| the specified position. If the
// |subpattern| has separator placeholders, searches for a fuzzy occurrence.
size_t FindSubpattern(base::StringPiece text,
base::StringPiece subpattern,
size_t from = 0) {
const bool is_fuzzy =
(subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);
return is_fuzzy ? FindFuzzy(text, subpattern, from)
: text.find(subpattern, from);
}
// Same as FindSubpattern(url, subpattern), but searches for an occurrence that
// starts at the beginning of a (sub-)domain within the url's |host| component.
size_t FindSubdomainAnchoredSubpattern(base::StringPiece url,
url::Component host,
base::StringPiece subpattern) {
const bool is_fuzzy =
(subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);
// Any match found after the end of the host will be discarded, so just
// avoid searching there for the subpattern to begin with.
//
// Check for overflow.
size_t max_match_end = 0;
if (!base::CheckAdd(host.end(), subpattern.length())
.AssignIfValid(&max_match_end)) {
return base::StringPiece::npos;
}
const base::StringPiece url_match_candidate = url.substr(0, max_match_end);
const base::StringPiece url_host = url.substr(0, host.end());
for (size_t position = static_cast<size_t>(host.begin);
position <= static_cast<size_t>(host.end()); ++position) {
// Enforce as a loop precondition that we are always anchored at a
// sub-domain before calling find. This is to reduce the number of potential
// searches for |subpattern|.
DCHECK(IsSubdomainAnchored(url, host, position));
position = is_fuzzy ? FindFuzzy(url_match_candidate, subpattern, position)
: url_match_candidate.find(subpattern, position);
if (position == base::StringPiece::npos ||
IsSubdomainAnchored(url, host, position)) {
return position;
}
// Enforce the loop precondition. This skips |position| to the next '.',
// within the host, which the loop itself increments to the anchored
// sub-domain.
position = url_host.find('.', position);
if (position == base::StringPiece::npos)
break;
}
return base::StringPiece::npos;
}
// Returns whether the given |url_pattern| matches the given |url_spec|.
// Compares the pattern the the url in a case-sensitive manner.
bool IsCaseSensitiveMatch(base::StringPiece url_pattern,
proto::AnchorType anchor_left,
proto::AnchorType anchor_right,
base::StringPiece url_spec,
url::Component url_host) {
DCHECK(!url_spec.empty());
StringSplitter<IsWildcard> subpatterns(url_pattern);
auto subpattern_it = subpatterns.begin();
auto subpattern_end = subpatterns.end();
if (subpattern_it == subpattern_end) {
return anchor_left == proto::ANCHOR_TYPE_NONE ||
anchor_right == proto::ANCHOR_TYPE_NONE;
}
base::StringPiece subpattern = *subpattern_it;
++subpattern_it;
// If there is only one |subpattern|, and it has a right anchor, then simply
// check that it is a suffix of the |url_spec|, and the left anchor is
// fulfilled.
if (subpattern_it == subpattern_end &&
anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
if (!EndsWithFuzzy(url_spec, subpattern))
return false;
if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY)
return url_spec.size() == subpattern.size();
if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) {
DCHECK_LE(subpattern.size(), url_spec.size());
return url_host.is_nonempty() &&
IsSubdomainAnchored(url_spec, url_host,
url_spec.size() - subpattern.size());
}
return true;
}
// Otherwise, the first |subpattern| does not have to be a suffix. But it
// still can have a left anchor. Check and handle that.
base::StringPiece text = url_spec;
if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY) {
if (!StartsWithFuzzy(url_spec, subpattern))
return false;
if (subpattern_it == subpattern_end) {
DCHECK_EQ(anchor_right, proto::ANCHOR_TYPE_NONE);
return true;
}
text.remove_prefix(subpattern.size());
} else if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) {
if (!url_host.is_nonempty())
return false;
const size_t match_begin =
FindSubdomainAnchoredSubpattern(url_spec, url_host, subpattern);
if (match_begin == base::StringPiece::npos)
return false;
if (subpattern_it == subpattern_end) {
DCHECK_EQ(anchor_right, proto::ANCHOR_TYPE_NONE);
return true;
}
text.remove_prefix(match_begin + subpattern.size());
} else {
DCHECK_EQ(anchor_left, proto::ANCHOR_TYPE_NONE);
// Get back to the initial |subpattern|, process it in the loop below.
subpattern_it = subpatterns.begin();
}
// Consecutively find all the remaining subpatterns in the |text|. If the
// pattern has a right anchor, don't search for the last subpattern, but
// instead check that it is a suffix of the |text|.
while (subpattern_it != subpattern_end) {
subpattern = *subpattern_it;
DCHECK(!subpattern.empty());
if (++subpattern_it == subpattern_end &&
anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
break;
}
const size_t match_position = FindSubpattern(text, subpattern);
if (match_position == base::StringPiece::npos)
return false;
text.remove_prefix(match_position + subpattern.size());
}
return anchor_right != proto::ANCHOR_TYPE_BOUNDARY ||
EndsWithFuzzy(text, subpattern);
}
} // namespace
UrlPattern::UrlInfo::UrlInfo(const GURL& url)
: spec_(url.possibly_invalid_spec()),
host_(url.parsed_for_possibly_invalid_spec().host) {
DCHECK(url.is_valid());
}
base::StringPiece UrlPattern::UrlInfo::GetLowerCaseSpec() const {
if (lower_case_spec_cached_)
return *lower_case_spec_cached_;
if (!HasAnyUpperAscii(spec_)) {
lower_case_spec_cached_ = spec_;
} else {
lower_case_spec_owner_ = base::ToLowerASCII(spec_);
lower_case_spec_cached_ = lower_case_spec_owner_;
}
return *lower_case_spec_cached_;
}
UrlPattern::UrlInfo::~UrlInfo() = default;
UrlPattern::UrlPattern() = default;
UrlPattern::UrlPattern(base::StringPiece url_pattern,
proto::UrlPatternType type,
MatchCase match_case)
: type_(type), url_pattern_(url_pattern), match_case_(match_case) {}
UrlPattern::UrlPattern(base::StringPiece url_pattern,
proto::AnchorType anchor_left,
proto::AnchorType anchor_right)
: type_(proto::URL_PATTERN_TYPE_WILDCARDED),
url_pattern_(url_pattern),
anchor_left_(anchor_left),
anchor_right_(anchor_right) {}
UrlPattern::UrlPattern(const flat::UrlRule& rule)
: type_(ConvertUrlPatternType(rule.url_pattern_type())),
url_pattern_(ConvertString(rule.url_pattern())),
anchor_left_(ConvertAnchorType(rule.anchor_left())),
anchor_right_(ConvertAnchorType(rule.anchor_right())),
match_case_(rule.options() & flat::OptionFlag_IS_CASE_INSENSITIVE
? MatchCase::kFalse
: MatchCase::kTrue) {}
UrlPattern::~UrlPattern() = default;
bool UrlPattern::MatchesUrl(const UrlInfo& url) const {
DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING ||
type_ == proto::URL_PATTERN_TYPE_WILDCARDED);
DCHECK(base::IsStringASCII(url_pattern_));
DCHECK(base::IsStringASCII(url.spec()));
DCHECK(base::IsStringASCII(url.GetLowerCaseSpec()));
if (match_case()) {
return IsCaseSensitiveMatch(url_pattern_, anchor_left_, anchor_right_,
url.spec(), url.host());
}
// Use the lower-cased url for case-insensitive comparison. Case-insensitive
// patterns should already be lower-cased.
DCHECK(!HasAnyUpperAscii(url_pattern_));
return IsCaseSensitiveMatch(url_pattern_, anchor_left_, anchor_right_,
url.GetLowerCaseSpec(), url.host());
}
std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) {
switch (pattern.anchor_left()) {
case proto::ANCHOR_TYPE_SUBDOMAIN:
out << '|';
FALLTHROUGH;
case proto::ANCHOR_TYPE_BOUNDARY:
out << '|';
FALLTHROUGH;
default:
break;
}
out << pattern.url_pattern();
if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY)
out << '|';
if (pattern.match_case())
out << "$match-case";
return out;
}
} // namespace url_pattern_index