blob: ec1dc9599b5538b26a4ad668184c73e536528a54 [file] [log] [blame]
// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <stddef.h>
#include <stdint.h>
#include <map>
#include <vector>
#include "base/macros.h"
#include "base/strings/string_piece_forward.h"
#include "components/url_pattern_index/closed_hash_map.h"
#include "components/url_pattern_index/flat/url_pattern_index_generated.h"
#include "components/url_pattern_index/proto/rules.pb.h"
#include "components/url_pattern_index/uint64_hasher.h"
#include "third_party/flatbuffers/src/include/flatbuffers/flatbuffers.h"
class GURL;
namespace url {
class Origin;
namespace url_pattern_index {
// The integer type used to represent N-grams.
using NGram = uint64_t;
// The hasher used for hashing N-grams.
using NGramHasher = Uint64Hasher;
// The hash table probe sequence used both by UrlPatternIndex and its builder.
using NGramHashTableProber = DefaultProber<NGram, NGramHasher>;
// FlatBuffer offset aliases.
using UrlRuleOffset = flatbuffers::Offset<flat::UrlRule>;
using UrlPatternIndexOffset = flatbuffers::Offset<flat::UrlPatternIndex>;
using FlatStringOffset = flatbuffers::Offset<flatbuffers::String>;
using FlatDomains = flatbuffers::Vector<FlatStringOffset>;
using FlatDomainsOffset = flatbuffers::Offset<FlatDomains>;
struct OffsetVectorCompare {
bool operator()(const std::vector<FlatStringOffset>& a,
const std::vector<FlatStringOffset>& b) const;
using FlatDomainMap = std::
map<std::vector<FlatStringOffset>, FlatDomainsOffset, OffsetVectorCompare>;
constexpr size_t kNGramSize = 5;
static_assert(kNGramSize <= sizeof(NGram), "NGram type is too narrow.");
// The default element types mask as specified by the flatbuffer schema.
constexpr uint16_t kDefaultFlatElementTypesMask =
flat::ElementType_ANY & ~flat::ElementType_MAIN_FRAME;
// The default element types mask used by a proto::UrlRule.
constexpr uint32_t kDefaultProtoElementTypesMask =
// Serializes the |rule| to the FlatBuffer |builder|, and returns an offset to
// it in the resulting buffer. Returns null offset iff the |rule| could not be
// serialized because of unsupported options or it is otherwise invalid.
// |domain_map| Should point to a non-nullptr map of domain vectors to their
// existing offsets. It is used to de-dupe domain vectors in the serialized
// rules.
UrlRuleOffset SerializeUrlRule(const proto::UrlRule& rule,
flatbuffers::FlatBufferBuilder* builder,
FlatDomainMap* domain_map);
// Performs three-way comparison between two domains. In the total order defined
// by this predicate, the lengths of domains will be monotonically decreasing.
// Domains of same length are ordered in lexicographic order.
// Returns a negative value if |lhs_domain| should be ordered before
// |rhs_domain|, zero if |lhs_domain| is equal to |rhs_domain| and a positive
// value if |lhs_domain| should be ordered after |rhs_domain|.
int CompareDomains(base::StringPiece lhs_domain, base::StringPiece rhs_domain);
// The current format version of UrlPatternIndex.
// Increase this value when introducing an incompatible change to the
// UrlPatternIndex schema (flat/url_pattern_index.fbs). url_pattern_index
// clients can use this as a signal to rebuild rulesets.
constexpr int kUrlPatternIndexFormatVersion = 4;
// The class used to construct an index over the URL patterns of a set of URL
// rules. The rules themselves need to be converted to FlatBuffers format by the
// client of this class, as well as persisted into the |flat_builder| that is
// supplied in the constructor.
class UrlPatternIndexBuilder {
explicit UrlPatternIndexBuilder(flatbuffers::FlatBufferBuilder* flat_builder);
// Adds a UrlRule to the index. The caller should have already persisted the
// rule into the same |flat_builder| by a call to SerializeUrlRule returning a
// non-null |offset|, and should pass in the resulting |offset| here.
void IndexUrlRule(UrlRuleOffset offset);
// Finalizes construction of the index, serializes it using |flat_builder|,
// and returns an offset to it in the resulting FlatBuffer.
UrlPatternIndexOffset Finish();
using MutableUrlRuleList = std::vector<UrlRuleOffset>;
using MutableNGramIndex =
ClosedHashMap<NGram, MutableUrlRuleList, NGramHashTableProber>;
// Returns an N-gram of the |pattern| encoded into the NGram integer type. The
// N-gram is picked using a greedy heuristic, i.e. the one is chosen which
// corresponds to the shortest list of rules within the index. If there are no
// valid N-grams in the |pattern|, the return value is 0.
NGram GetMostDistinctiveNGram(base::StringPiece pattern);
// This index contains all non-REGEXP rules that have at least one acceptable
// N-gram. For each given rule, the N-gram used as an index key is picked
// greedily (see GetMostDistinctiveNGram).
MutableNGramIndex ngram_index_;
// A fallback list that contains all the rules with no acceptable N-gram.
MutableUrlRuleList fallback_rules_;
// Must outlive this instance.
flatbuffers::FlatBufferBuilder* flat_builder_;
// Encapsulates a read-only index built over the URL patterns of a set of URL
// rules, and provides fast matching of network requests against these rules.
class UrlPatternIndexMatcher {
enum class FindRuleStrategy {
// Any rule is returned in case multiple rules match.
// If multiple rules match, any of the rules with the highest priority is
// returned.
// Creates an instance to access the given |flat_index|. If |flat_index| is
// nullptr, then all requests return no match.
explicit UrlPatternIndexMatcher(const flat::UrlPatternIndex* flat_index);
// If the index contains one or more UrlRules that match the request, returns
// one of them, depending on the |strategy|. Otherwise, returns nullptr.
// Notes on parameters:
// - |url| should be valid and not longer than url::kMaxURLChars, otherwise
// the return value is nullptr. The length limit is chosen due to
// performance implications of matching giant URLs, along with the fact
// that in many places in Chrome (e.g. at the IPC layer), URLs longer than
// this are dropped already.
// - Exactly one of |element_type| and |activation_type| should be specified,
// i.e., not equal to *_UNSPECIFIED, otherwise the return value is nullptr.
// - |is_third_party| should be pre-computed by the caller, e.g. using the
// registry_controlled_domains library, to reflect the relation between
// |url| and |first_party_origin|.
// A rule is deemed to match the request iff all of the following applies:
// - The |url| matches the rule's UrlPattern (see url_pattern.h).
// - The |first_party_origin| matches the rule's targeted domains list.
// - |element_type| or |activation_type| is among the rule's targeted types.
// - The |is_third_party| bit matches the rule's requirement on the requested
// |url| being first-/third-party w.r.t. its |first_party_origin|.
// - The rule is not generic if |disable_generic_rules| is true.
const flat::UrlRule* FindMatch(const GURL& url,
const url::Origin& first_party_origin,
proto::ElementType element_type,
proto::ActivationType activation_type,
bool is_third_party,
bool disable_generic_rules,
FindRuleStrategy strategy) const;
// Helper function to work with flat::*Type(s). If the index contains one or
// more UrlRules that match the request, returns one of them depending on
// |strategy|. Otherwise, returns nullptr.
const flat::UrlRule* FindMatch(const GURL& url,
const url::Origin& first_party_origin,
flat::ElementType element_type,
flat::ActivationType activation_type,
bool is_third_party,
bool disable_generic_rules,
FindRuleStrategy strategy) const;
// Must outlive this instance.
const flat::UrlPatternIndex* flat_index_;
} // namespace url_pattern_index