blob: 9c15a7d2a62279663f4d68d5071cf329b879ed7d [file] [log] [blame]
// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/browser/first_party_sets/first_party_set_parser.h"
#include <cstdint>
#include <iterator>
#include <string>
#include <utility>
#include <vector>
#include "base/containers/contains.h"
#include "base/containers/flat_map.h"
#include "base/containers/flat_set.h"
#include "base/json/json_reader.h"
#include "base/logging.h"
#include "base/ranges/algorithm.h"
#include "base/strings/string_util.h"
#include "base/types/expected.h"
#include "base/values.h"
#include "content/public/browser/first_party_sets_handler.h"
#include "content/public/common/content_features.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "net/base/schemeful_site.h"
#include "net/first_party_sets/first_party_set_entry.h"
#include "third_party/abseil-cpp/absl/types/optional.h"
#include "url/gurl.h"
#include "url/origin.h"
namespace content {
namespace {
using ParseErrorType = FirstPartySetsHandler::ParseErrorType;
using ParseWarningType = FirstPartySetsHandler::ParseWarningType;
using ParseError = FirstPartySetsHandler::ParseError;
using ParseWarning = FirstPartySetsHandler::ParseWarning;
using Aliases = FirstPartySetParser::Aliases;
using SetsAndAliases = FirstPartySetParser::SetsAndAliases;
using SetsMap = FirstPartySetParser::SetsMap;
// Converts `origin_string` into the net::SchemefulSite it belongs to.
//
// Fails with:
//   * kInvalidOrigin  - the string parses to an opaque origin;
//   * kNonHttpsScheme - the origin's scheme is anything other than "https";
//   * kInvalidDomain  - no site with a registrable domain can be created from
//                       the origin.
//
// When `emit_errors` is true, each failure is also logged at ERROR level.
base::expected<net::SchemefulSite, ParseErrorType> Canonicalize(
    base::StringPiece origin_string,
    bool emit_errors) {
  const GURL url(origin_string);
  const url::Origin parsed_origin = url::Origin::Create(url);

  if (parsed_origin.opaque()) {
    if (emit_errors) {
      LOG(ERROR) << "First-Party Set origin " << origin_string
                 << " is not valid; ignoring.";
    }
    return base::unexpected(ParseErrorType::kInvalidOrigin);
  }

  const bool is_https = parsed_origin.scheme() == "https";
  if (!is_https) {
    if (emit_errors) {
      LOG(ERROR) << "First-Party Set origin " << origin_string
                 << " is not HTTPS; ignoring.";
    }
    return base::unexpected(ParseErrorType::kNonHttpsScheme);
  }

  absl::optional<net::SchemefulSite> registrable_site =
      net::SchemefulSite::CreateIfHasRegisterableDomain(parsed_origin);
  if (registrable_site.has_value()) {
    return registrable_site.value();
  }

  if (emit_errors) {
    LOG(ERROR) << "First-Party Set origin " << origin_string
               << " does not have a valid registered domain; ignoring.";
  }
  return base::unexpected(ParseErrorType::kInvalidDomain);
}
// Struct to hold metadata describing a particular "subset" during parsing.
// ParseSubset() reads one of these to decide which field of the set
// declaration to parse and how to record the sites it finds there.
//
// NOTE: ParseSet() builds instances with designated initializers, so the
// member order below must stay in sync with those initializers.
struct SubsetDescriptor {
// Key under which the subset's list of sites is looked up in the set
// declaration. Also used as the first path component of any ParseError.
std::string field_name;
// Site type recorded on every entry parsed from this subset.
net::SiteType site_type;
// Maximum number of sites kept from this subset; nullopt means "no limit".
// Sites past the limit are still validated but are not added to the set.
absl::optional<int> size_limit;
};
// Keys looked up inside a single First-Party Set declaration.
constexpr char kFirstPartySetPrimaryField[] = "primary";
constexpr char kFirstPartySetAssociatedSitesField[] = "associatedSites";
constexpr char kFirstPartySetServiceSitesField[] = "serviceSites";
constexpr char kCCTLDsField[] = "ccTLDs";

// Keys of the two lists of sets inside the enterprise policy dictionary.
constexpr char kFirstPartySetPolicyReplacementsField[] = "replacements";
constexpr char kFirstPartySetPolicyAdditionsField[] = "additions";
// Reports whether a parsed set is too small to be meaningful. Entries and
// aliases are counted together; fewer than two sites in total makes the set a
// singleton.
bool IsSingletonSet(const std::vector<SetsMap::value_type>& set_entries,
                    const Aliases& aliases) {
  const auto total_sites = set_entries.size() + aliases.size();
  return total_sites <= 1;
}
// Turns `item` into a net::SchemefulSite and checks that the site is new.
//
// Fails with:
//   * kInvalidType     - `item` is not a string;
//   * any error produced by Canonicalize() for the string;
//   * kRepeatedDomain  - the site is already a key in `set_entries` (the set
//                        currently being built);
//   * kNonDisjointSets - the site is already in `other_sets_sites`.
//
// `emit_errors` is forwarded to Canonicalize().
base::expected<net::SchemefulSite, ParseErrorType> ParseSiteAndValidate(
    const base::Value& item,
    const std::vector<std::pair<net::SchemefulSite, net::FirstPartySetEntry>>&
        set_entries,
    const base::flat_set<net::SchemefulSite>& other_sets_sites,
    bool emit_errors) {
  if (!item.is_string()) {
    return base::unexpected(ParseErrorType::kInvalidType);
  }

  const base::expected<net::SchemefulSite, ParseErrorType> canonical =
      Canonicalize(item.GetString(), emit_errors);
  if (!canonical.has_value()) {
    return base::unexpected(canonical.error());
  }
  const net::SchemefulSite& candidate = canonical.value();

  // A site may appear at most once within the set being built...
  for (const auto& existing : set_entries) {
    if (existing.first == candidate) {
      return base::unexpected(ParseErrorType::kRepeatedDomain);
    }
  }

  // ...and must not belong to any previously parsed set.
  if (other_sets_sites.contains(candidate)) {
    return base::unexpected(ParseErrorType::kNonDisjointSets);
  }

  return candidate;
}
// Removes the TLD (registry) from a SchemefulSite's serialization, if
// possible.
//
// Returns the serialized site with its trailing registry characters dropped,
// or nullopt when the registry length cannot be determined:
//   * GetRegistryLength() returned 0 (e.g. the site has no final
//     subcomponent), or
//   * GetRegistryLength() returned std::string::npos, which its contract
//     reserves for invalid or host-less URLs. Without this check the
//     subtraction below would wrap around and the full serialization would be
//     returned as if it were TLD-less.
absl::optional<std::string> RemoveTldFromSite(const net::SchemefulSite& site) {
  const size_t tld_length = net::registry_controlled_domains::GetRegistryLength(
      site.GetURL(),
      net::registry_controlled_domains::INCLUDE_UNKNOWN_REGISTRIES,
      net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
  if (tld_length == 0 || tld_length == std::string::npos) {
    return absl::nullopt;
  }

  const std::string serialized = site.Serialize();
  return serialized.substr(0, serialized.size() - tld_length);
}
// Parses the optional "ccTLDs" dictionary of `set_declaration`. Each key is
// treated as a canonical site of the set being parsed and each value as the
// list of its aliases. A missing dictionary yields an empty result.
//
// An alias that fails ParseSiteAndValidate() (checked against `set_entries`
// and `elements`) aborts parsing with a ParseError whose path is
// {"ccTLDs", <key>, <index>}.
//
// The following are skipped rather than treated as errors:
//   * keys that are not a site in `set_entries` (kCctldKeyNotCanonical
//     warning);
//   * keys whose TLD cannot be removed, or whose value is not a list (no
//     warning);
//   * aliases whose TLD cannot be removed (no warning);
//   * aliases that differ from their key by more than the TLD
//     (kAliasNotCctldVariant warning).
//
// Warnings are appended to `warnings` only when it is non-null.
base::expected<Aliases, ParseError> ParseCctlds(
    const base::Value::Dict& set_declaration,
    const std::vector<std::pair<net::SchemefulSite, net::FirstPartySetEntry>>&
        set_entries,
    const base::flat_set<net::SchemefulSite>& elements,
    bool emit_errors,
    std::vector<ParseWarning>* warnings) {
  const base::Value::Dict* const cctlds =
      set_declaration.FindDict(kCCTLDsField);
  if (cctlds == nullptr) {
    return {};
  }

  std::vector<std::pair<net::SchemefulSite, net::SchemefulSite>> parsed_aliases;
  for (const auto [canonical_key, variants] : *cctlds) {
    const net::SchemefulSite canonical_site((GURL(canonical_key)));

    // Aliases may only be declared for a site that is already in this set.
    const bool key_is_member = base::Contains(
        set_entries, canonical_site,
        [](const auto& site_and_entry) { return site_and_entry.first; });
    if (!key_is_member) {
      if (warnings) {
        warnings->push_back(
            ParseWarning(ParseWarningType::kCctldKeyNotCanonical,
                         {kCCTLDsField, canonical_key}));
      }
      continue;
    }

    const absl::optional<std::string> canonical_stem =
        RemoveTldFromSite(canonical_site);
    if (!canonical_stem.has_value() || !variants.is_list()) {
      continue;
    }

    const base::Value::List& variant_list = variants.GetList();
    for (size_t i = 0; i < variant_list.size(); ++i) {
      const int path_index = static_cast<int>(i);

      const base::expected<net::SchemefulSite, ParseErrorType> parsed_alias =
          ParseSiteAndValidate(variant_list[i], set_entries, elements,
                               emit_errors);
      if (!parsed_alias.has_value()) {
        return base::unexpected(
            ParseError(parsed_alias.error(),
                       {kCCTLDsField, canonical_key, path_index}));
      }
      const net::SchemefulSite& alias = parsed_alias.value();

      const absl::optional<std::string> alias_stem = RemoveTldFromSite(alias);
      if (!alias_stem.has_value()) {
        continue;
      }

      // Only the TLD may differ between an alias and its canonical site.
      if (alias_stem == canonical_stem) {
        parsed_aliases.emplace_back(alias, canonical_site);
        continue;
      }

      if (warnings) {
        warnings->push_back(
            ParseWarning(ParseWarningType::kAliasNotCctldVariant,
                         {kCCTLDsField, canonical_key, path_index}));
      }
    }
  }
  return parsed_aliases;
}
// Parses the optional subset named by `descriptor.field_name` and appends its
// sites to `set_entries`, each recorded with `primary` and
// `descriptor.site_type`.
//
// Every site goes through ParseSiteAndValidate(), so it must be distinct from
// the sites already in `set_entries` and from `other_sets_sites`.
//
// Returns nullopt on success (including when the field is absent). Otherwise
// returns a ParseError: kInvalidType if the field is not a list, or the
// site's own error with path {field name, index}.
absl::optional<ParseError> ParseSubset(
    const base::Value::Dict& set_declaration,
    const net::SchemefulSite& primary,
    const SubsetDescriptor& descriptor,
    const base::flat_set<net::SchemefulSite>& other_sets_sites,
    bool emit_errors,
    std::vector<std::pair<net::SchemefulSite, net::FirstPartySetEntry>>&
        set_entries) {
  const base::Value* subset = set_declaration.Find(descriptor.field_name);
  if (!subset) {
    return absl::nullopt;
  }
  if (!subset->is_list()) {
    return ParseError(ParseErrorType::kInvalidType, {descriptor.field_name});
  }

  const bool is_limited = descriptor.size_limit.has_value();
  uint32_t position = 0;
  for (const base::Value& candidate : subset->GetList()) {
    const int position_as_int = static_cast<int>(position);

    base::expected<net::SchemefulSite, ParseErrorType> parsed_site =
        ParseSiteAndValidate(candidate, set_entries, other_sets_sites,
                             emit_errors);
    if (!parsed_site.has_value()) {
      return ParseError(parsed_site.error(),
                        {descriptor.field_name, position_as_int});
    }

    // Sites beyond the size limit (if any) are validated above but dropped
    // here, so that malformed input is still surfaced as an error.
    const bool within_limit =
        !is_limited || position_as_int < descriptor.size_limit.value();
    if (within_limit) {
      // A site index is only recorded for subsets that have a size limit.
      absl::optional<net::FirstPartySetEntry::SiteIndex> site_index;
      if (is_limited) {
        site_index = net::FirstPartySetEntry::SiteIndex(position);
      }
      set_entries.emplace_back(
          parsed_site.value(),
          net::FirstPartySetEntry(primary, descriptor.site_type, site_index));
    }
    ++position;
  }
  return absl::nullopt;
}
// Validates one First-Party Set declaration and parses it into a SingleSet
// plus its ccTLD aliases.
//
// Note that this is intended for use *only* on sets that were received via the
// Component Updater or from enterprise policy, so this does not check
// assertions or versions.
//
// Rejected inputs:
//   * a `value` that is not a dictionary, or that has no "primary" field;
//   * any site that fails ParseSiteAndValidate(), which includes sites that
//     already appear in `elements` (i.e. in a previously parsed set);
//   * singleton sets, i.e. sets with fewer than two sites once aliases are
//     counted.
//
// When `exempt_from_limits` is false, the associated-sites subset is capped at
// features::kFirstPartySetsMaxAssociatedSites; the service-sites subset is
// never capped.
//
// On success, `elements` is augmented with every site and alias of the parsed
// set. On failure, `elements` is left untouched and the ParseError is
// returned.
//
// Warnings encountered during parsing are appended to `warnings` (if
// non-null), regardless of success/failure.
base::expected<SetsAndAliases, ParseError> ParseSet(
    const base::Value& value,
    bool exempt_from_limits,
    bool emit_errors,
    base::flat_set<net::SchemefulSite>& elements,
    std::vector<ParseWarning>* warnings) {
  if (!value.is_dict()) {
    return base::unexpected(ParseError(ParseErrorType::kInvalidType, {}));
  }
  const base::Value::Dict& declaration = value.GetDict();

  // The primary is mandatory and must be a valid, previously unseen site.
  const base::Value* primary_value =
      declaration.Find(kFirstPartySetPrimaryField);
  if (!primary_value) {
    return base::unexpected(
        ParseError(ParseErrorType::kInvalidType, {kFirstPartySetPrimaryField}));
  }
  base::expected<net::SchemefulSite, ParseErrorType> parsed_primary =
      ParseSiteAndValidate(*primary_value, /*set_entries=*/{}, elements,
                           emit_errors);
  if (!parsed_primary.has_value()) {
    return base::unexpected(
        ParseError(parsed_primary.error(), {kFirstPartySetPrimaryField}));
  }
  const net::SchemefulSite& primary = parsed_primary.value();

  // The primary is always the first member of its own set.
  std::vector<std::pair<net::SchemefulSite, net::FirstPartySetEntry>> members;
  members.emplace_back(
      primary,
      net::FirstPartySetEntry(primary, net::SiteType::kPrimary, absl::nullopt));

  absl::optional<int> associated_limit;
  if (!exempt_from_limits) {
    associated_limit = features::kFirstPartySetsMaxAssociatedSites.Get();
  }
  const SubsetDescriptor subsets[] = {
      {
          .field_name = kFirstPartySetAssociatedSitesField,
          .site_type = net::SiteType::kAssociated,
          .size_limit = associated_limit,
      },
      {
          .field_name = kFirstPartySetServiceSitesField,
          .site_type = net::SiteType::kService,
          .size_limit = absl::nullopt,
      },
  };
  for (const SubsetDescriptor& subset : subsets) {
    absl::optional<ParseError> subset_error = ParseSubset(
        declaration, primary, subset, elements, emit_errors, members);
    if (subset_error.has_value()) {
      return base::unexpected(subset_error.value());
    }
  }

  const base::expected<Aliases, ParseError> parsed_aliases =
      ParseCctlds(declaration, members, elements, emit_errors, warnings);
  if (!parsed_aliases.has_value()) {
    return base::unexpected(parsed_aliases.error());
  }
  const Aliases& aliases = parsed_aliases.value();

  if (IsSingletonSet(members, aliases)) {
    return base::unexpected(ParseError(ParseErrorType::kSingletonSet,
                                       {kFirstPartySetAssociatedSitesField}));
  }

  // Only now that the whole set is known to be valid do we record its sites,
  // so that later sets can be checked for disjointness against them.
  for (const auto& member : members) {
    bool inserted = elements.insert(member.first).second;
    DCHECK(inserted);
  }
  for (const auto& alias_and_canonical : aliases) {
    bool inserted = elements.insert(alias_and_canonical.first).second;
    DCHECK(inserted);
  }

  return std::make_pair(FirstPartySetParser::SingleSet(members), aliases);
}
// Maps a policy set type to the name of the policy field that holds sets of
// that type. The switch is deliberately exhaustive with no `default` case.
const char* SetTypeToString(FirstPartySetParser::PolicySetType set_type) {
  using PolicySetType = FirstPartySetParser::PolicySetType;
  switch (set_type) {
    case PolicySetType::kAddition:
      return kFirstPartySetPolicyAdditionsField;
    case PolicySetType::kReplacement:
      return kFirstPartySetPolicyReplacementsField;
  }
}
// Parses every set in `policy_sets` (one of the enterprise policy lists).
//
// Returns the parsed sets if successful; otherwise returns the first error. A
// null `policy_sets` yields an empty list. Each returned set also contains an
// entry for every ccTLD alias, copied from the alias's canonical site.
//
// `elements` accumulates the sites seen so far and is used by ParseSet() to
// enforce disjointness. `set_type` selects the path prefix ("replacements" or
// "additions") that, together with the set's index, is prepended to the error
// and to every warning produced for that set.
//
// Stores any warnings encountered when parsing in the `warnings`
// out-parameter; warnings already present on entry are left untouched.
base::expected<std::vector<FirstPartySetParser::SingleSet>, ParseError>
GetPolicySetsFromList(const base::Value::List* policy_sets,
                      base::flat_set<net::SchemefulSite>& elements,
                      FirstPartySetParser::PolicySetType set_type,
                      std::vector<ParseWarning>& warnings) {
  if (!policy_sets) {
    return {};
  }

  std::vector<FirstPartySetParser::SingleSet> parsed_sets;
  // Index of the first warning that belongs to the set currently being parsed.
  size_t previous_size = warnings.size();
  for (int i = 0; i < static_cast<int>(policy_sets->size()); i++) {
    base::expected<SetsAndAliases, ParseError> parsed =
        ParseSet((*policy_sets)[i], /*exempt_from_limits=*/true,
                 /*emit_errors=*/false, elements, &warnings);

    // Qualify only the warnings added by this ParseSet() call; this happens
    // before the error check so warnings of a failing set are qualified too.
    for (auto it = warnings.begin() + previous_size; it != warnings.end();
         ++it) {
      it->PrependPath({SetTypeToString(set_type), i});
    }

    if (!parsed.has_value()) {
      ParseError error = parsed.error();
      error.PrependPath({SetTypeToString(set_type), i});
      return base::unexpected(error);
    }

    SetsMap& set = parsed.value().first;
    const Aliases& aliases = parsed.value().second;
    if (!aliases.empty()) {
      // Build the alias entries first and insert them in one batch, so `set`
      // is not modified while it is being searched.
      std::vector<SetsMap::value_type> alias_entries;
      alias_entries.reserve(aliases.size());
      for (const auto& alias : aliases) {
        // ParseCctlds() only emits aliases whose canonical site is a member
        // of the set, so this lookup is expected to succeed.
        alias_entries.emplace_back(alias.first, set.find(alias.second)->second);
      }
      set.insert(std::make_move_iterator(alias_entries.begin()),
                 std::make_move_iterator(alias_entries.end()));
    }

    // `parsed` is not used again, so hand its set over instead of copying it.
    parsed_sets.push_back(std::move(set));
    previous_size = warnings.size();
  }
  return parsed_sets;
}
} // namespace
// Special member functions of ParsedPolicySetLists. All are defaulted except
// the two-list constructor, which takes ownership of both lists.
FirstPartySetParser::ParsedPolicySetLists::ParsedPolicySetLists() = default;

FirstPartySetParser::ParsedPolicySetLists::ParsedPolicySetLists(
    std::vector<FirstPartySetParser::SingleSet> replacement_list,
    std::vector<FirstPartySetParser::SingleSet> addition_list)
    : replacements(std::move(replacement_list)),
      additions(std::move(addition_list)) {}

FirstPartySetParser::ParsedPolicySetLists::ParsedPolicySetLists(
    const FirstPartySetParser::ParsedPolicySetLists&) = default;

FirstPartySetParser::ParsedPolicySetLists::ParsedPolicySetLists(
    FirstPartySetParser::ParsedPolicySetLists&&) = default;

FirstPartySetParser::ParsedPolicySetLists::~ParsedPolicySetLists() = default;

// Two instances are equal when both their replacement lists and their
// addition lists compare equal.
bool FirstPartySetParser::ParsedPolicySetLists::operator==(
    const FirstPartySetParser::ParsedPolicySetLists& other) const {
  return replacements == other.replacements && additions == other.additions;
}
// Public wrapper around Canonicalize(): returns the canonical site for
// `origin_string`, or nullopt if canonicalization fails for any reason. The
// specific error type is discarded. `emit_errors` is forwarded unchanged.
absl::optional<net::SchemefulSite>
FirstPartySetParser::CanonicalizeRegisteredDomain(
    const base::StringPiece origin_string,
    bool emit_errors) {
  const base::expected<net::SchemefulSite, ParseErrorType> canonical =
      Canonicalize(origin_string, emit_errors);
  if (canonical.has_value()) {
    return canonical.value();
  }
  return absl::nullopt;
}
// Parses First-Party Sets from `input`, which is read line by line; each
// non-blank line must be one JSON value (trailing commas allowed) describing
// a single set.
//
// Returns all parsed set entries and ccTLD aliases. Returns an empty result
// if any line is not valid JSON or any set fails validation, with one
// exception: a set rejected with kInvalidDomain is skipped and parsing
// continues with the next line.
//
// Size limits are enforced (`exempt_from_limits` is false), warnings are not
// collected, and `emit_errors` is forwarded to ParseSet().
SetsAndAliases FirstPartySetParser::ParseSetsFromStream(std::istream& input,
                                                        bool emit_errors) {
  std::vector<SetsMap::value_type> sets;
  std::vector<Aliases::value_type> aliases;
  // Every site accepted so far; used to enforce disjointness across lines.
  base::flat_set<SetsMap::key_type> elements;
  for (std::string line; std::getline(input, line);) {
    base::StringPiece trimmed = base::TrimWhitespaceASCII(line, base::TRIM_ALL);
    if (trimmed.empty())
      continue;
    absl::optional<base::Value> maybe_value = base::JSONReader::Read(
        trimmed, base::JSONParserOptions::JSON_ALLOW_TRAILING_COMMAS);
    if (!maybe_value.has_value())
      return {};
    base::expected<SetsAndAliases, ParseError> parsed = ParseSet(
        *maybe_value, /*exempt_from_limits=*/false, emit_errors, elements,
        /*warnings=*/nullptr);
    if (!parsed.has_value()) {
      if (parsed.error().type() == ParseErrorType::kInvalidDomain) {
        // Ignore sets that include an invalid domain (which might have been
        // caused by a PSL update), but don't let that break other sets.
        continue;
      }
      // Abort, something is wrong with the component.
      return {};
    }
    base::ranges::move(parsed.value().first, std::back_inserter(sets));
    base::ranges::move(parsed.value().second, std::back_inserter(aliases));
  }
  // The accumulators are not used again: move them into the result instead of
  // copying every entry.
  return std::make_pair(std::move(sets), std::move(aliases));
}
// Parses the "replacements" and "additions" lists of the enterprise policy
// dictionary, in that order. Both lists share one collection of seen sites,
// so a site may not appear in more than one set across the two lists.
//
// On success, returns the parsed lists together with all warnings. On
// failure, returns the first error together with the warnings collected up to
// that point; additions are not parsed if replacements fail.
FirstPartySetParser::PolicyParseResult
FirstPartySetParser::ParseSetsFromEnterprisePolicy(
    const base::Value::Dict& policy) {
  std::vector<ParseWarning> warnings;
  base::flat_set<net::SchemefulSite> seen_sites;

  const base::Value::List* replacement_list =
      policy.FindList(kFirstPartySetPolicyReplacementsField);
  base::expected<std::vector<SingleSet>, ParseError> replacements =
      GetPolicySetsFromList(replacement_list, seen_sites,
                            PolicySetType::kReplacement, warnings);
  if (!replacements.has_value()) {
    return base::unexpected(std::make_pair(replacements.error(), warnings));
  }

  const base::Value::List* addition_list =
      policy.FindList(kFirstPartySetPolicyAdditionsField);
  base::expected<std::vector<SingleSet>, ParseError> additions =
      GetPolicySetsFromList(addition_list, seen_sites,
                            PolicySetType::kAddition, warnings);
  if (!additions.has_value()) {
    return base::unexpected(std::make_pair(additions.error(), warnings));
  }

  return std::make_pair(ParsedPolicySetLists(std::move(replacements.value()),
                                             std::move(additions.value())),
                        warnings);
}
} // namespace content