blob: 242cf2a0d3459390bf8aa6096ebbde3f1638e174 [file] [log] [blame]
// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/affiliations/core/browser/affiliation_utils.h"
#include <algorithm>
#include <map>
#include <ostream>
#include <string_view>
#include "base/base64.h"
#include "base/strings/escape.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "components/url_formatter/elide_url.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon_stdstring.h"
namespace affiliations {
namespace {
// The scheme used for identifying Android applications.
const char kAndroidAppScheme[] = "android";
// Returns a std::string_view corresponding to |component| in |uri|, or the empty
// string in case there is no such component.
std::string_view ComponentString(const std::string& uri,
const url::Component& component) {
if (!component.is_valid()) {
return std::string_view();
}
return std::string_view(uri).substr(component.begin, component.len);
}
// Returns true if the passed ASCII |input| string contains nothing else than
// alphanumeric characters and those in |other_characters|.
bool ContainsOnlyAlphanumericAnd(std::string_view input,
std::string_view other_characters) {
for (char c : input) {
if (!base::IsAsciiAlpha(c) && !base::IsAsciiDigit(c) &&
other_characters.find(c) == std::string_view::npos)
return false;
}
return true;
}
// Canonicalizes a Web facet URI, and returns true if canonicalization was
// successful and produced a valid URI.
bool CanonicalizeWebFacetURI(const std::string& input_uri,
const url::Parsed& input_parsed,
std::string* canonical_uri) {
url::Parsed canonical_parsed;
url::StdStringCanonOutput canonical_output(canonical_uri);
bool canonicalization_succeeded = url::CanonicalizeStandardUrl(
input_uri, input_parsed, url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION,
nullptr, &canonical_output, &canonical_parsed);
canonical_output.Complete();
if (canonicalization_succeeded && canonical_parsed.host.is_nonempty() &&
!canonical_parsed.username.is_valid() &&
!canonical_parsed.password.is_valid() &&
ComponentString(*canonical_uri, canonical_parsed.path) == "/" &&
!canonical_parsed.query.is_valid() && !canonical_parsed.ref.is_valid()) {
// Get rid of the trailing slash added by url::CanonicalizeStandardURL().
DCHECK_EQ((size_t)canonical_parsed.path.begin, canonical_uri->size() - 1);
canonical_uri->erase(canonical_parsed.path.begin,
canonical_parsed.path.len);
return true;
}
return false;
}
// Adds padding until the length of the base64-encoded |data| becomes a multiple
// of 4, and returns true if the thusly obtained |data| is now correctly padded,
// i.e., there are at most 2 padding characters ('=') at the very end.
bool CanonicalizeBase64Padding(std::string* data) {
while (data->size() % 4u != 0u)
data->push_back('=');
size_t first_padding = data->find_first_of('=');
return first_padding == std::string::npos ||
(data->size() - first_padding <= 2u &&
data->find_first_not_of('=', first_padding) == std::string::npos);
}
// Canonicalizes the username component in an Android facet URI (containing the
// certificate hash), and returns true if canonicalization was successful and
// produced a valid non-empty component.
bool CanonicalizeHashComponent(std::string_view input_hash,
url::CanonOutput* canonical_output) {
// Characters other than alphanumeric that are used in the "URL and filename
// safe" base64 alphabet; plus the padding ('=').
const char kBase64NonAlphanumericChars[] = "-_=";
std::string base64_encoded_hash =
base::UnescapeBinaryURLComponent(input_hash);
if (!base64_encoded_hash.empty() &&
CanonicalizeBase64Padding(&base64_encoded_hash) &&
ContainsOnlyAlphanumericAnd(base64_encoded_hash,
kBase64NonAlphanumericChars)) {
canonical_output->Append(base64_encoded_hash);
canonical_output->push_back('@');
return true;
}
return false;
}
// Canonicalizes the host component in an Android facet URI (containing the
// package name), and returns true if canonicalization was successful and
// produced a valid non-empty component.
bool CanonicalizePackageNameComponent(
std::string_view input_package_name,
url::CanonOutput* canonical_output) {
// Characters other than alphanumeric that are permitted in the package names.
const char kPackageNameNonAlphanumericChars[] = "._";
std::string package_name =
base::UnescapeBinaryURLComponent(input_package_name);
// TODO(engedy): We might want to use a regex to check this more throughly.
if (!package_name.empty() &&
ContainsOnlyAlphanumericAnd(package_name,
kPackageNameNonAlphanumericChars)) {
canonical_output->Append(package_name);
return true;
}
return false;
}
// Canonicalizes an Android facet URI, and returns true if canonicalization was
// successful and produced a valid URI.
bool CanonicalizeAndroidFacetURI(const std::string& input_uri,
const url::Parsed& input_parsed,
std::string* canonical_uri) {
url::StdStringCanonOutput canonical_output(canonical_uri);
url::Component unused;
bool success = url::CanonicalizeScheme(
input_parsed.scheme.as_string_view_on(input_uri.c_str()),
&canonical_output, &unused);
canonical_output.push_back('/');
canonical_output.push_back('/');
// We cannot use url::CanonicalizeUserInfo as that would percent encode the
// base64 padding characters ('=').
success &= CanonicalizeHashComponent(
ComponentString(input_uri, input_parsed.username), &canonical_output);
// We cannot use url::CanonicalizeHost as that would convert the package name
// to lower case, but the package name is case sensitive.
success &= CanonicalizePackageNameComponent(
ComponentString(input_uri, input_parsed.host), &canonical_output);
canonical_output.Complete();
return success && input_parsed.password.is_empty() &&
(input_parsed.path.is_empty() ||
ComponentString(input_uri, input_parsed.path) == "/") &&
input_parsed.port.is_empty() && !input_parsed.query.is_valid() &&
!input_parsed.ref.is_valid();
}
// Computes the canonicalized form of |uri| into |canonical_uri|, and returns
// true if canonicalization was successful and produced a valid URI.
bool ParseAndCanonicalizeFacetURI(const std::string& input_uri,
std::string* canonical_uri) {
DCHECK(canonical_uri);
canonical_uri->clear();
canonical_uri->reserve(input_uri.size() + 32);
url::Parsed input_parsed = url::ParseStandardURL(input_uri);
std::string_view scheme = ComponentString(input_uri, input_parsed.scheme);
if (base::EqualsCaseInsensitiveASCII(scheme, url::kHttpsScheme)) {
return CanonicalizeWebFacetURI(input_uri, input_parsed, canonical_uri);
}
if (base::EqualsCaseInsensitiveASCII(scheme, kAndroidAppScheme)) {
return CanonicalizeAndroidFacetURI(input_uri, input_parsed, canonical_uri);
}
*canonical_uri = input_uri;
return false;
}
// Extracts and sorts the facet URIs of the given affiliated facets. This is
// used to determine whether two equivalence classes are equal.
std::vector<FacetURI> ExtractAndSortFacetURIs(const AffiliatedFacets& facets) {
std::vector<FacetURI> uris;
uris.reserve(facets.size());
std::ranges::transform(facets, std::back_inserter(uris), &Facet::uri);
std::sort(uris.begin(), uris.end());
return uris;
}
// Appends a new level to the |main_domain| from |full_domain|.
// |main_domain| must be a suffix of |full_domain|.
void IncreaseDomainLevel(const std::string& full_domain,
std::string& main_domain) {
DCHECK_GT(full_domain.size(), main_domain.size());
auto starting_pos = full_domain.rbegin() + main_domain.size();
// Verify that we are at '.' and move to the next character.
DCHECK_EQ(*starting_pos, '.');
starting_pos++;
// Find next '.' from |starting_pos|
auto ending_pos = std::find(starting_pos, full_domain.rend(), '.');
main_domain = std::string(ending_pos.base(), full_domain.end());
}
// An implementation of the disjoint-set data structure
// (https://en.wikipedia.org/wiki/Disjoint-set_data_structure). This
// implementation uses the path compression and union by rank optimizations,
// achieving near-constant runtime on all operations.
//
// This data structure allows to keep track of disjoin sets. Constructor accepts
// number of elements and initially each element represent an individual set.
// Later by calling MergeSets corresponding sets are merged together.
// Example usage:
// DisjointSet disjoint_set(5);
// disjoint_set.GetDisjointSets(); // Returns {{0}, {1}, {2}, {3}, {4}}
// disjoint_set.MergeSets(0, 2);
// disjoint_set.GetDisjointSets(); // Returns {{0, 2}, {1}, {3}, {4}}
// disjoint_set.MergeSets(2, 4);
// disjoint_set.GetDisjointSets(); // Returns {{0, 2, 4}, {1}, {3}}
class DisjointSet {
public:
explicit DisjointSet(size_t size) : parent_id_(size), ranks_(size, 0) {
for (size_t i = 0; i < size; i++) {
parent_id_[i] = i;
}
}
// Merges two sets based on their rank. Set with higher rank becomes a parent
// for another set.
void MergeSets(int set1, int set2) {
set1 = GetRoot(set1);
set2 = GetRoot(set2);
if (set1 == set2) {
return;
}
// Update parent based on rank.
if (ranks_[set1] > ranks_[set2]) {
parent_id_[set2] = set1;
} else {
parent_id_[set1] = set2;
// if ranks were equal increment by one new root's rank.
if (ranks_[set1] == ranks_[set2]) {
ranks_[set2]++;
}
}
}
// Returns disjoin sets after merging. It's guarantee that the result will
// hold all elements.
std::vector<std::vector<int>> GetDisjointSets() {
std::vector<std::vector<int>> disjoint_sets(parent_id_.size());
for (size_t i = 0; i < parent_id_.size(); i++) {
// Append all elements to the root.
int root = GetRoot(i);
disjoint_sets[root].push_back(i);
}
// Clear empty sets.
std::erase_if(disjoint_sets, [](const auto& set) { return set.empty(); });
return disjoint_sets;
}
private:
// Returns root for a given element.
int GetRoot(int index) {
if (index == parent_id_[index]) {
return index;
}
// To speed up future lookups flatten the tree along the way.
return parent_id_[index] = GetRoot(parent_id_[index]);
}
// Vector where element at i'th position holds a parent for i.
std::vector<int> parent_id_;
// Upper bound depth of a tree for i'th element.
std::vector<size_t> ranks_;
};
} // namespace
// FacetURI -------------------------------------------------------------------
FacetURI::FacetURI() = default;
// static
FacetURI FacetURI::FromPotentiallyInvalidSpec(const std::string& spec) {
std::string canonical_spec;
bool is_valid = ParseAndCanonicalizeFacetURI(spec, &canonical_spec);
return FacetURI(canonical_spec, is_valid);
}
// static
FacetURI FacetURI::FromCanonicalSpec(const std::string& canonical_spec) {
return FacetURI(canonical_spec, true);
}
bool FacetURI::IsValidWebFacetURI() const {
return scheme() == url::kHttpsScheme;
}
bool FacetURI::IsValidAndroidFacetURI() const {
return scheme() == kAndroidAppScheme;
}
std::string FacetURI::scheme() const {
return is_valid()
? std::string(ComponentString(canonical_spec_, parsed_.scheme))
: "";
}
std::string FacetURI::android_package_name() const {
if (!IsValidAndroidFacetURI())
return "";
return std::string(ComponentString(canonical_spec_, parsed_.host));
}
std::string FacetURI::GetAndroidPackageDisplayName() const {
CHECK(IsValidAndroidFacetURI());
std::vector<std::string> parts = base::SplitString(
android_package_name(), ".", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
std::ranges::reverse(parts);
return base::JoinString(parts, ".");
}
FacetURI::FacetURI(const std::string& canonical_spec, bool is_valid)
: is_valid_(is_valid), canonical_spec_(canonical_spec) {
// TODO(engedy): Refactor code in order to avoid to avoid parsing the URL
// twice.
parsed_ = url::ParseStandardURL(canonical_spec_);
}
// Facet
Facet::Facet(FacetURI uri,
FacetBrandingInfo branding_info,
GURL change_password_url,
std::string main_domain)
: uri(std::move(uri)),
branding_info(std::move(branding_info)),
change_password_url(std::move(change_password_url)),
main_domain(std::move(main_domain)) {}
Facet::~Facet() = default;
Facet::Facet(const Facet& other) = default;
Facet::Facet(Facet&& other) = default;
Facet& Facet::operator=(const Facet& other) = default;
Facet& Facet::operator=(Facet&& other) = default;
// GroupedFacets
GroupedFacets::GroupedFacets() = default;
GroupedFacets::~GroupedFacets() = default;
GroupedFacets::GroupedFacets(const GroupedFacets& other) = default;
GroupedFacets::GroupedFacets(GroupedFacets&& other) = default;
GroupedFacets& GroupedFacets::operator=(const GroupedFacets& other) = default;
GroupedFacets& GroupedFacets::operator=(GroupedFacets&& other) = default;
std::vector<GroupedFacets> MergeRelatedGroups(
const base::flat_set<std::string>& psl_extensions,
const std::vector<GroupedFacets>& groups) {
DisjointSet unions(groups.size());
std::map<std::string, int> main_domain_to_group;
for (size_t i = 0; i < groups.size(); i++) {
for (auto& facet : groups[i].facets) {
if (facet.uri.IsValidAndroidFacetURI()) {
continue;
}
// If domain is empty - compute it manually.
std::string main_domain =
facet.main_domain.empty()
? GetExtendedTopLevelDomain(
GURL(facet.uri.potentially_invalid_spec()), psl_extensions)
: facet.main_domain;
if (main_domain.empty()) {
continue;
}
auto it = main_domain_to_group.find(main_domain);
if (it == main_domain_to_group.end()) {
main_domain_to_group[main_domain] = i;
continue;
}
unions.MergeSets(i, it->second);
}
}
std::vector<GroupedFacets> result;
for (const auto& merged_groups : unions.GetDisjointSets()) {
GroupedFacets group;
for (int group_id : merged_groups) {
// Move all the elements into a new vector.
group.facets.insert(group.facets.end(), groups[group_id].facets.begin(),
groups[group_id].facets.end());
// Use non-empty name for a combined group.
if (!groups[group_id].branding_info.icon_url.is_empty()) {
group.branding_info = groups[group_id].branding_info;
}
}
result.push_back(std::move(group));
}
return result;
}
// AffiliatedFacetsWithUpdateTime ---------------------------------------------
AffiliatedFacetsWithUpdateTime::AffiliatedFacetsWithUpdateTime() = default;
AffiliatedFacetsWithUpdateTime::AffiliatedFacetsWithUpdateTime(
const AffiliatedFacetsWithUpdateTime& other) = default;
AffiliatedFacetsWithUpdateTime::AffiliatedFacetsWithUpdateTime(
AffiliatedFacetsWithUpdateTime&& other) = default;
AffiliatedFacetsWithUpdateTime& AffiliatedFacetsWithUpdateTime::operator=(
const AffiliatedFacetsWithUpdateTime& other) = default;
AffiliatedFacetsWithUpdateTime& AffiliatedFacetsWithUpdateTime::operator=(
AffiliatedFacetsWithUpdateTime&& other) = default;
AffiliatedFacetsWithUpdateTime::~AffiliatedFacetsWithUpdateTime() = default;
// Helpers --------------------------------------------------------------------
std::ostream& operator<<(std::ostream& os, const FacetURI& facet_uri) {
return os << facet_uri.potentially_invalid_spec();
}
bool operator==(const GroupedFacets& lhs, const GroupedFacets& rhs) {
if (!std::ranges::is_permutation(lhs.facets, rhs.facets)) {
return false;
}
return lhs.branding_info == rhs.branding_info;
}
bool AreEquivalenceClassesEqual(const AffiliatedFacets& a,
const AffiliatedFacets& b) {
return a.size() == b.size() &&
ExtractAndSortFacetURIs(a) == ExtractAndSortFacetURIs(b);
}
bool IsValidAndroidFacetURI(const std::string& url) {
FacetURI facet = FacetURI::FromPotentiallyInvalidSpec(url);
return facet.IsValidAndroidFacetURI();
}
std::string GetExtendedTopLevelDomain(
const GURL& url,
const base::flat_set<std::string>& psl_extensions) {
std::string main_domain =
net::registry_controlled_domains::GetDomainAndRegistry(
url, net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
if (main_domain.empty()) {
return main_domain;
}
std::string full_domain = url.GetHost();
// Something went wrong, and it shouldn't happen. Return early in this case to
// avoid undefined behaviour.
if (!base::EndsWith(full_domain, main_domain)) {
return main_domain;
}
// If a domain is contained within the PSL extension list, an additional
// subdomain is added to that domain. This is done until the domain is not
// contained within the PSL extension list or fully shown. For multi-level
// extension, this approach only works if all sublevels are included in the
// PSL extension list.
while (main_domain != full_domain && psl_extensions.contains(main_domain)) {
IncreaseDomainLevel(full_domain, main_domain);
}
return main_domain;
}
bool IsExtendedPublicSuffixDomainMatch(
const GURL& url1,
const GURL& url2,
const base::flat_set<std::string>& psl_extensions) {
if (!url1.is_valid() || !url2.is_valid()) {
return false;
}
std::string domain1(GetExtendedTopLevelDomain(url1, psl_extensions));
std::string domain2(GetExtendedTopLevelDomain(url2, psl_extensions));
if (domain1.empty() || domain2.empty()) {
return false;
}
return domain1 == domain2;
}
} // namespace affiliations