blob: 1f081d0dc4e54a7154f286bf9f40aa9a99715ef5 [file] [log] [blame]
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/lookalikes/core/lookalike_url_util.h"
#include <utility>
#include "base/bind.h"
#include "base/callback.h"
#include "base/feature_list.h"
#include "base/macros.h"
#include "base/memory/scoped_refptr.h"
#include "base/memory/singleton.h"
#include "base/metrics/field_trial_params.h"
#include "base/metrics/histogram_macros.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/task/post_task.h"
#include "base/task/thread_pool.h"
#include "base/time/default_clock.h"
#include "components/lookalikes/core/features.h"
#include "components/security_state/core/features.h"
#include "components/url_formatter/spoof_checks/top_domains/top500_domains.h"
#include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h"
#include "components/url_formatter/url_formatter.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "net/base/url_util.h"
namespace lookalikes {
// Name of the enumeration histogram that RecordEvent() in this file logs
// NavigationSuggestionEvent values to.
const char kHistogramName[] = "NavigationSuggestion.Event";
} // namespace lookalikes
namespace {
// Returns true if |skeletons1| and |skeletons2| have at least one skeleton in
// common. Both sets are expected to be non-empty.
bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1,
                    const url_formatter::Skeletons& skeletons2) {
  DCHECK(!skeletons1.empty());
  DCHECK(!skeletons2.empty());
  for (const std::string& candidate : skeletons1) {
    if (!base::Contains(skeletons2, candidate))
      continue;
    // A single shared skeleton is enough to call it a match.
    return true;
  }
  return false;
}
// Looks for a site in |engaged_sites| that |navigated_domain| may be
// attempting to spoof, based on skeleton comparison. Returns the eTLD+1 of the
// first engaged site (in |engaged_sites| order) that shares a skeleton with
// |navigated_domain|, or an empty string if there is no such site.
std::string GetMatchingSiteEngagementDomain(
    const std::vector<DomainInfo>& engaged_sites,
    const DomainInfo& navigated_domain) {
  DCHECK(!navigated_domain.domain_and_registry.empty());
  for (const DomainInfo& candidate_site : engaged_sites) {
    DCHECK(!candidate_site.domain_and_registry.empty());
    const bool shares_skeleton =
        SkeletonsMatch(navigated_domain.skeletons, candidate_site.skeletons);
    if (shares_skeleton)
      return candidate_site.domain_and_registry;
  }
  // No engaged site looks like the navigated domain.
  return std::string();
}
// Returns the first matching top domain with an edit distance of at most one
// to |domain_and_registry|. This search is done in lexicographic order on the
// top 500 suitable domains, instead of in order by popularity. This means that
// the resulting "similar" domain may not be the most popular domain that
// matches.
std::string GetSimilarDomainFromTop500(
const DomainInfo& navigated_domain,
const LookalikeTargetAllowlistChecker& target_allowlisted) {
for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
for (const char* const top_domain_skeleton :
top500_domains::kTop500EditDistanceSkeletons) {
if (!IsEditDistanceAtMostOne(base::UTF8ToUTF16(navigated_skeleton),
base::UTF8ToUTF16(top_domain_skeleton))) {
continue;
}
const std::string top_domain =
url_formatter::LookupSkeletonInTopDomains(top_domain_skeleton).domain;
DCHECK(!top_domain.empty());
// If the only difference between the navigated and top
// domains is the registry part, this is unlikely to be a spoofing
// attempt. Ignore this match and continue. E.g. If the navigated domain
// is google.com.tw and the top domain is google.com.tr, this won't
// produce a match.
const std::string top_domain_without_registry =
url_formatter::top_domains::HostnameWithoutRegistry(top_domain);
DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
top_domain_without_registry));
if (navigated_domain.domain_without_registry ==
top_domain_without_registry) {
continue;
}
// Skip past domains that are allowed to be spoofed.
if (target_allowlisted.Run(GURL(std::string(url::kHttpsScheme) +
url::kStandardSchemeSeparator +
top_domain))) {
continue;
}
return top_domain;
}
}
return std::string();
}
// Returns the first matching engaged domain with an edit distance of at most
// one to |domain_and_registry|.
std::string GetSimilarDomainFromEngagedSites(
const DomainInfo& navigated_domain,
const std::vector<DomainInfo>& engaged_sites,
const LookalikeTargetAllowlistChecker& target_allowlisted) {
for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
for (const DomainInfo& engaged_site : engaged_sites) {
if (!url_formatter::top_domains::IsEditDistanceCandidate(
engaged_site.domain_and_registry)) {
continue;
}
for (const std::string& engaged_skeleton : engaged_site.skeletons) {
if (!IsEditDistanceAtMostOne(base::UTF8ToUTF16(navigated_skeleton),
base::UTF8ToUTF16(engaged_skeleton))) {
continue;
}
// If the only difference between the navigated and engaged
// domain is the registry part, this is unlikely to be a spoofing
// attempt. Ignore this match and continue. E.g. If the navigated
// domain is google.com.tw and the top domain is google.com.tr, this
// won't produce a match.
if (navigated_domain.domain_without_registry ==
engaged_site.domain_without_registry) {
continue;
}
// Skip past domains that are allowed to be spoofed.
if (target_allowlisted.Run(GURL(std::string(url::kHttpsScheme) +
url::kStandardSchemeSeparator +
engaged_site.domain_and_registry))) {
continue;
}
return engaged_site.domain_and_registry;
}
}
}
return std::string();
}
// Logs |event| to the enumeration histogram named by
// lookalikes::kHistogramName.
void RecordEvent(NavigationSuggestionEvent event) {
UMA_HISTOGRAM_ENUMERATION(lookalikes::kHistogramName, event);
}
// Splits |host_without_etld| (a hostname that no longer includes its eTLD) on
// "." and "-" and returns the resulting pieces. Pieces are whitespace-trimmed
// and empty pieces are dropped.
std::vector<std::string> SplitDomainWithouteTLDIntoTokens(
    const std::string& host_without_etld) {
  static constexpr char kTokenSeparators[] = "-.";
  std::vector<std::string> tokens =
      base::SplitString(host_without_etld, kTokenSeparators,
                        base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
  return tokens;
}
// Checks whether |domain| is a top domain. If yes, returns true and fills
// |found_domain| with the matching top domain.
bool IsTop500Domain(const DomainInfo& domain, std::string* found_domain) {
for (auto& skeleton : domain.skeletons) {
// Matching with top domains is only done with skeleton matching. We check
// if the skeleton of our hostname matches the skeleton of any top domain.
url_formatter::TopDomainEntry matched_domain =
url_formatter::IDNSpoofChecker().LookupSkeletonInTopDomains(skeleton);
// We are only interested in an exact match with a top 500 domain (as
// opposed to skeleton match). Here we check that the matched domain is a
// top 500 domain and also the hostname of the matched domain is exactly the
// same as our input eTLD+1.
if (matched_domain.is_top_500 &&
matched_domain.domain == domain.domain_and_registry) {
*found_domain = matched_domain.domain;
return true;
}
}
return false;
}
// Checks if the targeted domain is allowlisted. To check that we need to
// check all of the subdomains that could be made. The reason is for example
// in the case of "foo.scholar.google.com.university.edu", "google.com" is
// considered as the targeted domain. We need to make sure
// "scholar.google.com" or "foo.scholar.google.com" are not allowlisted
// before marking the input domain as a target embedding domain.
//
// |embedded_target| is the candidate target, |subdomain_labels_so_far| holds
// the labels that precede it in the hostname (leftmost first), and
// |in_target_allowlist| is run on an https URL for each candidate hostname.
// Returns true as soon as any candidate is allowlisted.
bool ASubdomainIsAllowlisted(
    const std::string& embedded_target,
    const base::span<const std::string>& subdomain_labels_so_far,
    const LookalikeTargetAllowlistChecker& in_target_allowlist) {
  const std::string https_scheme =
      url::kHttpsScheme + std::string(url::kStandardSchemeSeparator);
  if (in_target_allowlist.Run(GURL(https_scheme + embedded_target))) {
    return true;
  }
  std::string potential_hostname = embedded_target;
  // Attach each label, from the last to the first, to the front of the
  // embedded target to check if that subdomain has been allowlisted. The index
  // stays unsigned (no size_t -> int narrowing); the post-decrement in the
  // condition stops the loop after index 0 has been processed.
  for (size_t i = subdomain_labels_so_far.size(); i-- > 0;) {
    potential_hostname = subdomain_labels_so_far[i] + "." + potential_hostname;
    if (in_target_allowlist.Run(GURL(https_scheme + potential_hostname))) {
      return true;
    }
  }
  return false;
}
} // namespace
// Constructs a DomainInfo by copying each argument into the member of the same
// name (without the "arg_" prefix).
DomainInfo::DomainInfo(const std::string& arg_hostname,
const std::string& arg_domain_and_registry,
const std::string& arg_domain_without_registry,
const url_formatter::IDNConversionResult& arg_idn_result,
const url_formatter::Skeletons& arg_skeletons)
: hostname(arg_hostname),
domain_and_registry(arg_domain_and_registry),
domain_without_registry(arg_domain_without_registry),
idn_result(arg_idn_result),
skeletons(arg_skeletons) {}
// The destructor and the copy constructor use the compiler-generated
// implementations.
DomainInfo::~DomainInfo() = default;
DomainInfo::DomainInfo(const DomainInfo&) = default;
// Builds the DomainInfo for |url|'s host.
//  - Localhost and non-unique hostnames yield a DomainInfo whose fields are
//    all empty.
//  - Hosts with an empty eTLD+1 keep only their hostname.
//  - Otherwise the eTLD+1, the eTLD+1 without its registry, the IDN conversion
//    result and the skeletons are filled in as well.
DomainInfo GetDomainInfo(const GURL& url) {
  const bool is_local_or_non_unique =
      net::IsLocalhost(url) || net::IsHostnameNonUnique(url.host());
  if (is_local_or_non_unique) {
    return DomainInfo(std::string(), std::string(), std::string(),
                      url_formatter::IDNConversionResult(),
                      url_formatter::Skeletons());
  }

  const std::string hostname = url.host();
  const std::string domain_and_registry = GetETLDPlusOne(hostname);

  // eTLD+1 can be empty for private domains. There is nothing to convert or
  // to compute skeletons from in that case.
  if (domain_and_registry.empty()) {
    return DomainInfo(hostname, domain_and_registry, std::string(),
                      url_formatter::IDNConversionResult(),
                      url_formatter::Skeletons());
  }

  const std::string domain_without_registry =
      url_formatter::top_domains::HostnameWithoutRegistry(domain_and_registry);

  // Compute skeletons using eTLD+1, skipping all spoofing checks. Spoofing
  // checks in url_formatter can cause the converted result to be punycode.
  // We want to avoid this in order to get an accurate skeleton for the unicode
  // version of the domain.
  const url_formatter::IDNConversionResult idn_result =
      url_formatter::UnsafeIDNToUnicodeWithDetails(domain_and_registry);
  return DomainInfo(hostname, domain_and_registry, domain_without_registry,
                    idn_result, url_formatter::GetSkeletons(idn_result.result));
}
std::string GetETLDPlusOne(const std::string& hostname) {
return net::registry_controlled_domains::GetDomainAndRegistry(
hostname, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
}
// Returns true if |str1| can be turned into |str2| with at most one edit,
// where an edit is a single-character insertion, deletion or substitution.
bool IsEditDistanceAtMostOne(const base::string16& str1,
                             const base::string16& str2) {
  const size_t length1 = str1.size();
  const size_t length2 = str2.size();
  // Lengths that differ by two or more always need more than one edit.
  if (length1 > length2 + 1 || length2 > length1 + 1)
    return false;

  size_t pos1 = 0;
  size_t pos2 = 0;
  bool found_edit = false;
  while (pos1 < length1 && pos2 < length2) {
    if (str1[pos1] == str2[pos2]) {
      ++pos1;
      ++pos2;
      continue;
    }
    // A second mismatch means the strings are more than one edit apart.
    if (found_edit)
      return false;
    found_edit = true;
    // Step over the mismatch: when one string is longer, only that string
    // advances (it has an extra character); when both have the same length,
    // both advance (a single character was substituted).
    if (length1 >= length2)
      ++pos1;
    if (length2 >= length1)
      ++pos2;
  }

  // A character left over at the end of either string counts as one more edit.
  const bool has_unmatched_tail = pos1 != length1 || pos2 != length2;
  return !(found_edit && has_unmatched_tail);
}
bool IsTopDomain(const DomainInfo& domain_info) {
// Top domains are only accessible through their skeletons, so query the top
// domains trie for each skeleton of this domain.
for (const std::string& skeleton : domain_info.skeletons) {
const url_formatter::TopDomainEntry top_domain =
url_formatter::LookupSkeletonInTopDomains(skeleton);
if (domain_info.domain_and_registry == top_domain.domain) {
return true;
}
}
return false;
}
// Returns whether a navigation that produced |match_type| should be blocked:
//  - site engagement and top 500 skeleton matches: always;
//  - target embedding matches: only while the
//    kDetectTargetEmbeddingLookalikes feature is enabled;
//  - every other match type: never.
// |navigated_domain| is currently not consulted.
bool ShouldBlockLookalikeUrlNavigation(LookalikeUrlMatchType match_type,
                                       const DomainInfo& navigated_domain) {
  switch (match_type) {
    case LookalikeUrlMatchType::kSiteEngagement:
    case LookalikeUrlMatchType::kSkeletonMatchTop500:
      return true;
    case LookalikeUrlMatchType::kTargetEmbedding:
      return base::FeatureList::IsEnabled(
          lookalikes::features::kDetectTargetEmbeddingLookalikes);
    default:
      return false;
  }
}
// Runs the lookalike heuristics on |navigated_domain| in a fixed order and
// stops at the first one that fires:
//   1. (IDN domains only) skeleton match against |engaged_sites|, then the top
//      domain recorded in the IDN conversion result;
//   2. (edit distance candidates only) edit distance match against
//      |engaged_sites|, then against the top 500 domains;
//   3. target embedding.
// On a match, returns true with |matched_domain| and |match_type| filled in.
// Returns false otherwise; |match_type| is then left untouched, while
// |matched_domain| holds whatever IsTargetEmbeddingLookalike() left in it.
bool GetMatchingDomain(
    const DomainInfo& navigated_domain,
    const std::vector<DomainInfo>& engaged_sites,
    const LookalikeTargetAllowlistChecker& in_target_allowlist,
    std::string* matched_domain,
    LookalikeUrlMatchType* match_type) {
  DCHECK(!navigated_domain.domain_and_registry.empty());
  DCHECK(matched_domain);
  DCHECK(match_type);

  const auto& navigated_etld_plus_one = navigated_domain.domain_and_registry;

  if (navigated_domain.idn_result.has_idn_component) {
    // The navigated domain is IDN: compare its skeletons against engaged
    // sites first.
    const std::string engaged_skeleton_match =
        GetMatchingSiteEngagementDomain(engaged_sites, navigated_domain);
    DCHECK_NE(navigated_etld_plus_one, engaged_skeleton_match);
    if (!engaged_skeleton_match.empty()) {
      *matched_domain = engaged_skeleton_match;
      *match_type = LookalikeUrlMatchType::kSiteEngagement;
      return true;
    }

    // Then fall back to the top domain found during IDN conversion, if any.
    const auto& top_skeleton_match =
        navigated_domain.idn_result.matching_top_domain;
    if (!top_skeleton_match.domain.empty()) {
      // In practice, the navigated domain can't both have IDN and be a top
      // domain since the top domain list does not contain IDNs. Still, sanity
      // check in case the top domain list changes in the future: at this
      // point, the navigated domain should not be a top domain.
      DCHECK_NE(navigated_etld_plus_one, top_skeleton_match.domain);
      *matched_domain = top_skeleton_match.domain;
      if (top_skeleton_match.is_top_500) {
        *match_type = LookalikeUrlMatchType::kSkeletonMatchTop500;
      } else {
        *match_type = LookalikeUrlMatchType::kSkeletonMatchTop5k;
      }
      return true;
    }
  }

  if (url_formatter::top_domains::IsEditDistanceCandidate(
          navigated_etld_plus_one)) {
    // No exact skeleton match: look for an engaged domain within an edit
    // distance of one.
    const std::string nearby_engaged_domain = GetSimilarDomainFromEngagedSites(
        navigated_domain, engaged_sites, in_target_allowlist);
    if (!nearby_engaged_domain.empty() &&
        navigated_etld_plus_one != nearby_engaged_domain) {
      *matched_domain = nearby_engaged_domain;
      *match_type = LookalikeUrlMatchType::kEditDistanceSiteEngagement;
      return true;
    }

    // Finally, look for a top domain within an edit distance of one.
    const std::string nearby_top_domain =
        GetSimilarDomainFromTop500(navigated_domain, in_target_allowlist);
    if (!nearby_top_domain.empty() &&
        navigated_etld_plus_one != nearby_top_domain) {
      *matched_domain = nearby_top_domain;
      *match_type = LookalikeUrlMatchType::kEditDistance;
      return true;
    }
  }

  // Last heuristic: a protected domain embedded in the navigated hostname.
  if (!IsTargetEmbeddingLookalike(navigated_domain.hostname, engaged_sites,
                                  in_target_allowlist, matched_domain)) {
    return false;
  }
  *match_type = LookalikeUrlMatchType::kTargetEmbedding;
  return true;
}
// Records the NavigationSuggestionEvent that corresponds to |match_type|.
// Nothing is recorded for LookalikeUrlMatchType::kNone.
void RecordUMAFromMatchType(LookalikeUrlMatchType match_type) {
  switch (match_type) {
    case LookalikeUrlMatchType::kNone:
      // No match, nothing to report.
      return;
    case LookalikeUrlMatchType::kSiteEngagement:
      RecordEvent(NavigationSuggestionEvent::kMatchSiteEngagement);
      return;
    case LookalikeUrlMatchType::kEditDistance:
      RecordEvent(NavigationSuggestionEvent::kMatchEditDistance);
      return;
    case LookalikeUrlMatchType::kEditDistanceSiteEngagement:
      RecordEvent(NavigationSuggestionEvent::kMatchEditDistanceSiteEngagement);
      return;
    case LookalikeUrlMatchType::kTargetEmbedding:
      RecordEvent(NavigationSuggestionEvent::kMatchTargetEmbedding);
      return;
    case LookalikeUrlMatchType::kSkeletonMatchTop500:
      RecordEvent(NavigationSuggestionEvent::kMatchSkeletonTop500);
      return;
    case LookalikeUrlMatchType::kSkeletonMatchTop5k:
      RecordEvent(NavigationSuggestionEvent::kMatchSkeletonTop5k);
      return;
  }
}
bool IsTargetEmbeddingLookalike(
const std::string& hostname,
const std::vector<DomainInfo>& engaged_sites,
const LookalikeTargetAllowlistChecker& in_target_allowlist,
std::string* safe_hostname) {
const std::string host_without_etld =
url_formatter::top_domains::HostnameWithoutRegistry(hostname);
const std::vector<std::string> hostname_tokens_without_etld =
SplitDomainWithouteTLDIntoTokens(host_without_etld);
// For each token, we look backwards to the previous token to see if
// "|prev_token|.|token|" forms a top domain or a high engaged domain.
std::string prev_token;
// We can have domains separated by '-'s or '.'s. In order to find target
// embedding urls with google.com.com or google-com.com, we get url parts as
// anything that is between two '-'s or '.'s. We check to see if any two
// consecutive tokens form a top or highly-engaged domain.
// Because of the way this matching is working, we can not identify target
// embedding attacks against domains that contain '-' in their address
// (e.g programme-tv.net). Also if the eTLD of the target has more than one
// part, we won't be able to protect it (e.g. google.co.uk).
for (size_t i = 0; i < hostname_tokens_without_etld.size(); i++) {
const std::string token = hostname_tokens_without_etld[i];
const std::string possible_embedded_target = prev_token + "." + token;
if (prev_token.empty()) {
prev_token = token;
continue;
}
prev_token = token;
// Short domains are more likely to be misidentified as being embedded. For
// example "mi.com", "mk.ru", or "com.ru" are a few examples of domains that
// could trigger the target embedding heuristic falsely.
if (possible_embedded_target.size() < 7) {
continue;
}
// We want to protect user's high engaged websites as well as top domains.
GURL possible_target_url(url::kHttpsScheme +
std::string(url::kStandardSchemeSeparator) +
possible_embedded_target);
DomainInfo possible_target_domain = GetDomainInfo(possible_target_url);
// We check if the eTLD+1 is a valid domain, otherwise there is no point in
// checking if it is a top domain or an engaged domain.
if (possible_target_domain.domain_and_registry.empty()) {
continue;
}
*safe_hostname =
GetMatchingSiteEngagementDomain(engaged_sites, possible_target_domain);
// |GetMatchingSiteEngagementDomain| uses skeleton matching, we make sure
// the found engaged site is an exact match of the embedded target.
if (*safe_hostname != possible_embedded_target) {
*safe_hostname = std::string();
}
if (safe_hostname->empty() &&
!IsTop500Domain(possible_target_domain, safe_hostname)) {
continue;
}
// Check if any subdomain is allowlisted.
std::vector<std::string> subdomain_labels_so_far(
hostname_tokens_without_etld.begin(),
hostname_tokens_without_etld.begin() + i - 1);
if (!ASubdomainIsAllowlisted(possible_embedded_target,
subdomain_labels_so_far,
in_target_allowlist)) {
return true;
}
// A target is found but it was allowlisted.
*safe_hostname = std::string();
}
return false;
}