blob: f99c531295f0aab21855e6b6039a570535dc873a [file] [log] [blame]
// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/lookalikes/core/lookalike_url_util.h"
#include <algorithm>
#include <string_view>
#include <utility>
#include "base/compiler_specific.h"
#include "base/containers/contains.h"
#include "base/functional/callback.h"
#include "base/hash/sha1.h"
#include "base/i18n/char_iterator.h"
#include "base/metrics/histogram_macros.h"
#include "base/no_destructor.h"
#include "base/strings/strcat.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/trace_event/trace_event.h"
#include "base/values.h"
#include "build/build_config.h"
#include "components/lookalikes/core/safety_tips_config.h"
#include "components/security_interstitials/core/pref_names.h"
#include "components/url_formatter/spoof_checks/common_words/common_words_util.h"
#include "components/url_formatter/spoof_checks/top_domains/top_bucket_domains.h"
#include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h"
#include "components/url_formatter/url_formatter.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "net/base/url_util.h"
#include "third_party/icu/source/common/unicode/uchar.h"
#include "third_party/icu/source/common/unicode/utypes.h"
using lookalikes::ComboSquattingParams;
using lookalikes::DomainInfo;
using lookalikes::GetDomainInfo;
using lookalikes::HasOneCharacterSwap;
using lookalikes::IsEditDistanceAtMostOne;
using lookalikes::LookalikeTargetAllowlistChecker;
using lookalikes::LookalikeUrlMatchType;
using lookalikes::NavigationSuggestionEvent;
using lookalikes::TopBucketDomainsParams;
namespace {
// Digits. Used for trimming domains in Edit Distance heuristic matches. Domains
// that only differ by trailing digits (e.g. a1.tld and a2.tld) are ignored.
const char kDigitChars[] = "0123456789";
// Minimum length of e2LD protected against target embedding. For example,
// foo.bar.baz.com-evil.com embeds foo.bar.baz.com, but we don't flag it since
// "baz" is shorter than kMinTargetE2LDLength.
const size_t kMinE2LDLengthForTargetEmbedding = 4;
// We might not protect a domain whose e2LD is a common word in target embedding
// based on the TLD that is paired with it. This list supplements words from
// url_formatter::common_words::IsCommonWord().
constexpr const char* kLocalAdditionalCommonWords[] = {"asahi", "hoteles",
"jharkhand", "nifty"};
// These domains are plausible lookalike targets, but they also use common words
// in their names. Selectively prevent flagging embeddings where the embedder
// ends in "-DOMAIN.TLD", since these tend to have higher false positive rates.
constexpr const char* kDomainsPermittedInEndEmbeddings[] = {
"office.com", "medium.com", "orange.fr"};
// What separators can be used to separate tokens in target embedding spoofs?
// e.g. www-google.com.example.com uses "-" (www-google) and "." (google.com).
const char kTargetEmbeddingSeparators[] = "-.";
// A small subset of private registries on the PSL that act like public
// registries AND are a common source of false positives in lookalike checks. We
// treat them as public for the purposes of lookalike checks.
constexpr const char* kPrivateRegistriesTreatedAsPublic[] = {"com.de",
"com.se"};
TopBucketDomainsParams* GetTopDomainParams() {
static TopBucketDomainsParams params{
top_bucket_domains::kTopBucketEditDistanceSkeletons,
top_bucket_domains::kNumTopBucketEditDistanceSkeletons};
return &params;
}
// Minimum length of the eTLD+1 without registry needed to show the punycode
// interstitial. IDN whose eTLD+1 without registry is shorter than this are
// still displayed in punycode, but don't show an interstitial.
const size_t kMinimumE2LDLengthToShowPunycodeInterstitial = 2;
// Default launch percentage of a new heuristic on Canary/Dev and Beta. These
// are used if there is a launch config for the heuristic in the proto.
const int kDefaultLaunchPercentageOnCanaryDev = 90;
const int kDefaultLaunchPercentageOnBeta = 50;
// Define skeletons of brand names and popular keywords for using in Combo
// Squatting heuristic. These lists are manually curated using Chrome metrics.
// We will check combinations of brand names and popular keywords.
// e. g. google-login.com or youtubesecure.com.
// For every brand name, brand_name[.]com should be checked to be valid. If
// no matched domain is found in top domains, brand_name[.]com will be
// suggested to the user for navigation.
// If brand_name[.]com is not valid for any brand name, each brand name should
// be mapped to a valid url manually and the data structure of
// ForCSQ should be changed accordingly.
// In each element of `kBrandNamesForCSQ`, first string is an original brand
// name and second string is its skeleton. If you are adding a brand name here,
// you can generate its skeleton using the format_url binary
// (components/url_formatter/tools/format_url.cc)
// TODO(crbug.com/40855941): Generate skeletons of hard coded brand names in
// Chrome initialization and remove manual adding of skeletons to this list.
constexpr std::string_view kBrandNamesForCSQ[][2] = {
{"adobe", "adobe"},
{"airbnb", "airbnb"},
{"alibaba", "alibaba"},
{"aliexpress", "aliexpress"},
{"amazon", "arnazon"},
{"baidu", "baidu"},
{"bestbuy", "bestbuy"},
{"blogspot", "blogspot"},
{"costco", "costco"},
{"craigslist", "craigslist"},
{"dropbox", "dropbox"},
{"expedia", "expedia"},
{"facebook", "facebook"},
{"fedex", "fedex"},
{"flickr", "flickr"},
{"github", "github"},
{"glassdoor", "glassdoor"},
{"gofundme", "gofundrne"},
{"google", "google"},
{"homedepot", "hornedepot"},
{"icloud", "icloud"},
{"indeed", "indeed"},
{"instagram", "instagrarn"},
{"intuit", "intuit"},
{"microsoft", "rnicrosoft"},
{"nbcnews", "nbcnews"},
{"netflix", "netflix"},
{"norton", "norton"},
{"nytimes", "nytirnes"},
{"office365", "office365"},
{"paypal", "paypal"},
{"pinterest", "pinterest"},
{"playstation", "playstation"},
{"quora", "quora"},
{"reddit", "reddit"},
{"reuters", "reuters"},
{"samsung", "sarnsung"},
{"spotify", "spotify"},
{"stackexchange", "stackexchange"},
{"stackoverflow", "stackoverflow"},
{"trello", "trello"},
{"twitch", "twitch"},
{"twitter", "twitter"},
{"uderny", "udemy"},
{"wikipedia", "wikipedia"},
{"wordpress", "wordpress"},
{"xfinity", "xfinity"},
{"yahoo", "yahoo"},
{"youtube", "youtube"},
{"zillow", "zillow"}};
// Each element in `kSkeletonsOfPopularKeywordsForCSQ` is a skeleton of a
// popular keyword. In contrast to `kBrandNamesForCSQ`, the original keywords
// are not included. Because in `kBrandNamesForCSQ`, original brand names are
// used to generate the matched domain, and original keywords are not needed for
// that process. If you are adding a keyword here, you can generate its skeleton
// using the format_url binary (components/url_formatter/tools/format_url.cc)
constexpr std::string_view kSkeletonsOfPopularKeywordsForCSQ[] = {
// Security
"account", "activate", "adrnin", "coin", "crypto", "login", "logout",
"password", "secure", "security", "signin", "signout", "wallet"};
// Minimum length of brand to be checked for Combo Squatting.
const size_t kMinBrandNameLengthForComboSquatting = 4;
ComboSquattingParams* GetComboSquattingParams() {
static base::NoDestructor<ComboSquattingParams> params(
{kBrandNamesForCSQ, kSkeletonsOfPopularKeywordsForCSQ});
return params.get();
}
bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1,
const url_formatter::Skeletons& skeletons2) {
DCHECK(!skeletons1.empty());
DCHECK(!skeletons2.empty());
for (const std::string& skeleton1 : skeletons1) {
if (base::Contains(skeletons2, skeleton1)) {
return true;
}
}
return false;
}
// Returns a site that the user has used before that the eTLD+1 in
// |domain_and_registry| may be attempting to spoof, based on skeleton
// comparison.
std::string GetMatchingSiteEngagementDomain(
const std::vector<DomainInfo>& engaged_sites,
const DomainInfo& navigated_domain) {
DCHECK(!navigated_domain.domain_and_registry.empty());
for (const DomainInfo& engaged_site : engaged_sites) {
DCHECK(!engaged_site.domain_and_registry.empty());
if (SkeletonsMatch(navigated_domain.skeletons, engaged_site.skeletons)) {
return engaged_site.domain_and_registry;
}
}
return std::string();
}
// Scans the top sites list and returns true if it finds a domain with an edit
// distance or character swap of one to |domain_and_registry|. This search is
// done in lexicographic order on the top 500 suitable domains, instead of in
// order by popularity. This means that the resulting "similar" domain may not
// be the most popular domain that matches.
bool GetSimilarDomainFromTopBucket(
const DomainInfo& navigated_domain,
const LookalikeTargetAllowlistChecker& target_allowlisted,
std::string* matched_domain,
LookalikeUrlMatchType* match_type) {
TopBucketDomainsParams* top_bucket_domain_params = GetTopDomainParams();
for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
for (size_t i = 0;
i < top_bucket_domain_params->num_edit_distance_skeletons; i++) {
const char* const top_domain_skeleton =
UNSAFE_TODO(top_bucket_domain_params->edit_distance_skeletons[i]);
DCHECK(strlen(top_domain_skeleton));
// Check edit distance on skeletons.
if (IsEditDistanceAtMostOne(base::UTF8ToUTF16(navigated_skeleton),
base::UTF8ToUTF16(top_domain_skeleton))) {
const std::string top_domain =
url_formatter::LookupSkeletonInTopDomains(
top_domain_skeleton, url_formatter::SkeletonType::kFull)
.domain;
DCHECK(!top_domain.empty());
if (!IsLikelyEditDistanceFalsePositive(navigated_domain,
GetDomainInfo(top_domain)) &&
!target_allowlisted.Run(top_domain)) {
*matched_domain = top_domain;
*match_type = LookalikeUrlMatchType::kEditDistance;
return true;
}
}
// Check character swap on skeletons.
// TODO(crbug.com/40707797): Also check character swap on actual hostnames
// with diacritics etc removed. This is because some characters have two
// character skeletons such as m -> rn, and this prevents us from
// detecting character swaps between example.com and exapmle.com.
if (HasOneCharacterSwap(base::UTF8ToUTF16(navigated_skeleton),
base::UTF8ToUTF16(top_domain_skeleton))) {
const std::string top_domain =
url_formatter::LookupSkeletonInTopDomains(
top_domain_skeleton, url_formatter::SkeletonType::kFull)
.domain;
DCHECK(!top_domain.empty());
if (!IsLikelyCharacterSwapFalsePositive(navigated_domain,
GetDomainInfo(top_domain)) &&
!target_allowlisted.Run(top_domain)) {
*matched_domain = top_domain;
*match_type = LookalikeUrlMatchType::kCharacterSwapTop500;
return true;
}
}
}
}
return false;
}
// Scans the engaged site list for edit distance and character swap matches.
// Returns true if there is a match and fills |matched_domain| with the first
// matching engaged domain and |match_type| with the matching heuristic type.
bool GetSimilarDomainFromEngagedSites(
const DomainInfo& navigated_domain,
const std::vector<DomainInfo>& engaged_sites,
const LookalikeTargetAllowlistChecker& target_allowlisted,
std::string* matched_domain,
LookalikeUrlMatchType* match_type) {
for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
for (const DomainInfo& engaged_site : engaged_sites) {
DCHECK_NE(navigated_domain.domain_and_registry,
engaged_site.domain_and_registry);
if (!url_formatter::top_domains::IsEditDistanceCandidate(
engaged_site.domain_and_registry)) {
continue;
}
// Skip past domains that are allowed to be spoofed.
if (target_allowlisted.Run(engaged_site.domain_and_registry)) {
continue;
}
for (const std::string& engaged_skeleton : engaged_site.skeletons) {
// Check edit distance on skeletons.
if (IsEditDistanceAtMostOne(base::UTF8ToUTF16(navigated_skeleton),
base::UTF8ToUTF16(engaged_skeleton)) &&
!IsLikelyEditDistanceFalsePositive(navigated_domain,
engaged_site)) {
*matched_domain = engaged_site.domain_and_registry;
*match_type = LookalikeUrlMatchType::kEditDistanceSiteEngagement;
return true;
}
// Check character swap on skeletons.
if (HasOneCharacterSwap(base::UTF8ToUTF16(navigated_skeleton),
base::UTF8ToUTF16(engaged_skeleton)) &&
!IsLikelyCharacterSwapFalsePositive(navigated_domain,
engaged_site)) {
*matched_domain = engaged_site.domain_and_registry;
*match_type = LookalikeUrlMatchType::kCharacterSwapSiteEngagement;
return true;
}
}
}
}
// Also check character swap on actual hostnames with diacritics etc removed.
// This is because some characters have two character skeletons such as m ->
// rn, and this prevents us from detecting character swaps between example.com
// and exapmle.com.
const std::u16string navigated_hostname_without_diacritics =
url_formatter::MaybeRemoveDiacritics(navigated_domain.idn_result.result);
if (navigated_hostname_without_diacritics !=
navigated_domain.idn_result.result) {
for (const DomainInfo& engaged_site : engaged_sites) {
DCHECK_NE(navigated_domain.domain_and_registry,
engaged_site.domain_and_registry);
const std::u16string engaged_hostname_without_diacritics =
url_formatter::MaybeRemoveDiacritics(engaged_site.idn_result.result);
if (HasOneCharacterSwap(navigated_hostname_without_diacritics,
engaged_hostname_without_diacritics)) {
*matched_domain = engaged_site.domain_and_registry;
*match_type = LookalikeUrlMatchType::kCharacterSwapSiteEngagement;
return true;
}
}
}
return false;
}
std::optional<NavigationSuggestionEvent> ToNavigationSuggestionEvent(
LookalikeUrlMatchType match_type) {
switch (match_type) {
case LookalikeUrlMatchType::kSkeletonMatchSiteEngagement:
return NavigationSuggestionEvent::kMatchSiteEngagement;
case LookalikeUrlMatchType::kEditDistance:
return NavigationSuggestionEvent::kMatchEditDistance;
case LookalikeUrlMatchType::kEditDistanceSiteEngagement:
return NavigationSuggestionEvent::kMatchEditDistanceSiteEngagement;
case LookalikeUrlMatchType::kTargetEmbedding:
return NavigationSuggestionEvent::kMatchTargetEmbedding;
case LookalikeUrlMatchType::kSkeletonMatchTop500:
return NavigationSuggestionEvent::kMatchSkeletonTop500;
case LookalikeUrlMatchType::kSkeletonMatchTop5k:
return NavigationSuggestionEvent::kMatchSkeletonTop5k;
case LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips:
return NavigationSuggestionEvent::kMatchTargetEmbeddingForSafetyTips;
case LookalikeUrlMatchType::kFailedSpoofChecks:
return NavigationSuggestionEvent::kFailedSpoofChecks;
case LookalikeUrlMatchType::kCharacterSwapSiteEngagement:
return NavigationSuggestionEvent::kMatchCharacterSwapSiteEngagement;
case LookalikeUrlMatchType::kCharacterSwapTop500:
return NavigationSuggestionEvent::kMatchCharacterSwapTop500;
case LookalikeUrlMatchType::kComboSquatting:
return NavigationSuggestionEvent::kComboSquatting;
case LookalikeUrlMatchType::kComboSquattingSiteEngagement:
return NavigationSuggestionEvent::kComboSquattingSiteEngagement;
case LookalikeUrlMatchType::kNone:
return std::nullopt;
}
}
// Returns the parts of the domain that are separated by "." or "-", not
// including the eTLD.
//
// |hostname| must outlive the return value since the vector contains
// StringPieces.
std::vector<std::string_view> SplitDomainIntoTokens(
const std::string& hostname) {
return base::SplitStringPiece(hostname, kTargetEmbeddingSeparators,
base::TRIM_WHITESPACE,
base::SPLIT_WANT_NONEMPTY);
}
// Returns whether any subdomain ending in the last entry of |domain_labels| is
// allowlisted. e.g. if domain_labels = {foo,scholar,google,com}, checks the
// allowlist for google.com, scholar.google.com, and foo.scholar.google.com.
bool ASubdomainIsAllowlisted(
const base::span<const std::string_view>& domain_labels,
const LookalikeTargetAllowlistChecker& in_target_allowlist) {
CHECK_GT(domain_labels.size(), 1u);
std::string potential_hostname(domain_labels.back());
// Attach each token from the end to the embedded target to check if that
// subdomain has been allowlisted.
for (size_t i = domain_labels.size() - 1; i; --i) {
potential_hostname =
base::StrCat({domain_labels[i - 1], ".", potential_hostname});
if (in_target_allowlist.Run(potential_hostname)) {
return true;
}
}
return false;
}
// Returns the top domain if the top domain without its separators matches the
// |potential_target| (e.g. googlecom). The matching is a skeleton matching.
std::string GetMatchingTopDomainWithoutSeparators(
std::string_view potential_target) {
const url_formatter::Skeletons skeletons =
url_formatter::GetSkeletons(base::UTF8ToUTF16(potential_target));
for (const auto& skeleton : skeletons) {
url_formatter::TopDomainEntry matched_domain =
url_formatter::LookupSkeletonInTopDomains(
skeleton, url_formatter::SkeletonType::kSeparatorsRemoved);
if (!matched_domain.domain.empty() &&
matched_domain.skeleton_type ==
url_formatter::SkeletonType::kSeparatorsRemoved) {
return matched_domain.domain;
}
}
return std::string();
}
// Returns whether the visited domain is either for a bare eTLD+1 (e.g.
// 'google.com') or a trivial subdomain (e.g. 'www.google.com').
bool IsETLDPlusOneOrTrivialSubdomain(const DomainInfo& host) {
return (host.domain_and_registry == host.hostname ||
"www." + host.domain_and_registry == host.hostname);
}
// Returns if |etld_plus_one| shares the skeleton of an eTLD+1 with an engaged
// site or a top bucket domain. |embedded_target| is set to matching eTLD+1.
bool DoesETLDPlus1MatchTopDomainOrEngagedSite(
const DomainInfo& domain,
const std::vector<DomainInfo>& engaged_sites,
std::string* embedded_target) {
for (const auto& skeleton : domain.skeletons) {
for (const auto& engaged_site : engaged_sites) {
// Skeleton matching only calculates skeletons of the eTLD+1, so only
// consider engaged sites that are bare eTLD+1s (or a trivial subdomain)
// and are a skeleton match.
if (IsETLDPlusOneOrTrivialSubdomain(engaged_site) &&
base::Contains(engaged_site.skeletons, skeleton)) {
*embedded_target = engaged_site.domain_and_registry;
return true;
}
}
}
for (const auto& skeleton : domain.skeletons) {
const url_formatter::TopDomainEntry top_domain =
url_formatter::LookupSkeletonInTopDomains(
skeleton, url_formatter::SkeletonType::kFull);
if (!top_domain.domain.empty() && top_domain.is_top_bucket) {
*embedded_target = top_domain.domain;
return true;
}
}
return false;
}
// Returns whether the e2LD of the provided domain is a common word (e.g.
// weather.com, ask.com). Target embeddings of these domains are often false
// positives (e.g. "super-best-fancy-hotels.com" isn't spoofing "hotels.com").
bool UsesCommonWord(const reputation::SafetyTipsConfig* config_proto,
const DomainInfo& domain) {
// kDomainsPermittedInEndEmbeddings are based on domains with common words,
// but they should not be excluded here (and instead are checked later).
for (auto* permitted_ending : kDomainsPermittedInEndEmbeddings) {
if (domain.domain_and_registry == permitted_ending) {
return false;
}
}
// Search for words in the big common word list.
if (url_formatter::common_words::IsCommonWord(
domain.domain_without_registry)) {
return true;
}
// Search for words in the component-provided word list.
if (lookalikes::IsCommonWordInConfigProto(config_proto,
domain.domain_without_registry)) {
return true;
}
// Search for words in the local word lists.
for (auto* common_word : kLocalAdditionalCommonWords) {
if (domain.domain_without_registry == common_word) {
return true;
}
}
return false;
}
// Returns whether |domain_labels| is in the same domain as embedding_domain.
// e.g. IsEmbeddingItself(["foo", "example", "com"], "example.com") -> true
// since foo.example.com is in the same domain as example.com.
bool IsEmbeddingItself(const base::span<const std::string_view>& domain_labels,
const std::string& embedding_domain) {
DCHECK(domain_labels.size() >= 2);
std::string potential_hostname(domain_labels.back());
// Attach each token from the end to the embedded target to check if that
// subdomain is the embedding domain. (e.g. using the earlier example, check
// each ["com", "example.com", "foo.example.com"] against "example.com".
for (size_t i = domain_labels.size() - 1; i; --i) {
potential_hostname =
base::StrCat({domain_labels[i - 1], ".", potential_hostname});
if (embedding_domain == potential_hostname) {
return true;
}
}
return false;
}
// Identical to url_formatter::top_domains::HostnameWithoutRegistry(), but
// respects de-facto public registries like .com.de using similar logic to
// GetETLDPlusOne. See kPrivateRegistriesTreatedAsPublic definition for more
// details. e.g. "google.com.de" returns "google". Call with an eTLD+1, not a
// full hostname.
std::string GetE2LDWithDeFactoPublicRegistries(
const std::string& domain_and_registry) {
if (domain_and_registry.empty()) {
return std::string();
}
size_t registry_size =
net::registry_controlled_domains::PermissiveGetHostRegistryLength(
domain_and_registry.c_str(),
net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
const size_t private_registry_size =
net::registry_controlled_domains::PermissiveGetHostRegistryLength(
domain_and_registry.c_str(),
net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
// If the registry lengths are the same using public and private registries,
// than this is just a public registry domain. Otherwise, we need to check if
// the registry ends with one of our anointed registries.
if (registry_size != private_registry_size) {
for (const auto* private_registry : kPrivateRegistriesTreatedAsPublic) {
if (base::EndsWith(domain_and_registry, private_registry)) {
registry_size = private_registry_size;
}
}
}
std::string out =
domain_and_registry.substr(0, domain_and_registry.size() - registry_size);
base::TrimString(out, ".", &out);
return out;
}
// Returns whether |embedded_target| and |embedding_domain| share the same e2LD,
// (as in, e.g., google.com and google.org, or airbnb.com.br and airbnb.com).
// Assumes |embedding_domain| is an eTLD+1. Respects de-facto public eTLDs.
bool IsCrossTLDMatch(const DomainInfo& embedded_target,
const std::string& embedding_domain) {
return (
GetE2LDWithDeFactoPublicRegistries(embedded_target.domain_and_registry) ==
GetE2LDWithDeFactoPublicRegistries(embedding_domain));
}
// Returns whether |embedded_target| is one of kDomainsPermittedInEndEmbeddings
// and that |embedding_domain| ends with that domain, e.g. "evil-office.com" is
// permitted, as "office.com" is in kDomainsPermittedInEndEmbeddings. Only
// impacts Target Embedding matches.
bool EndsWithPermittedDomains(const DomainInfo& embedded_target,
const std::string& embedding_domain) {
for (auto* permitted_ending : kDomainsPermittedInEndEmbeddings) {
if (embedded_target.domain_and_registry == permitted_ending &&
base::EndsWith(embedding_domain,
base::StrCat({"-", permitted_ending}))) {
return true;
}
}
return false;
}
// A domain is allowed to be embedded if is embedding itself, if its e2LD is a
// common word, any valid partial subdomain is allowlisted, or if it's a
// cross-TLD match (e.g. google.com vs google.com.mx).
bool IsAllowedToBeEmbedded(
const DomainInfo& embedded_target,
const base::span<const std::string_view>& subdomain_span,
const LookalikeTargetAllowlistChecker& in_target_allowlist,
const std::string& embedding_domain,
const reputation::SafetyTipsConfig* config_proto) {
return UsesCommonWord(config_proto, embedded_target) ||
ASubdomainIsAllowlisted(subdomain_span, in_target_allowlist) ||
IsEmbeddingItself(subdomain_span, embedding_domain) ||
IsCrossTLDMatch(embedded_target, embedding_domain) ||
EndsWithPermittedDomains(embedded_target, embedding_domain);
}
// Returns the first character of the first string that is different from the
// second string. Strings should be at least 1 edit distance apart.
char GetFirstDifferentChar(const std::string& str1, const std::string& str2) {
std::string::const_iterator i1 = str1.begin();
std::string::const_iterator i2 = str2.begin();
while (i1 != str1.end() && i2 != str2.end()) {
if (*i1 != *i2) {
return *i1;
}
i1++;
i2++;
}
NOTREACHED();
}
// Brand names with length of 4 or less should not be checked in domains for
// Combo Squatting. Short brand names can cause false positives in results.
bool IsComboSquattingCandidate(const std::string& brand) {
return brand.size() > kMinBrandNameLengthForComboSquatting;
}
// Extract brand names from engaged sites to be checked for Combo Squatting, if
// the brand is not one of the hard coded brand names.
std::vector<std::pair<std::string, std::string>> GetBrandNamesFromEngagedSites(
const std::vector<DomainInfo>& engaged_sites) {
std::vector<std::pair<std::string, std::string>> output;
for (const DomainInfo& engaged_site : engaged_sites) {
url_formatter::Skeletons domain_without_registry_skeletons =
engaged_site.domain_without_registry_skeletons;
for (const std::string& skeleton : domain_without_registry_skeletons)
if (IsComboSquattingCandidate(engaged_site.domain_without_registry)) {
std::pair<std::string, std::string> brand_name = {
engaged_site.domain_without_registry, skeleton};
output.emplace_back(brand_name);
}
}
return output;
}
// Registry of the navigated domain is needed to find matched_domain
// in Combo Squatting domains. For example, registry of
// `google-login[.]co[.]br` is `co[.]br`.
std::string GetRegistry(const DomainInfo& navigated_domain) {
size_t registry_size = navigated_domain.domain_and_registry.size() -
navigated_domain.domain_without_registry.size() - 1;
std::string domain_and_registry = navigated_domain.domain_and_registry;
std::string registry =
domain_and_registry.substr(domain_and_registry.size() - registry_size,
domain_and_registry.size() - 1);
return registry;
}
// If a matched domain including the brand name and TLD of
// navigated domain is found in top domains, |matched_domain|
// is set to the found top domain. Otherwise, |matched_domain| will
// be set to brand_name[.]com. Hard coded brand names should be checked to have
// valid brand_name[.]com url.
std::string FindMatchedDomainForHardCodedComboSquatting(
const std::string& brand_name,
const DomainInfo& navigated_domain) {
DomainInfo suggested_matched_domain =
GetDomainInfo(brand_name + '.' + GetRegistry(navigated_domain));
if (url_formatter::IsDomainAndRegistryATopDomain(
suggested_matched_domain.domain_and_registry)) {
return suggested_matched_domain.hostname;
} else {
return brand_name + ".com";
}
}
// Engaged sites are sorted based on engagement score, so |matched_domain|
// will be set to the first domain in the engaged sites lists that includes
// the brand name of the navigated domain.
std::string FindMatchedDomainForSiteEngagementComboSquatting(
const std::string& brand_name,
const DomainInfo& navigated_domain,
const std::vector<DomainInfo>& engaged_sites) {
for (auto& engaged_site : engaged_sites) {
if (brand_name == engaged_site.domain_without_registry) {
return engaged_site.hostname;
}
}
return std::string();
}
// Returns true if the navigated_domain is flagged as Combo Squatting.
// matched_domain is the suggested domain that will be shown to the user
// instead of the navigated_domain in the warning UI.
bool IsComboSquatting(
const std::vector<std::pair<std::string, std::string>>& brand_names,
const ComboSquattingParams& combo_squatting_params,
const DomainInfo& navigated_domain,
const std::vector<DomainInfo>& engaged_sites,
std::string* matched_domain,
bool is_hard_coded) {
// Check if the domain has any brand name and any popular keyword.
for (auto& brand : brand_names) {
auto brand_name = brand.first;
auto brand_skeleton = brand.second;
DCHECK(IsComboSquattingCandidate(brand_name));
for (auto& skeleton : navigated_domain.domain_without_registry_skeletons) {
size_t brand_skeleton_pos = skeleton.find(brand_skeleton);
if (skeleton.size() == brand_skeleton.size() ||
brand_skeleton_pos == std::string::npos) {
continue;
}
for (auto keyword : combo_squatting_params.popular_keywords) {
size_t keyword_pos = skeleton.find(keyword);
if (keyword_pos == std::string::npos) {
// Keyword not found, ignore.
continue;
}
if (std::string(brand_skeleton).find(keyword) != std::string::npos ||
std::string(keyword).find(brand_skeleton) != std::string::npos) {
// Keyword is a substring of brand or vice versa, ignore.
continue;
}
if ((keyword_pos > brand_skeleton_pos &&
keyword_pos < brand_skeleton_pos + brand_skeleton.size()) ||
(brand_skeleton_pos > keyword_pos &&
brand_skeleton_pos < keyword_pos + keyword.size())) {
// Keyword and brand overlap, ignore.
continue;
}
if (is_hard_coded) {
*matched_domain = FindMatchedDomainForHardCodedComboSquatting(
brand_name, navigated_domain);
} else {
*matched_domain = FindMatchedDomainForSiteEngagementComboSquatting(
brand_name, navigated_domain, engaged_sites);
}
return true;
}
}
}
return false;
}
// Hostnames containing these strings are considered unsafe due to ligature
// rendering in some fonts.
constexpr const char* kUnsafeLigatures[] = {
"g_logo", "o_logo", "l_logo", "e_logo",
// google_logo is also unsafe, but e_logo is its substring.
// super_g_logo is also unsafe, but g_logo is its substring.
"google_g", "glogoligature", "ologoligature", "llogoligature",
"elogoligature",
// googlelogoligature is also unsafe, but elogoligature is its
// substring
};
bool IsUnsafeLigature(const DomainInfo& domain) {
for (const char* unsafe_ligature : kUnsafeLigatures) {
if (domain.hostname.find(unsafe_ligature) != std::string::npos) {
return true;
}
}
return false;
}
} // namespace
namespace lookalikes {
const char kInterstitialHistogramName[] = "NavigationSuggestion.Event2";
const char kIncognitoInterstitialHistogramName[] =
"NavigationSuggestion.Event2.Incognito";
void RegisterProfilePrefs(user_prefs::PrefRegistrySyncable* registry) {
registry->RegisterListPref(prefs::kLookalikeWarningAllowlistDomains);
}
std::string GetConsoleMessage(const GURL& lookalike_url,
bool is_new_heuristic) {
const char* const kNewHeuristicMessage =
"Future Chrome versions will show a warning on this domain name.\n";
return base::StrCat({"Chrome has determined that ", lookalike_url.host(),
" could be fake or fraudulent.\n\n",
is_new_heuristic ? kNewHeuristicMessage : "",
"If you believe this is shown in error please visit "
"https://g.co/chrome/lookalike-warnings"});
}
DomainInfo::DomainInfo(
const std::string& arg_hostname,
const std::string& arg_domain_and_registry,
const std::string& arg_domain_without_registry,
const url_formatter::IDNConversionResult& arg_idn_result,
const url_formatter::Skeletons& arg_skeletons,
const url_formatter::Skeletons& arg_domain_without_registry_skeletons)
: hostname(arg_hostname),
domain_and_registry(arg_domain_and_registry),
domain_without_registry(arg_domain_without_registry),
idn_result(arg_idn_result),
skeletons(arg_skeletons),
domain_without_registry_skeletons(arg_domain_without_registry_skeletons) {
}
DomainInfo::~DomainInfo() = default;
DomainInfo::DomainInfo(const DomainInfo&) = default;
DomainInfo GetDomainInfo(const std::string& hostname) {
TRACE_EVENT0("navigation", "GetDomainInfo");
if (net::HostStringIsLocalhost(hostname) ||
net::IsHostnameNonUnique(hostname)) {
return DomainInfo(std::string(), std::string(), std::string(),
url_formatter::IDNConversionResult(),
url_formatter::Skeletons(), url_formatter::Skeletons());
}
const std::string domain_and_registry = GetETLDPlusOne(hostname);
const std::string domain_without_registry =
domain_and_registry.empty()
? std::string()
: url_formatter::top_domains::HostnameWithoutRegistry(
domain_and_registry);
// eTLD+1 can be empty for private domains.
if (domain_and_registry.empty()) {
return DomainInfo(hostname, domain_and_registry, domain_without_registry,
url_formatter::IDNConversionResult(),
url_formatter::Skeletons(), url_formatter::Skeletons());
}
// Compute skeletons using eTLD+1, skipping all spoofing checks. Spoofing
// checks in url_formatter can cause the converted result to be punycode.
// We want to avoid this in order to get an accurate skeleton for the unicode
// version of the domain.
const url_formatter::IDNConversionResult idn_result =
url_formatter::UnsafeIDNToUnicodeWithDetails(domain_and_registry);
const url_formatter::Skeletons skeletons =
url_formatter::GetSkeletons(idn_result.result);
const url_formatter::IDNConversionResult domain_without_registry_idn_result =
url_formatter::UnsafeIDNToUnicodeWithDetails(domain_without_registry);
const url_formatter::Skeletons domain_without_registry_skeletons =
url_formatter::GetSkeletons(domain_without_registry_idn_result.result);
return DomainInfo(hostname, domain_and_registry, domain_without_registry,
idn_result, skeletons, domain_without_registry_skeletons);
}
DomainInfo GetDomainInfo(const GURL& url) {
return GetDomainInfo(url.GetHost());
}
std::string GetETLDPlusOne(const std::string& hostname) {
auto pub = net::registry_controlled_domains::GetDomainAndRegistry(
hostname, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
auto priv = net::registry_controlled_domains::GetDomainAndRegistry(
hostname, net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
// If there is no difference in eTLD+1 with/without private registries, then
// the domain uses a public registry and we can return the eTLD+1 safely.
if (pub == priv) {
return pub;
}
// Otherwise, the domain uses a private registry and |pub| is that private
// registry. If it's a de-facto-public registry, return the private eTLD+1.
for (auto* private_registry : kPrivateRegistriesTreatedAsPublic) {
if (private_registry == pub) {
return priv;
}
}
// Otherwise, ignore the normal private registry and return the public eTLD+1.
return pub;
}
bool IsEditDistanceAtMostOne(const std::u16string& str1,
const std::u16string& str2) {
if (str1.size() > str2.size() + 1 || str2.size() > str1.size() + 1) {
return false;
}
std::u16string::const_iterator i = str1.begin();
std::u16string::const_iterator j = str2.begin();
size_t edit_count = 0;
while (i != str1.end() && j != str2.end()) {
if (*i == *j) {
i++;
j++;
} else {
edit_count++;
if (edit_count > 1) {
return false;
}
if (str1.size() > str2.size()) {
// First string is longer than the second. This can only happen if the
// first string has an extra character.
i++;
} else if (str2.size() > str1.size()) {
// Second string is longer than the first. This can only happen if the
// second string has an extra character.
j++;
} else {
// Both strings are the same length. This can only happen if the two
// strings differ by a single character.
i++;
j++;
}
}
}
if (i != str1.end() || j != str2.end()) {
// A character at the end did not match.
edit_count++;
}
return edit_count <= 1;
}
bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain,
const DomainInfo& matched_domain) {
DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
matched_domain.domain_and_registry));
DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
navigated_domain.domain_and_registry));
// If the only difference between the domains is the registry part, this is
// unlikely to be a spoofing attempt and we should ignore this match. E.g.
// exclude matches like google.com.tw and google.com.tr.
if (navigated_domain.domain_without_registry ==
matched_domain.domain_without_registry) {
return true;
}
// If the domains only differ by a numeric suffix on their e2LD (e.g.
// site45.tld and site35.tld), then ignore the match.
auto nav_trimmed = base::TrimString(navigated_domain.domain_without_registry,
kDigitChars, base::TRIM_TRAILING);
auto matched_trimmed = base::TrimString(
matched_domain.domain_without_registry, kDigitChars, base::TRIM_TRAILING);
DCHECK_NE(navigated_domain.domain_without_registry,
matched_domain.domain_without_registry);
// We previously verified that the domains without registries weren't equal,
// so if they're equal now, the match must have come from numeric suffixes.
if (nav_trimmed == matched_trimmed) {
return true;
}
// Ignore domains that only differ by an insertion/substitution at the
// start, as these are usually different words, not lookalikes.
const auto nav_dom_len = navigated_domain.domain_and_registry.length();
const auto matched_dom_len = matched_domain.domain_and_registry.length();
const auto& nav_dom = navigated_domain.domain_and_registry;
const auto& matched_dom = matched_domain.domain_and_registry;
if (nav_dom_len == matched_dom_len) {
// e.g. hank vs tank
if (nav_dom.substr(1) == matched_dom.substr(1)) {
return true;
}
} else if (nav_dom_len < matched_dom_len) {
// e.g. oodle vs poodle
if (nav_dom == matched_dom.substr(1)) {
return true;
}
} else { // navigated_dom_len > matched_dom_len
// e.g. poodle vs oodle
if (nav_dom.substr(1) == matched_dom) {
return true;
}
}
// Ignore domains that only differ by an insertion of a "-".
if (nav_dom_len != matched_dom_len) {
if (nav_dom_len < matched_dom_len &&
GetFirstDifferentChar(matched_dom, nav_dom) == '-') {
return true;
} else if (nav_dom_len > matched_dom_len &&
GetFirstDifferentChar(nav_dom, matched_dom) == '-') {
return true;
}
}
return false;
}
bool IsLikelyCharacterSwapFalsePositive(const DomainInfo& navigated_domain,
const DomainInfo& matched_domain) {
DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
matched_domain.domain_and_registry));
DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
navigated_domain.domain_and_registry));
// If the only difference between the domains is the registry part, this is
// unlikely to be a spoofing attempt and we should ignore this match. E.g.
// exclude matches like google.sr and google.rs.
return navigated_domain.domain_without_registry ==
matched_domain.domain_without_registry;
}
bool GetMatchingDomain(
const DomainInfo& navigated_domain,
const std::vector<DomainInfo>& engaged_sites,
const LookalikeTargetAllowlistChecker& in_target_allowlist,
const reputation::SafetyTipsConfig* config_proto,
std::string* matched_domain,
LookalikeUrlMatchType* match_type) {
DCHECK(!navigated_domain.domain_and_registry.empty());
DCHECK(matched_domain);
DCHECK(match_type);
if (navigated_domain.idn_result.has_idn_component) {
// If the navigated domain is IDN, check its skeleton against engaged sites
// and top domains.
const std::string matched_engaged_domain =
GetMatchingSiteEngagementDomain(engaged_sites, navigated_domain);
DCHECK_NE(navigated_domain.domain_and_registry, matched_engaged_domain);
if (!matched_engaged_domain.empty()) {
*matched_domain = matched_engaged_domain;
*match_type = LookalikeUrlMatchType::kSkeletonMatchSiteEngagement;
return true;
}
if (!navigated_domain.idn_result.matching_top_domain.domain.empty()) {
// In practice, this is not possible since the top domain list does not
// contain IDNs, so domain_and_registry can't both have IDN and be a top
// domain. Still, sanity check in case the top domain list changes in the
// future.
// At this point, navigated domain should not be a top domain.
DCHECK_NE(navigated_domain.domain_and_registry,
navigated_domain.idn_result.matching_top_domain.domain);
*matched_domain = navigated_domain.idn_result.matching_top_domain.domain;
*match_type =
navigated_domain.idn_result.matching_top_domain.is_top_bucket
? LookalikeUrlMatchType::kSkeletonMatchTop500
: LookalikeUrlMatchType::kSkeletonMatchTop5k;
return true;
}
}
if (url_formatter::top_domains::IsEditDistanceCandidate(
navigated_domain.domain_and_registry)) {
// If we can't find an exact top domain or an engaged site, try to find an
// engaged domain within an edit distance of one or a single character swap.
if (GetSimilarDomainFromEngagedSites(navigated_domain, engaged_sites,
in_target_allowlist, matched_domain,
match_type)) {
DCHECK_NE(navigated_domain.domain_and_registry, *matched_domain);
return true;
}
// Finally, try to find a top domain within an edit distance or character
// swap of one.
if (GetSimilarDomainFromTopBucket(navigated_domain, in_target_allowlist,
matched_domain, match_type)) {
DCHECK_NE(navigated_domain.domain_and_registry, *matched_domain);
DCHECK(!matched_domain->empty());
return true;
}
}
TargetEmbeddingType embedding_type =
GetTargetEmbeddingType(navigated_domain.hostname, engaged_sites,
in_target_allowlist, config_proto, matched_domain);
if (embedding_type == TargetEmbeddingType::kSafetyTip) {
*match_type = LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips;
return true;
} else if (embedding_type == TargetEmbeddingType::kInterstitial) {
*match_type = LookalikeUrlMatchType::kTargetEmbedding;
return true;
}
// If none of the previous heuristics work, check it for Combo Squatting.
ComboSquattingType combo_squatting_type =
GetComboSquattingType(navigated_domain, engaged_sites, matched_domain);
if (combo_squatting_type == ComboSquattingType::kHardCoded) {
*match_type = LookalikeUrlMatchType::kComboSquatting;
DCHECK(!matched_domain->empty());
return true;
} else if (combo_squatting_type == ComboSquattingType::kSiteEngagement) {
*match_type = LookalikeUrlMatchType::kComboSquattingSiteEngagement;
DCHECK(!matched_domain->empty());
return true;
}
DCHECK(embedding_type == TargetEmbeddingType::kNone);
DCHECK(combo_squatting_type == ComboSquattingType::kNone);
return false;
}
void RecordUMAFromMatchType(LookalikeUrlMatchType match_type,
bool is_incognito) {
std::optional<NavigationSuggestionEvent> event =
ToNavigationSuggestionEvent(match_type);
if (event) {
if (is_incognito) {
UMA_HISTOGRAM_ENUMERATION(lookalikes::kIncognitoInterstitialHistogramName,
*event);
} else {
UMA_HISTOGRAM_ENUMERATION(lookalikes::kInterstitialHistogramName, *event);
}
}
}
TargetEmbeddingType GetTargetEmbeddingType(
const std::string& hostname,
const std::vector<DomainInfo>& engaged_sites,
const LookalikeTargetAllowlistChecker& in_target_allowlist,
const reputation::SafetyTipsConfig* config_proto,
std::string* safe_hostname) {
// Because of how target embeddings are detected (i.e. by sweeping the URL
// from back to front), we're guaranteed to find tail-embedding before other
// target embedding. Tail embedding triggers a safety tip, but interstitials
// are more important than safety tips, so if we find a safety tippable
// embedding with SearchForEmbeddings, go search again not permitting safety
// tips to see if we can also find an interstitiallable embedding.
auto result = SearchForEmbeddings(
hostname, engaged_sites, in_target_allowlist, config_proto,
/*safety_tips_allowed=*/true, safe_hostname);
if (result == TargetEmbeddingType::kSafetyTip) {
std::string no_st_safe_hostname;
auto no_st_result = SearchForEmbeddings(
hostname, engaged_sites, in_target_allowlist, config_proto,
/*safety_tips_allowed=*/false, &no_st_safe_hostname);
if (no_st_result == TargetEmbeddingType::kNone) {
return result;
}
*safe_hostname = no_st_safe_hostname;
return no_st_result;
}
return result;
}
TargetEmbeddingType SearchForEmbeddings(
const std::string& hostname,
const std::vector<DomainInfo>& engaged_sites,
const LookalikeTargetAllowlistChecker& in_target_allowlist,
const reputation::SafetyTipsConfig* config_proto,
bool safety_tips_allowed,
std::string* safe_hostname) {
const std::string embedding_domain = GetETLDPlusOne(hostname);
const std::vector<std::string_view> hostname_tokens =
SplitDomainIntoTokens(hostname);
// There are O(n^2) potential target embeddings in a domain name. We want to
// be comprehensive, but optimize so that usually we needn't check all of
// them. We do that by sweeping from the back of the embedding domain, towards
// the front, checking for a valid eTLD. If we find one, then we consider the
// possible embedded domains that end in that eTLD (i.e. all possible start
// points from the beginning of the string onward).
for (size_t end = hostname_tokens.size(); end > 0; --end) {
base::span<const std::string_view> UNSAFE_TODO(
etld_check_span(hostname_tokens.data(), end));
std::string etld_check_host = base::JoinString(etld_check_span, ".");
auto etld_check_dominfo = GetDomainInfo(etld_check_host);
// Check if the final token is a no-separator target (e.g. "googlecom").
// This check happens first so that we can exclude invalid eTLD+1s next.
std::string embedded_target =
GetMatchingTopDomainWithoutSeparators(hostname_tokens[end - 1]);
if (!embedded_target.empty()) {
// Extract the full possibly-spoofed domain. To get this, we take the
// hostname up until this point, strip off the no-separator bit (e.g.
// googlecom) and then re-add the the separated version (e.g. google.com).
auto spoofed_domain =
etld_check_host.substr(
0, etld_check_host.length() - hostname_tokens[end - 1].length()) +
embedded_target;
const auto no_separator_tokens = base::SplitStringPiece(
spoofed_domain, kTargetEmbeddingSeparators, base::TRIM_WHITESPACE,
base::SPLIT_WANT_NONEMPTY);
auto no_separator_dominfo = GetDomainInfo(embedded_target);
// Only flag on domains that are long enough, don't use common words, and
// aren't target-allowlisted.
if (no_separator_dominfo.domain_without_registry.length() >
kMinE2LDLengthForTargetEmbedding &&
!IsAllowedToBeEmbedded(no_separator_dominfo, no_separator_tokens,
in_target_allowlist, embedding_domain,
config_proto)) {
*safe_hostname = embedded_target;
return TargetEmbeddingType::kInterstitial;
}
}
// Exclude otherwise-invalid eTLDs.
if (etld_check_dominfo.domain_without_registry.empty()) {
continue;
}
// Exclude e2LDs that are too short. <= because domain_without_registry has
// a trailing ".".
if (etld_check_dominfo.domain_without_registry.length() <=
kMinE2LDLengthForTargetEmbedding) {
continue;
}
// Check for exact matches against engaged sites, among all possible
// subdomains ending at |end|.
for (size_t start = 0; start < end - 1; ++start) {
const base::span<const std::string_view> UNSAFE_TODO(
span(hostname_tokens.data() + start, end - start));
auto embedded_hostname = base::JoinString(span, ".");
auto embedded_dominfo = GetDomainInfo(embedded_hostname);
for (auto& engaged_site : engaged_sites) {
if (engaged_site.hostname == embedded_dominfo.hostname &&
!IsAllowedToBeEmbedded(embedded_dominfo, span, in_target_allowlist,
embedding_domain, config_proto)) {
*safe_hostname = engaged_site.hostname;
// Tail-embedding (e.g. evil-google.com, where the embedding happens
// at the very end of the hostname) is a safety tip, but only when
// safety tips are allowed. If it's tail embedding but we can't create
// a safety tip, keep looking. Non-tail-embeddings are interstitials.
if (end != hostname_tokens.size()) {
return TargetEmbeddingType::kInterstitial;
} else if (safety_tips_allowed) {
return TargetEmbeddingType::kSafetyTip;
} // else keep searching.
}
}
}
// There were no exact engaged site matches, but there may yet still be a
// match against the eTLD+1 of an engaged or top site.
if (DoesETLDPlus1MatchTopDomainOrEngagedSite(
etld_check_dominfo, engaged_sites, safe_hostname) &&
!IsAllowedToBeEmbedded(etld_check_dominfo, etld_check_span,
in_target_allowlist, embedding_domain,
config_proto)) {
// Tail-embedding (e.g. evil-google.com, where the embedding happens at
// the very end of the hostname) is a safety tip, but only when safety
// tips are allowed. If it's tail embedding but we can't create a safety
// tip, keep looking. Non-tail-embeddings are interstitials.
if (end != hostname_tokens.size()) {
return TargetEmbeddingType::kInterstitial;
} else if (safety_tips_allowed) {
return TargetEmbeddingType::kSafetyTip;
} // else keep searching.
}
}
return TargetEmbeddingType::kNone;
}
bool IsASCII(UChar32 codepoint) {
return !(codepoint & ~0x7F);
}
// Returns true if |codepoint| has emoji related properties.
bool IsEmojiRelatedCodepoint(UChar32 codepoint) {
return u_hasBinaryProperty(codepoint, UCHAR_EMOJI) ||
// Characters that have emoji presentation by default (e.g. hourglass)
u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION) ||
// Characters displayed as country flags when used as a valid pair.
// E.g. Regional Indicator Symbol Letter B used once in a string
// is rendered as 🇧, used twice is rendered as the flag of Barbados
// (with country code BB). It's therefore possible to come up with
// a spoof using regional indicator characters as text, but these
// domain names will be readily punycoded and detecting pairs isn't
// easy so we keep the code simple here.
u_hasBinaryProperty(codepoint, UCHAR_REGIONAL_INDICATOR) ||
// Pictographs such as Black Cross On Shield (U+26E8).
u_hasBinaryProperty(codepoint, UCHAR_EXTENDED_PICTOGRAPHIC);
}
// Returns true if |text| contains only ASCII characters, pictographs
// or emojis. This check is only used to determine if a domain that already
// failed spoof checks should be blocked by an interstitial. Ideally, we would
// check this for non-ASCII scripts as well (e.g. Cyrillic + emoji), but such
// usage isn't common.
bool IsASCIIAndEmojiOnly(std::u16string_view text) {
for (base::i18n::UTF16CharIterator iter(text); !iter.end(); iter.Advance()) {
const UChar32 codepoint = iter.get();
if (!IsASCII(codepoint) && !IsEmojiRelatedCodepoint(codepoint)) {
return false;
}
}
return true;
}
// Returns true if the e2LD of domain is long enough to display a punycode
// interstitial.
bool IsPunycodeInterstitialCandidate(const DomainInfo& domain) {
const url_formatter::IDNConversionResult idn_result =
url_formatter::UnsafeIDNToUnicodeWithDetails(
domain.domain_without_registry);
return idn_result.result.size() >=
kMinimumE2LDLengthToShowPunycodeInterstitial;
}
bool ShouldBlockBySpoofCheckResult(const DomainInfo& navigated_domain) {
if (IsUnsafeLigature(navigated_domain)) {
return true;
}
// Here, only a subset of spoof checks that cause an IDN to fallback to
// punycode are configured to show an interstitial.
switch (navigated_domain.idn_result.spoof_check_result) {
case url_formatter::IDNSpoofCheckerResult::kNone:
case url_formatter::IDNSpoofCheckerResult::kSafe:
return false;
case url_formatter::IDNSpoofCheckerResult::kICUSpoofChecks:
// If the eTLD+1 contains only a mix of ASCII + Emoji, allow.
return !IsASCIIAndEmojiOnly(navigated_domain.idn_result.result) &&
IsPunycodeInterstitialCandidate(navigated_domain);
case url_formatter::IDNSpoofCheckerResult::kDeviationCharacters:
// Failures because of deviation characters, especially ß, is common.
return false;
case url_formatter::IDNSpoofCheckerResult::kTLDSpecificCharacters:
case url_formatter::IDNSpoofCheckerResult::kUnsafeMiddleDot:
case url_formatter::IDNSpoofCheckerResult::kWholeScriptConfusable:
case url_formatter::IDNSpoofCheckerResult::kDigitLookalikes:
case url_formatter::IDNSpoofCheckerResult::
kNonAsciiLatinCharMixedWithNonLatin:
case url_formatter::IDNSpoofCheckerResult::kDangerousPattern:
return IsPunycodeInterstitialCandidate(navigated_domain);
}
}
bool IsAllowedByEnterprisePolicy(const PrefService* pref_service,
const GURL& url) {
const base::Value::List& list =
pref_service->GetList(prefs::kLookalikeWarningAllowlistDomains);
for (const auto& domain_val : list) {
const std::string& domain = domain_val.GetString();
if (url.DomainIs(domain)) {
return true;
}
}
return false;
}
void SetEnterpriseAllowlistForTesting(PrefService* pref_service,
const std::vector<std::string>& hosts) {
base::Value::List list;
for (const auto& host : hosts) {
list.Append(host);
}
pref_service->SetList(prefs::kLookalikeWarningAllowlistDomains,
std::move(list));
}
bool HasOneCharacterSwap(const std::u16string& str1,
const std::u16string& str2) {
if (str1.size() != str2.size()) {
return false;
}
if (str1 == str2) {
return false;
}
bool has_swap = false;
std::u16string::const_iterator i = str1.begin();
std::u16string::const_iterator j = str2.begin();
while (i != str1.end()) {
DCHECK(j < str2.end());
wchar_t left1 = *i;
wchar_t right1 = *j;
i++;
j++;
if (left1 == right1) {
continue;
}
wchar_t left2 = *i;
wchar_t right2 = *j;
if (!has_swap && (left1 == right2 && right1 == left2)) {
has_swap = true;
i++;
j++;
continue;
}
// Either there are multiple swaps, or strings have completely different
// characters.
return false;
}
return has_swap;
}
void SetTopBucketDomainsParamsForTesting(const TopBucketDomainsParams& params) {
*GetTopDomainParams() = params;
}
void ResetTopBucketDomainsParamsForTesting() {
TopBucketDomainsParams* params = GetTopDomainParams();
*params = {top_bucket_domains::kTopBucketEditDistanceSkeletons,
top_bucket_domains::kNumTopBucketEditDistanceSkeletons};
}
bool IsHeuristicEnabledForHostname(
const reputation::SafetyTipsConfig* config_proto,
const reputation::HeuristicLaunchConfig::Heuristic heuristic,
const std::string& lookalike_etld_plus_one,
version_info::Channel channel) {
DCHECK(!lookalike_etld_plus_one.empty());
if (!config_proto) {
return false;
}
base::SHA1Digest hash =
base::SHA1Hash(base::as_byte_span(lookalike_etld_plus_one));
float cohort = hash[0u] / 2.56;
for (const reputation::HeuristicLaunchConfig& config :
config_proto->launch_config()) {
if (heuristic == config.heuristic()) {
switch (channel) {
// Enable by default on local builds.
case version_info::Channel::UNKNOWN:
return true;
// Use pre-defined launch percentages for Canary/Dev and Beta. Use the
// launch percentage from config for Stable.
case version_info::Channel::CANARY:
case version_info::Channel::DEV:
return kDefaultLaunchPercentageOnCanaryDev > cohort;
case version_info::Channel::BETA:
return kDefaultLaunchPercentageOnBeta > cohort;
case version_info::Channel::STABLE:
return config.launch_percentage() > cohort;
}
}
}
return false;
}
void SetComboSquattingParamsForTesting(const ComboSquattingParams& params) {
*GetComboSquattingParams() = params;
}
void ResetComboSquattingParamsForTesting() {
ComboSquattingParams* params = GetComboSquattingParams();
*params = {kBrandNamesForCSQ, kSkeletonsOfPopularKeywordsForCSQ};
}
ComboSquattingType GetComboSquattingType(
const DomainInfo& navigated_domain,
const std::vector<DomainInfo>& engaged_sites,
std::string* matched_domain) {
const ComboSquattingParams* combo_squatting_params =
GetComboSquattingParams();
// First check Combo Squatting with hard coded brand names.
std::vector<std::pair<std::string, std::string>> brand_names;
for (auto* it : combo_squatting_params->brand_names) {
brand_names.emplace_back(std::string(it[0]),
std::string(UNSAFE_TODO(it[1])));
}
if (IsComboSquatting(brand_names, *combo_squatting_params, navigated_domain,
engaged_sites, matched_domain,
/*is_hard_coded=*/true)) {
return ComboSquattingType::kHardCoded;
}
// Then check Combo Squatting with brand names in engaged sites.
brand_names = GetBrandNamesFromEngagedSites(engaged_sites);
if (IsComboSquatting(brand_names, *combo_squatting_params, navigated_domain,
engaged_sites, matched_domain,
/*is_hard_coded=*/false)) {
return ComboSquattingType::kSiteEngagement;
}
return ComboSquattingType::kNone;
}
bool IsSafeTLD(const std::string& hostname) {
// This is intentionally kept simple and currently ignores hostnames with
// ccTLDs (e.g. gov.in).
return base::EndsWith(hostname, ".gov") || base::EndsWith(hostname, ".mil");
}
LookalikeActionType GetActionForMatchType(
const reputation::SafetyTipsConfig* config,
version_info::Channel channel,
const std::string& etld_plus_one,
LookalikeUrlMatchType match_type) {
switch (match_type) {
case LookalikeUrlMatchType::kEditDistance:
// Edit distance is too noisy, just record metrics.
return LookalikeActionType::kRecordMetrics;
case LookalikeUrlMatchType::kEditDistanceSiteEngagement:
return LookalikeActionType::kShowSafetyTip;
case LookalikeUrlMatchType::kTargetEmbedding:
#if BUILDFLAG(IS_IOS)
// TODO(crbug.com/40705070): Only enable target embedding on iOS once we
// can
// check engaged sites. Otherwise, false positives are too high.
return LookalikeActionType::kRecordMetrics;
#else
return LookalikeActionType::kShowInterstitial;
#endif
case LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips:
return LookalikeActionType::kShowSafetyTip;
case LookalikeUrlMatchType::kSkeletonMatchTop5k:
return LookalikeActionType::kShowSafetyTip;
case LookalikeUrlMatchType::kFailedSpoofChecks:
return LookalikeActionType::kShowInterstitial;
case LookalikeUrlMatchType::kSkeletonMatchSiteEngagement:
case LookalikeUrlMatchType::kSkeletonMatchTop500:
return LookalikeActionType::kShowInterstitial;
case LookalikeUrlMatchType::kCharacterSwapSiteEngagement:
case LookalikeUrlMatchType::kCharacterSwapTop500:
return LookalikeActionType::kShowSafetyTip;
case LookalikeUrlMatchType::kComboSquatting:
return IsHeuristicEnabledForHostname(
config,
reputation::HeuristicLaunchConfig::
HEURISTIC_COMBO_SQUATTING_TOP_DOMAINS,
etld_plus_one, channel)
? LookalikeActionType::kShowSafetyTip
: LookalikeActionType::kRecordMetrics;
case LookalikeUrlMatchType::kComboSquattingSiteEngagement:
return IsHeuristicEnabledForHostname(
config,
reputation::HeuristicLaunchConfig::
HEURISTIC_COMBO_SQUATTING_ENGAGED_SITES,
etld_plus_one, channel)
? LookalikeActionType::kShowSafetyTip
: LookalikeActionType::kRecordMetrics;
case LookalikeUrlMatchType::kNone:
NOTREACHED();
}
NOTREACHED();
}
GURL GetSuggestedURL(LookalikeUrlMatchType match_type,
const GURL& navigated_url,
const std::string& matched_hostname) {
// matched_hostname can be a top domain or an engaged domain. Simply use its
// eTLD+1 as the suggested domain.
// 1. If matched_hostname is a top domain: Top domain list already contains
// eTLD+1s only so this works well.
// 2. If matched_hostname is an engaged domain and is not an eTLD+1, don't
// suggest it. Otherwise, navigating to googlé.com and having engaged with
// docs.google.com would suggest docs.google.com.
//
// When the navigated and matched domains are not eTLD+1s (e.g.
// docs.googlé.com and docs.google.com), this will suggest google.com
// instead of docs.google.com. This is less than ideal, but has two
// benefits:
// - Simpler code
// - Fewer suggestions to non-existent domains. E.g. When the navigated
// domain is nonexistent.googlé.com and the matched domain is
// docs.google.com, we will suggest google.com instead of
// nonexistent.google.com.
std::string suggested_domain = GetETLDPlusOne(matched_hostname);
DCHECK(!suggested_domain.empty());
// Drop everything but the parts of the origin.
GURL::Replacements replace_host;
replace_host.SetHostStr(suggested_domain);
GURL suggested_url =
navigated_url.ReplaceComponents(replace_host).GetWithEmptyPath();
// Use https for top domain matches.
// TODO(crbug.com/40755923): If the match is against an engaged site, use the
// scheme of the engaged site instead.
if (suggested_url.SchemeIs(url::kHttpScheme) &&
suggested_url.IntPort() == url::PORT_UNSPECIFIED &&
(match_type == LookalikeUrlMatchType::kEditDistance ||
match_type == LookalikeUrlMatchType::kSkeletonMatchTop500 ||
match_type == LookalikeUrlMatchType::kSkeletonMatchTop5k)) {
GURL::Replacements replace_scheme;
replace_scheme.SetSchemeStr(url::kHttpsScheme);
suggested_url = suggested_url.ReplaceComponents(replace_scheme);
}
return suggested_url;
}
} // namespace lookalikes