| // Copyright 2020 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "components/lookalikes/core/lookalike_url_util.h" |
| |
| #include <algorithm> |
| #include <string_view> |
| #include <utility> |
| |
| #include "base/compiler_specific.h" |
| #include "base/containers/contains.h" |
| #include "base/functional/callback.h" |
| #include "base/hash/sha1.h" |
| #include "base/i18n/char_iterator.h" |
| #include "base/metrics/histogram_macros.h" |
| #include "base/no_destructor.h" |
| #include "base/strings/strcat.h" |
| #include "base/strings/string_split.h" |
| #include "base/strings/string_util.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "base/trace_event/trace_event.h" |
| #include "base/values.h" |
| #include "build/build_config.h" |
| #include "components/lookalikes/core/safety_tips_config.h" |
| #include "components/security_interstitials/core/pref_names.h" |
| #include "components/url_formatter/spoof_checks/common_words/common_words_util.h" |
| #include "components/url_formatter/spoof_checks/top_domains/top_bucket_domains.h" |
| #include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h" |
| #include "components/url_formatter/url_formatter.h" |
| #include "net/base/registry_controlled_domains/registry_controlled_domain.h" |
| #include "net/base/url_util.h" |
| #include "third_party/icu/source/common/unicode/uchar.h" |
| #include "third_party/icu/source/common/unicode/utypes.h" |
| |
| using lookalikes::ComboSquattingParams; |
| using lookalikes::DomainInfo; |
| using lookalikes::GetDomainInfo; |
| using lookalikes::HasOneCharacterSwap; |
| using lookalikes::IsEditDistanceAtMostOne; |
| using lookalikes::LookalikeTargetAllowlistChecker; |
| using lookalikes::LookalikeUrlMatchType; |
| using lookalikes::NavigationSuggestionEvent; |
| using lookalikes::TopBucketDomainsParams; |
| |
| namespace { |
| |
// Decimal digits. Used when trimming domains for Edit Distance heuristic
// matches: domains that differ only by trailing digits (e.g. a1.tld and
// a2.tld) are ignored.
constexpr char kDigitChars[] = "0123456789";

// Shortest e2LD that is protected against target embedding. For example,
// foo.bar.baz.com-evil.com embeds foo.bar.baz.com, but it is not flagged
// because "baz" is shorter than kMinE2LDLengthForTargetEmbedding.
constexpr size_t kMinE2LDLengthForTargetEmbedding = 4;

// Depending on the TLD it is paired with, a domain whose e2LD is a common word
// might not be protected against target embedding. This list supplements the
// words known to url_formatter::common_words::IsCommonWord().
constexpr const char* kLocalAdditionalCommonWords[] = {
    "asahi",
    "hoteles",
    "jharkhand",
    "nifty",
};

// Plausible lookalike targets whose names are nevertheless common words.
// Embeddings where the embedder ends in "-DOMAIN.TLD" are selectively not
// flagged for these, since such embeddings tend to have higher false positive
// rates.
constexpr const char* kDomainsPermittedInEndEmbeddings[] = {
    "office.com",
    "medium.com",
    "orange.fr",
};

// Characters that may separate tokens in a target embedding spoof, e.g.
// www-google.com.example.com uses "-" (www-google) and "." (google.com).
constexpr char kTargetEmbeddingSeparators[] = "-.";

// A small subset of private registries on the PSL that behave like public
// registries AND are a common source of false positives in lookalike checks.
// Lookalike checks treat them as public.
constexpr const char* kPrivateRegistriesTreatedAsPublic[] = {
    "com.de",
    "com.se",
};
| |
| TopBucketDomainsParams* GetTopDomainParams() { |
| static TopBucketDomainsParams params{ |
| top_bucket_domains::kTopBucketEditDistanceSkeletons, |
| top_bucket_domains::kNumTopBucketEditDistanceSkeletons}; |
| return ¶ms; |
| } |
| |
// Shortest eTLD+1-without-registry for which the punycode interstitial is
// shown. IDNs whose eTLD+1 without registry is shorter than this are still
// displayed in punycode, but do not trigger an interstitial.
constexpr size_t kMinimumE2LDLengthToShowPunycodeInterstitial = 2;

// Default launch percentages of a new heuristic on Canary/Dev and on Beta.
// These are used if there is a launch config for the heuristic in the proto.
constexpr int kDefaultLaunchPercentageOnCanaryDev = 90;
constexpr int kDefaultLaunchPercentageOnBeta = 50;
| |
// Brand names (and their skeletons) used by the Combo Squatting heuristic,
// which checks combinations of brand names and popular keywords, e.g.
// google-login.com or youtubesecure.com. The list is manually curated using
// Chrome metrics.
//
// For every brand name, brand_name[.]com should be checked to be valid: if no
// matched domain is found in top domains, brand_name[.]com is suggested to the
// user for navigation. If brand_name[.]com is not valid for some brand name,
// each brand name should be mapped to a valid url manually and the data
// structure of `kBrandNamesForCSQ` should be changed accordingly.
//
// In each element, the FIRST string is the original brand name and the SECOND
// string is its skeleton. Entries are kept in alphabetical order of the
// original brand name. When adding a brand name, generate its skeleton with
// the format_url binary (components/url_formatter/tools/format_url.cc).
// TODO(crbug.com/40855941): Generate skeletons of hard coded brand names in
// Chrome initialization and remove manual adding of skeletons to this list.
constexpr std::string_view kBrandNamesForCSQ[][2] = {
    {"adobe", "adobe"},
    {"airbnb", "airbnb"},
    {"alibaba", "alibaba"},
    {"aliexpress", "aliexpress"},
    {"amazon", "arnazon"},
    {"baidu", "baidu"},
    {"bestbuy", "bestbuy"},
    {"blogspot", "blogspot"},
    {"costco", "costco"},
    {"craigslist", "craigslist"},
    {"dropbox", "dropbox"},
    {"expedia", "expedia"},
    {"facebook", "facebook"},
    {"fedex", "fedex"},
    {"flickr", "flickr"},
    {"github", "github"},
    {"glassdoor", "glassdoor"},
    {"gofundme", "gofundrne"},
    {"google", "google"},
    {"homedepot", "hornedepot"},
    {"icloud", "icloud"},
    {"indeed", "indeed"},
    {"instagram", "instagrarn"},
    {"intuit", "intuit"},
    {"microsoft", "rnicrosoft"},
    {"nbcnews", "nbcnews"},
    {"netflix", "netflix"},
    {"norton", "norton"},
    {"nytimes", "nytirnes"},
    {"office365", "office365"},
    {"paypal", "paypal"},
    {"pinterest", "pinterest"},
    {"playstation", "playstation"},
    {"quora", "quora"},
    {"reddit", "reddit"},
    {"reuters", "reuters"},
    {"samsung", "sarnsung"},
    {"spotify", "spotify"},
    {"stackexchange", "stackexchange"},
    {"stackoverflow", "stackoverflow"},
    {"trello", "trello"},
    {"twitch", "twitch"},
    {"twitter", "twitter"},
    // Original name first, skeleton ("m" -> "rn") second, like every other row.
    {"udemy", "uderny"},
    {"wikipedia", "wikipedia"},
    {"wordpress", "wordpress"},
    {"xfinity", "xfinity"},
    {"yahoo", "yahoo"},
    {"youtube", "youtube"},
    {"zillow", "zillow"}};
| |
// Skeletons of popular keywords used by the Combo Squatting heuristic. Unlike
// `kBrandNamesForCSQ`, only the skeleton of each keyword is stored: original
// brand names are needed to generate the matched domain, whereas original
// keywords are not needed for that process. When adding a keyword, generate
// its skeleton with the format_url binary
// (components/url_formatter/tools/format_url.cc).
constexpr std::string_view kSkeletonsOfPopularKeywordsForCSQ[] = {
    // Security
    "account",
    "activate",
    "adrnin",
    "coin",
    "crypto",
    "login",
    "logout",
    "password",
    "secure",
    "security",
    "signin",
    "signout",
    "wallet",
};
| |
// Length threshold for brand names checked by Combo Squatting. Note that
// IsComboSquattingCandidate() requires a brand to be strictly longer than this.
constexpr size_t kMinBrandNameLengthForComboSquatting = 4;
| |
| ComboSquattingParams* GetComboSquattingParams() { |
| static base::NoDestructor<ComboSquattingParams> params( |
| {kBrandNamesForCSQ, kSkeletonsOfPopularKeywordsForCSQ}); |
| return params.get(); |
| } |
| |
| bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1, |
| const url_formatter::Skeletons& skeletons2) { |
| DCHECK(!skeletons1.empty()); |
| DCHECK(!skeletons2.empty()); |
| for (const std::string& skeleton1 : skeletons1) { |
| if (base::Contains(skeletons2, skeleton1)) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| // Returns a site that the user has used before that the eTLD+1 in |
| // |domain_and_registry| may be attempting to spoof, based on skeleton |
| // comparison. |
| std::string GetMatchingSiteEngagementDomain( |
| const std::vector<DomainInfo>& engaged_sites, |
| const DomainInfo& navigated_domain) { |
| DCHECK(!navigated_domain.domain_and_registry.empty()); |
| for (const DomainInfo& engaged_site : engaged_sites) { |
| DCHECK(!engaged_site.domain_and_registry.empty()); |
| if (SkeletonsMatch(navigated_domain.skeletons, engaged_site.skeletons)) { |
| return engaged_site.domain_and_registry; |
| } |
| } |
| return std::string(); |
| } |
| |
| // Scans the top sites list and returns true if it finds a domain with an edit |
| // distance or character swap of one to |domain_and_registry|. This search is |
| // done in lexicographic order on the top 500 suitable domains, instead of in |
| // order by popularity. This means that the resulting "similar" domain may not |
| // be the most popular domain that matches. |
| bool GetSimilarDomainFromTopBucket( |
| const DomainInfo& navigated_domain, |
| const LookalikeTargetAllowlistChecker& target_allowlisted, |
| std::string* matched_domain, |
| LookalikeUrlMatchType* match_type) { |
| TopBucketDomainsParams* top_bucket_domain_params = GetTopDomainParams(); |
| for (const std::string& navigated_skeleton : navigated_domain.skeletons) { |
| for (size_t i = 0; |
| i < top_bucket_domain_params->num_edit_distance_skeletons; i++) { |
| const char* const top_domain_skeleton = |
| UNSAFE_TODO(top_bucket_domain_params->edit_distance_skeletons[i]); |
| DCHECK(strlen(top_domain_skeleton)); |
| // Check edit distance on skeletons. |
| if (IsEditDistanceAtMostOne(base::UTF8ToUTF16(navigated_skeleton), |
| base::UTF8ToUTF16(top_domain_skeleton))) { |
| const std::string top_domain = |
| url_formatter::LookupSkeletonInTopDomains( |
| top_domain_skeleton, url_formatter::SkeletonType::kFull) |
| .domain; |
| DCHECK(!top_domain.empty()); |
| |
| if (!IsLikelyEditDistanceFalsePositive(navigated_domain, |
| GetDomainInfo(top_domain)) && |
| !target_allowlisted.Run(top_domain)) { |
| *matched_domain = top_domain; |
| *match_type = LookalikeUrlMatchType::kEditDistance; |
| return true; |
| } |
| } |
| |
| // Check character swap on skeletons. |
| // TODO(crbug.com/40707797): Also check character swap on actual hostnames |
| // with diacritics etc removed. This is because some characters have two |
| // character skeletons such as m -> rn, and this prevents us from |
| // detecting character swaps between example.com and exapmle.com. |
| if (HasOneCharacterSwap(base::UTF8ToUTF16(navigated_skeleton), |
| base::UTF8ToUTF16(top_domain_skeleton))) { |
| const std::string top_domain = |
| url_formatter::LookupSkeletonInTopDomains( |
| top_domain_skeleton, url_formatter::SkeletonType::kFull) |
| .domain; |
| DCHECK(!top_domain.empty()); |
| if (!IsLikelyCharacterSwapFalsePositive(navigated_domain, |
| GetDomainInfo(top_domain)) && |
| !target_allowlisted.Run(top_domain)) { |
| *matched_domain = top_domain; |
| *match_type = LookalikeUrlMatchType::kCharacterSwapTop500; |
| return true; |
| } |
| } |
| } |
| } |
| return false; |
| } |
| |
| // Scans the engaged site list for edit distance and character swap matches. |
| // Returns true if there is a match and fills |matched_domain| with the first |
| // matching engaged domain and |match_type| with the matching heuristic type. |
| bool GetSimilarDomainFromEngagedSites( |
| const DomainInfo& navigated_domain, |
| const std::vector<DomainInfo>& engaged_sites, |
| const LookalikeTargetAllowlistChecker& target_allowlisted, |
| std::string* matched_domain, |
| LookalikeUrlMatchType* match_type) { |
| for (const std::string& navigated_skeleton : navigated_domain.skeletons) { |
| for (const DomainInfo& engaged_site : engaged_sites) { |
| DCHECK_NE(navigated_domain.domain_and_registry, |
| engaged_site.domain_and_registry); |
| |
| if (!url_formatter::top_domains::IsEditDistanceCandidate( |
| engaged_site.domain_and_registry)) { |
| continue; |
| } |
| // Skip past domains that are allowed to be spoofed. |
| if (target_allowlisted.Run(engaged_site.domain_and_registry)) { |
| continue; |
| } |
| for (const std::string& engaged_skeleton : engaged_site.skeletons) { |
| // Check edit distance on skeletons. |
| if (IsEditDistanceAtMostOne(base::UTF8ToUTF16(navigated_skeleton), |
| base::UTF8ToUTF16(engaged_skeleton)) && |
| !IsLikelyEditDistanceFalsePositive(navigated_domain, |
| engaged_site)) { |
| *matched_domain = engaged_site.domain_and_registry; |
| *match_type = LookalikeUrlMatchType::kEditDistanceSiteEngagement; |
| return true; |
| } |
| // Check character swap on skeletons. |
| if (HasOneCharacterSwap(base::UTF8ToUTF16(navigated_skeleton), |
| base::UTF8ToUTF16(engaged_skeleton)) && |
| !IsLikelyCharacterSwapFalsePositive(navigated_domain, |
| engaged_site)) { |
| *matched_domain = engaged_site.domain_and_registry; |
| *match_type = LookalikeUrlMatchType::kCharacterSwapSiteEngagement; |
| return true; |
| } |
| } |
| } |
| } |
| |
| // Also check character swap on actual hostnames with diacritics etc removed. |
| // This is because some characters have two character skeletons such as m -> |
| // rn, and this prevents us from detecting character swaps between example.com |
| // and exapmle.com. |
| const std::u16string navigated_hostname_without_diacritics = |
| url_formatter::MaybeRemoveDiacritics(navigated_domain.idn_result.result); |
| if (navigated_hostname_without_diacritics != |
| navigated_domain.idn_result.result) { |
| for (const DomainInfo& engaged_site : engaged_sites) { |
| DCHECK_NE(navigated_domain.domain_and_registry, |
| engaged_site.domain_and_registry); |
| const std::u16string engaged_hostname_without_diacritics = |
| url_formatter::MaybeRemoveDiacritics(engaged_site.idn_result.result); |
| |
| if (HasOneCharacterSwap(navigated_hostname_without_diacritics, |
| engaged_hostname_without_diacritics)) { |
| *matched_domain = engaged_site.domain_and_registry; |
| *match_type = LookalikeUrlMatchType::kCharacterSwapSiteEngagement; |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
| std::optional<NavigationSuggestionEvent> ToNavigationSuggestionEvent( |
| LookalikeUrlMatchType match_type) { |
| switch (match_type) { |
| case LookalikeUrlMatchType::kSkeletonMatchSiteEngagement: |
| return NavigationSuggestionEvent::kMatchSiteEngagement; |
| case LookalikeUrlMatchType::kEditDistance: |
| return NavigationSuggestionEvent::kMatchEditDistance; |
| case LookalikeUrlMatchType::kEditDistanceSiteEngagement: |
| return NavigationSuggestionEvent::kMatchEditDistanceSiteEngagement; |
| case LookalikeUrlMatchType::kTargetEmbedding: |
| return NavigationSuggestionEvent::kMatchTargetEmbedding; |
| case LookalikeUrlMatchType::kSkeletonMatchTop500: |
| return NavigationSuggestionEvent::kMatchSkeletonTop500; |
| case LookalikeUrlMatchType::kSkeletonMatchTop5k: |
| return NavigationSuggestionEvent::kMatchSkeletonTop5k; |
| case LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips: |
| return NavigationSuggestionEvent::kMatchTargetEmbeddingForSafetyTips; |
| case LookalikeUrlMatchType::kFailedSpoofChecks: |
| return NavigationSuggestionEvent::kFailedSpoofChecks; |
| case LookalikeUrlMatchType::kCharacterSwapSiteEngagement: |
| return NavigationSuggestionEvent::kMatchCharacterSwapSiteEngagement; |
| case LookalikeUrlMatchType::kCharacterSwapTop500: |
| return NavigationSuggestionEvent::kMatchCharacterSwapTop500; |
| case LookalikeUrlMatchType::kComboSquatting: |
| return NavigationSuggestionEvent::kComboSquatting; |
| case LookalikeUrlMatchType::kComboSquattingSiteEngagement: |
| return NavigationSuggestionEvent::kComboSquattingSiteEngagement; |
| case LookalikeUrlMatchType::kNone: |
| return std::nullopt; |
| } |
| } |
| |
| // Returns the parts of the domain that are separated by "." or "-", not |
| // including the eTLD. |
| // |
| // |hostname| must outlive the return value since the vector contains |
| // StringPieces. |
| std::vector<std::string_view> SplitDomainIntoTokens( |
| const std::string& hostname) { |
| return base::SplitStringPiece(hostname, kTargetEmbeddingSeparators, |
| base::TRIM_WHITESPACE, |
| base::SPLIT_WANT_NONEMPTY); |
| } |
| |
| // Returns whether any subdomain ending in the last entry of |domain_labels| is |
| // allowlisted. e.g. if domain_labels = {foo,scholar,google,com}, checks the |
| // allowlist for google.com, scholar.google.com, and foo.scholar.google.com. |
| bool ASubdomainIsAllowlisted( |
| const base::span<const std::string_view>& domain_labels, |
| const LookalikeTargetAllowlistChecker& in_target_allowlist) { |
| CHECK_GT(domain_labels.size(), 1u); |
| std::string potential_hostname(domain_labels.back()); |
| // Attach each token from the end to the embedded target to check if that |
| // subdomain has been allowlisted. |
| for (size_t i = domain_labels.size() - 1; i; --i) { |
| potential_hostname = |
| base::StrCat({domain_labels[i - 1], ".", potential_hostname}); |
| if (in_target_allowlist.Run(potential_hostname)) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| // Returns the top domain if the top domain without its separators matches the |
| // |potential_target| (e.g. googlecom). The matching is a skeleton matching. |
| std::string GetMatchingTopDomainWithoutSeparators( |
| std::string_view potential_target) { |
| const url_formatter::Skeletons skeletons = |
| url_formatter::GetSkeletons(base::UTF8ToUTF16(potential_target)); |
| |
| for (const auto& skeleton : skeletons) { |
| url_formatter::TopDomainEntry matched_domain = |
| url_formatter::LookupSkeletonInTopDomains( |
| skeleton, url_formatter::SkeletonType::kSeparatorsRemoved); |
| if (!matched_domain.domain.empty() && |
| matched_domain.skeleton_type == |
| url_formatter::SkeletonType::kSeparatorsRemoved) { |
| return matched_domain.domain; |
| } |
| } |
| return std::string(); |
| } |
| |
| // Returns whether the visited domain is either for a bare eTLD+1 (e.g. |
| // 'google.com') or a trivial subdomain (e.g. 'www.google.com'). |
| bool IsETLDPlusOneOrTrivialSubdomain(const DomainInfo& host) { |
| return (host.domain_and_registry == host.hostname || |
| "www." + host.domain_and_registry == host.hostname); |
| } |
| |
| // Returns if |etld_plus_one| shares the skeleton of an eTLD+1 with an engaged |
| // site or a top bucket domain. |embedded_target| is set to matching eTLD+1. |
| bool DoesETLDPlus1MatchTopDomainOrEngagedSite( |
| const DomainInfo& domain, |
| const std::vector<DomainInfo>& engaged_sites, |
| std::string* embedded_target) { |
| for (const auto& skeleton : domain.skeletons) { |
| for (const auto& engaged_site : engaged_sites) { |
| // Skeleton matching only calculates skeletons of the eTLD+1, so only |
| // consider engaged sites that are bare eTLD+1s (or a trivial subdomain) |
| // and are a skeleton match. |
| if (IsETLDPlusOneOrTrivialSubdomain(engaged_site) && |
| base::Contains(engaged_site.skeletons, skeleton)) { |
| *embedded_target = engaged_site.domain_and_registry; |
| return true; |
| } |
| } |
| } |
| for (const auto& skeleton : domain.skeletons) { |
| const url_formatter::TopDomainEntry top_domain = |
| url_formatter::LookupSkeletonInTopDomains( |
| skeleton, url_formatter::SkeletonType::kFull); |
| if (!top_domain.domain.empty() && top_domain.is_top_bucket) { |
| *embedded_target = top_domain.domain; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| // Returns whether the e2LD of the provided domain is a common word (e.g. |
| // weather.com, ask.com). Target embeddings of these domains are often false |
| // positives (e.g. "super-best-fancy-hotels.com" isn't spoofing "hotels.com"). |
| bool UsesCommonWord(const reputation::SafetyTipsConfig* config_proto, |
| const DomainInfo& domain) { |
| // kDomainsPermittedInEndEmbeddings are based on domains with common words, |
| // but they should not be excluded here (and instead are checked later). |
| for (auto* permitted_ending : kDomainsPermittedInEndEmbeddings) { |
| if (domain.domain_and_registry == permitted_ending) { |
| return false; |
| } |
| } |
| |
| // Search for words in the big common word list. |
| if (url_formatter::common_words::IsCommonWord( |
| domain.domain_without_registry)) { |
| return true; |
| } |
| |
| // Search for words in the component-provided word list. |
| if (lookalikes::IsCommonWordInConfigProto(config_proto, |
| domain.domain_without_registry)) { |
| return true; |
| } |
| |
| // Search for words in the local word lists. |
| for (auto* common_word : kLocalAdditionalCommonWords) { |
| if (domain.domain_without_registry == common_word) { |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| // Returns whether |domain_labels| is in the same domain as embedding_domain. |
| // e.g. IsEmbeddingItself(["foo", "example", "com"], "example.com") -> true |
| // since foo.example.com is in the same domain as example.com. |
| bool IsEmbeddingItself(const base::span<const std::string_view>& domain_labels, |
| const std::string& embedding_domain) { |
| DCHECK(domain_labels.size() >= 2); |
| std::string potential_hostname(domain_labels.back()); |
| // Attach each token from the end to the embedded target to check if that |
| // subdomain is the embedding domain. (e.g. using the earlier example, check |
| // each ["com", "example.com", "foo.example.com"] against "example.com". |
| for (size_t i = domain_labels.size() - 1; i; --i) { |
| potential_hostname = |
| base::StrCat({domain_labels[i - 1], ".", potential_hostname}); |
| if (embedding_domain == potential_hostname) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| // Identical to url_formatter::top_domains::HostnameWithoutRegistry(), but |
| // respects de-facto public registries like .com.de using similar logic to |
| // GetETLDPlusOne. See kPrivateRegistriesTreatedAsPublic definition for more |
| // details. e.g. "google.com.de" returns "google". Call with an eTLD+1, not a |
| // full hostname. |
| std::string GetE2LDWithDeFactoPublicRegistries( |
| const std::string& domain_and_registry) { |
| if (domain_and_registry.empty()) { |
| return std::string(); |
| } |
| |
| size_t registry_size = |
| net::registry_controlled_domains::PermissiveGetHostRegistryLength( |
| domain_and_registry.c_str(), |
| net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES, |
| net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); |
| const size_t private_registry_size = |
| net::registry_controlled_domains::PermissiveGetHostRegistryLength( |
| domain_and_registry.c_str(), |
| net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES, |
| net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES); |
| |
| // If the registry lengths are the same using public and private registries, |
| // than this is just a public registry domain. Otherwise, we need to check if |
| // the registry ends with one of our anointed registries. |
| if (registry_size != private_registry_size) { |
| for (const auto* private_registry : kPrivateRegistriesTreatedAsPublic) { |
| if (base::EndsWith(domain_and_registry, private_registry)) { |
| registry_size = private_registry_size; |
| } |
| } |
| } |
| |
| std::string out = |
| domain_and_registry.substr(0, domain_and_registry.size() - registry_size); |
| base::TrimString(out, ".", &out); |
| return out; |
| } |
| |
| // Returns whether |embedded_target| and |embedding_domain| share the same e2LD, |
| // (as in, e.g., google.com and google.org, or airbnb.com.br and airbnb.com). |
| // Assumes |embedding_domain| is an eTLD+1. Respects de-facto public eTLDs. |
| bool IsCrossTLDMatch(const DomainInfo& embedded_target, |
| const std::string& embedding_domain) { |
| return ( |
| GetE2LDWithDeFactoPublicRegistries(embedded_target.domain_and_registry) == |
| GetE2LDWithDeFactoPublicRegistries(embedding_domain)); |
| } |
| |
| // Returns whether |embedded_target| is one of kDomainsPermittedInEndEmbeddings |
| // and that |embedding_domain| ends with that domain, e.g. "evil-office.com" is |
| // permitted, as "office.com" is in kDomainsPermittedInEndEmbeddings. Only |
| // impacts Target Embedding matches. |
| bool EndsWithPermittedDomains(const DomainInfo& embedded_target, |
| const std::string& embedding_domain) { |
| for (auto* permitted_ending : kDomainsPermittedInEndEmbeddings) { |
| if (embedded_target.domain_and_registry == permitted_ending && |
| base::EndsWith(embedding_domain, |
| base::StrCat({"-", permitted_ending}))) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| // A domain is allowed to be embedded if is embedding itself, if its e2LD is a |
| // common word, any valid partial subdomain is allowlisted, or if it's a |
| // cross-TLD match (e.g. google.com vs google.com.mx). |
| bool IsAllowedToBeEmbedded( |
| const DomainInfo& embedded_target, |
| const base::span<const std::string_view>& subdomain_span, |
| const LookalikeTargetAllowlistChecker& in_target_allowlist, |
| const std::string& embedding_domain, |
| const reputation::SafetyTipsConfig* config_proto) { |
| return UsesCommonWord(config_proto, embedded_target) || |
| ASubdomainIsAllowlisted(subdomain_span, in_target_allowlist) || |
| IsEmbeddingItself(subdomain_span, embedding_domain) || |
| IsCrossTLDMatch(embedded_target, embedding_domain) || |
| EndsWithPermittedDomains(embedded_target, embedding_domain); |
| } |
| |
| // Returns the first character of the first string that is different from the |
| // second string. Strings should be at least 1 edit distance apart. |
| char GetFirstDifferentChar(const std::string& str1, const std::string& str2) { |
| std::string::const_iterator i1 = str1.begin(); |
| std::string::const_iterator i2 = str2.begin(); |
| while (i1 != str1.end() && i2 != str2.end()) { |
| if (*i1 != *i2) { |
| return *i1; |
| } |
| i1++; |
| i2++; |
| } |
| NOTREACHED(); |
| } |
| |
| // Brand names with length of 4 or less should not be checked in domains for |
| // Combo Squatting. Short brand names can cause false positives in results. |
| bool IsComboSquattingCandidate(const std::string& brand) { |
| return brand.size() > kMinBrandNameLengthForComboSquatting; |
| } |
| |
| // Extract brand names from engaged sites to be checked for Combo Squatting, if |
| // the brand is not one of the hard coded brand names. |
| std::vector<std::pair<std::string, std::string>> GetBrandNamesFromEngagedSites( |
| const std::vector<DomainInfo>& engaged_sites) { |
| std::vector<std::pair<std::string, std::string>> output; |
| |
| for (const DomainInfo& engaged_site : engaged_sites) { |
| url_formatter::Skeletons domain_without_registry_skeletons = |
| engaged_site.domain_without_registry_skeletons; |
| for (const std::string& skeleton : domain_without_registry_skeletons) |
| if (IsComboSquattingCandidate(engaged_site.domain_without_registry)) { |
| std::pair<std::string, std::string> brand_name = { |
| engaged_site.domain_without_registry, skeleton}; |
| output.emplace_back(brand_name); |
| } |
| } |
| return output; |
| } |
| |
| // Registry of the navigated domain is needed to find matched_domain |
| // in Combo Squatting domains. For example, registry of |
| // `google-login[.]co[.]br` is `co[.]br`. |
| std::string GetRegistry(const DomainInfo& navigated_domain) { |
| size_t registry_size = navigated_domain.domain_and_registry.size() - |
| navigated_domain.domain_without_registry.size() - 1; |
| |
| std::string domain_and_registry = navigated_domain.domain_and_registry; |
| std::string registry = |
| domain_and_registry.substr(domain_and_registry.size() - registry_size, |
| domain_and_registry.size() - 1); |
| return registry; |
| } |
| |
| // If a matched domain including the brand name and TLD of |
| // navigated domain is found in top domains, |matched_domain| |
| // is set to the found top domain. Otherwise, |matched_domain| will |
| // be set to brand_name[.]com. Hard coded brand names should be checked to have |
| // valid brand_name[.]com url. |
| std::string FindMatchedDomainForHardCodedComboSquatting( |
| const std::string& brand_name, |
| const DomainInfo& navigated_domain) { |
| DomainInfo suggested_matched_domain = |
| GetDomainInfo(brand_name + '.' + GetRegistry(navigated_domain)); |
| if (url_formatter::IsDomainAndRegistryATopDomain( |
| suggested_matched_domain.domain_and_registry)) { |
| return suggested_matched_domain.hostname; |
| } else { |
| return brand_name + ".com"; |
| } |
| } |
| |
| // Engaged sites are sorted based on engagement score, so |matched_domain| |
| // will be set to the first domain in the engaged sites lists that includes |
| // the brand name of the navigated domain. |
| std::string FindMatchedDomainForSiteEngagementComboSquatting( |
| const std::string& brand_name, |
| const DomainInfo& navigated_domain, |
| const std::vector<DomainInfo>& engaged_sites) { |
| for (auto& engaged_site : engaged_sites) { |
| if (brand_name == engaged_site.domain_without_registry) { |
| return engaged_site.hostname; |
| } |
| } |
| return std::string(); |
| } |
| |
| // Returns true if the navigated_domain is flagged as Combo Squatting. |
| // matched_domain is the suggested domain that will be shown to the user |
| // instead of the navigated_domain in the warning UI. |
| bool IsComboSquatting( |
| const std::vector<std::pair<std::string, std::string>>& brand_names, |
| const ComboSquattingParams& combo_squatting_params, |
| const DomainInfo& navigated_domain, |
| const std::vector<DomainInfo>& engaged_sites, |
| std::string* matched_domain, |
| bool is_hard_coded) { |
| // Check if the domain has any brand name and any popular keyword. |
| for (auto& brand : brand_names) { |
| auto brand_name = brand.first; |
| auto brand_skeleton = brand.second; |
| DCHECK(IsComboSquattingCandidate(brand_name)); |
| for (auto& skeleton : navigated_domain.domain_without_registry_skeletons) { |
| size_t brand_skeleton_pos = skeleton.find(brand_skeleton); |
| if (skeleton.size() == brand_skeleton.size() || |
| brand_skeleton_pos == std::string::npos) { |
| continue; |
| } |
| |
| for (auto keyword : combo_squatting_params.popular_keywords) { |
| size_t keyword_pos = skeleton.find(keyword); |
| if (keyword_pos == std::string::npos) { |
| // Keyword not found, ignore. |
| continue; |
| } |
| |
| if (std::string(brand_skeleton).find(keyword) != std::string::npos || |
| std::string(keyword).find(brand_skeleton) != std::string::npos) { |
| // Keyword is a substring of brand or vice versa, ignore. |
| continue; |
| } |
| |
| if ((keyword_pos > brand_skeleton_pos && |
| keyword_pos < brand_skeleton_pos + brand_skeleton.size()) || |
| (brand_skeleton_pos > keyword_pos && |
| brand_skeleton_pos < keyword_pos + keyword.size())) { |
| // Keyword and brand overlap, ignore. |
| continue; |
| } |
| |
| if (is_hard_coded) { |
| *matched_domain = FindMatchedDomainForHardCodedComboSquatting( |
| brand_name, navigated_domain); |
| } else { |
| *matched_domain = FindMatchedDomainForSiteEngagementComboSquatting( |
| brand_name, navigated_domain, engaged_sites); |
| } |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
// Substrings that make a hostname unsafe because of ligature rendering in
// some fonts. Longer unsafe strings that already contain one of these entries
// are not listed separately, since matching is by substring.
constexpr const char* kUnsafeLigatures[] = {
    "g_logo",
    "o_logo",
    "l_logo",
    "e_logo",
    // google_logo is also unsafe, but e_logo is its substring.
    // super_g_logo is also unsafe, but g_logo is its substring.
    "google_g",
    "glogoligature",
    "ologoligature",
    "llogoligature",
    "elogoligature",
    // googlelogoligature is also unsafe, but elogoligature is its substring.
};
| |
| bool IsUnsafeLigature(const DomainInfo& domain) { |
| for (const char* unsafe_ligature : kUnsafeLigatures) { |
| if (domain.hostname.find(unsafe_ligature) != std::string::npos) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| } // namespace |
| |
| namespace lookalikes { |
| |
// Names of the UMA enumeration histograms written by RecordUMAFromMatchType():
// the first is used for regular profiles, the second when |is_incognito| is
// true.
const char kInterstitialHistogramName[] = "NavigationSuggestion.Event2";
const char kIncognitoInterstitialHistogramName[] =
    "NavigationSuggestion.Event2.Incognito";
| |
// Registers the list pref prefs::kLookalikeWarningAllowlistDomains on
// |registry|. This is the list that IsAllowedByEnterprisePolicy() reads.
void RegisterProfilePrefs(user_prefs::PrefRegistrySyncable* registry) {
  registry->RegisterListPref(prefs::kLookalikeWarningAllowlistDomains);
}
| |
| std::string GetConsoleMessage(const GURL& lookalike_url, |
| bool is_new_heuristic) { |
| const char* const kNewHeuristicMessage = |
| "Future Chrome versions will show a warning on this domain name.\n"; |
| return base::StrCat({"Chrome has determined that ", lookalike_url.host(), |
| " could be fake or fraudulent.\n\n", |
| is_new_heuristic ? kNewHeuristicMessage : "", |
| "If you believe this is shown in error please visit " |
| "https://g.co/chrome/lookalike-warnings"}); |
| } |
| |
// Constructs a DomainInfo by copying every argument into the member with the
// same name minus the |arg_| prefix. No validation or derivation happens
// here; see GetDomainInfo() for how the values are computed from a hostname.
DomainInfo::DomainInfo(
    const std::string& arg_hostname,
    const std::string& arg_domain_and_registry,
    const std::string& arg_domain_without_registry,
    const url_formatter::IDNConversionResult& arg_idn_result,
    const url_formatter::Skeletons& arg_skeletons,
    const url_formatter::Skeletons& arg_domain_without_registry_skeletons)
    : hostname(arg_hostname),
      domain_and_registry(arg_domain_and_registry),
      domain_without_registry(arg_domain_without_registry),
      idn_result(arg_idn_result),
      skeletons(arg_skeletons),
      domain_without_registry_skeletons(arg_domain_without_registry_skeletons) {
}

// Compiler-generated destructor.
DomainInfo::~DomainInfo() = default;

// Compiler-generated member-wise copy constructor.
DomainInfo::DomainInfo(const DomainInfo&) = default;
| |
| DomainInfo GetDomainInfo(const std::string& hostname) { |
| TRACE_EVENT0("navigation", "GetDomainInfo"); |
| if (net::HostStringIsLocalhost(hostname) || |
| net::IsHostnameNonUnique(hostname)) { |
| return DomainInfo(std::string(), std::string(), std::string(), |
| url_formatter::IDNConversionResult(), |
| url_formatter::Skeletons(), url_formatter::Skeletons()); |
| } |
| const std::string domain_and_registry = GetETLDPlusOne(hostname); |
| const std::string domain_without_registry = |
| domain_and_registry.empty() |
| ? std::string() |
| : url_formatter::top_domains::HostnameWithoutRegistry( |
| domain_and_registry); |
| |
| // eTLD+1 can be empty for private domains. |
| if (domain_and_registry.empty()) { |
| return DomainInfo(hostname, domain_and_registry, domain_without_registry, |
| url_formatter::IDNConversionResult(), |
| url_formatter::Skeletons(), url_formatter::Skeletons()); |
| } |
| // Compute skeletons using eTLD+1, skipping all spoofing checks. Spoofing |
| // checks in url_formatter can cause the converted result to be punycode. |
| // We want to avoid this in order to get an accurate skeleton for the unicode |
| // version of the domain. |
| const url_formatter::IDNConversionResult idn_result = |
| url_formatter::UnsafeIDNToUnicodeWithDetails(domain_and_registry); |
| const url_formatter::Skeletons skeletons = |
| url_formatter::GetSkeletons(idn_result.result); |
| |
| const url_formatter::IDNConversionResult domain_without_registry_idn_result = |
| url_formatter::UnsafeIDNToUnicodeWithDetails(domain_without_registry); |
| const url_formatter::Skeletons domain_without_registry_skeletons = |
| url_formatter::GetSkeletons(domain_without_registry_idn_result.result); |
| return DomainInfo(hostname, domain_and_registry, domain_without_registry, |
| idn_result, skeletons, domain_without_registry_skeletons); |
| } |
| |
| DomainInfo GetDomainInfo(const GURL& url) { |
| return GetDomainInfo(url.GetHost()); |
| } |
| |
| std::string GetETLDPlusOne(const std::string& hostname) { |
| auto pub = net::registry_controlled_domains::GetDomainAndRegistry( |
| hostname, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); |
| auto priv = net::registry_controlled_domains::GetDomainAndRegistry( |
| hostname, net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES); |
| // If there is no difference in eTLD+1 with/without private registries, then |
| // the domain uses a public registry and we can return the eTLD+1 safely. |
| if (pub == priv) { |
| return pub; |
| } |
| // Otherwise, the domain uses a private registry and |pub| is that private |
| // registry. If it's a de-facto-public registry, return the private eTLD+1. |
| for (auto* private_registry : kPrivateRegistriesTreatedAsPublic) { |
| if (private_registry == pub) { |
| return priv; |
| } |
| } |
| // Otherwise, ignore the normal private registry and return the public eTLD+1. |
| return pub; |
| } |
| |
// Returns true if |str1| can be turned into |str2| with at most one
// single-character insertion, deletion or substitution (identical strings
// also return true).
bool IsEditDistanceAtMostOne(const std::u16string& str1,
                             const std::u16string& str2) {
  const size_t len1 = str1.size();
  const size_t len2 = str2.size();
  // A length difference of two or more needs at least two edits.
  if (len1 > len2 + 1 || len2 > len1 + 1) {
    return false;
  }

  size_t pos1 = 0;
  size_t pos2 = 0;
  bool edit_used = false;
  while (pos1 < len1 && pos2 < len2) {
    if (str1[pos1] == str2[pos2]) {
      ++pos1;
      ++pos2;
      continue;
    }

    // Second mismatch: more than one edit is required.
    if (edit_used) {
      return false;
    }
    edit_used = true;

    // Skip the extra character of the longer string. For equal lengths the
    // mismatch must be a substitution, so both positions advance.
    if (len1 >= len2) {
      ++pos1;
    }
    if (len2 >= len1) {
      ++pos2;
    }
  }

  // A leftover trailing character counts as one more edit.
  const bool has_leftover = pos1 != len1 || pos2 != len2;
  return !(edit_used && has_leftover);
}
| |
| bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain, |
| const DomainInfo& matched_domain) { |
| DCHECK(url_formatter::top_domains::IsEditDistanceCandidate( |
| matched_domain.domain_and_registry)); |
| DCHECK(url_formatter::top_domains::IsEditDistanceCandidate( |
| navigated_domain.domain_and_registry)); |
| // If the only difference between the domains is the registry part, this is |
| // unlikely to be a spoofing attempt and we should ignore this match. E.g. |
| // exclude matches like google.com.tw and google.com.tr. |
| if (navigated_domain.domain_without_registry == |
| matched_domain.domain_without_registry) { |
| return true; |
| } |
| |
| // If the domains only differ by a numeric suffix on their e2LD (e.g. |
| // site45.tld and site35.tld), then ignore the match. |
| auto nav_trimmed = base::TrimString(navigated_domain.domain_without_registry, |
| kDigitChars, base::TRIM_TRAILING); |
| auto matched_trimmed = base::TrimString( |
| matched_domain.domain_without_registry, kDigitChars, base::TRIM_TRAILING); |
| DCHECK_NE(navigated_domain.domain_without_registry, |
| matched_domain.domain_without_registry); |
| // We previously verified that the domains without registries weren't equal, |
| // so if they're equal now, the match must have come from numeric suffixes. |
| if (nav_trimmed == matched_trimmed) { |
| return true; |
| } |
| |
| // Ignore domains that only differ by an insertion/substitution at the |
| // start, as these are usually different words, not lookalikes. |
| const auto nav_dom_len = navigated_domain.domain_and_registry.length(); |
| const auto matched_dom_len = matched_domain.domain_and_registry.length(); |
| const auto& nav_dom = navigated_domain.domain_and_registry; |
| const auto& matched_dom = matched_domain.domain_and_registry; |
| if (nav_dom_len == matched_dom_len) { |
| // e.g. hank vs tank |
| if (nav_dom.substr(1) == matched_dom.substr(1)) { |
| return true; |
| } |
| } else if (nav_dom_len < matched_dom_len) { |
| // e.g. oodle vs poodle |
| if (nav_dom == matched_dom.substr(1)) { |
| return true; |
| } |
| } else { // navigated_dom_len > matched_dom_len |
| // e.g. poodle vs oodle |
| if (nav_dom.substr(1) == matched_dom) { |
| return true; |
| } |
| } |
| |
| // Ignore domains that only differ by an insertion of a "-". |
| if (nav_dom_len != matched_dom_len) { |
| if (nav_dom_len < matched_dom_len && |
| GetFirstDifferentChar(matched_dom, nav_dom) == '-') { |
| return true; |
| } else if (nav_dom_len > matched_dom_len && |
| GetFirstDifferentChar(nav_dom, matched_dom) == '-') { |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| bool IsLikelyCharacterSwapFalsePositive(const DomainInfo& navigated_domain, |
| const DomainInfo& matched_domain) { |
| DCHECK(url_formatter::top_domains::IsEditDistanceCandidate( |
| matched_domain.domain_and_registry)); |
| DCHECK(url_formatter::top_domains::IsEditDistanceCandidate( |
| navigated_domain.domain_and_registry)); |
| // If the only difference between the domains is the registry part, this is |
| // unlikely to be a spoofing attempt and we should ignore this match. E.g. |
| // exclude matches like google.sr and google.rs. |
| return navigated_domain.domain_without_registry == |
| matched_domain.domain_without_registry; |
| } |
| |
| bool GetMatchingDomain( |
| const DomainInfo& navigated_domain, |
| const std::vector<DomainInfo>& engaged_sites, |
| const LookalikeTargetAllowlistChecker& in_target_allowlist, |
| const reputation::SafetyTipsConfig* config_proto, |
| std::string* matched_domain, |
| LookalikeUrlMatchType* match_type) { |
| DCHECK(!navigated_domain.domain_and_registry.empty()); |
| DCHECK(matched_domain); |
| DCHECK(match_type); |
| |
| if (navigated_domain.idn_result.has_idn_component) { |
| // If the navigated domain is IDN, check its skeleton against engaged sites |
| // and top domains. |
| const std::string matched_engaged_domain = |
| GetMatchingSiteEngagementDomain(engaged_sites, navigated_domain); |
| DCHECK_NE(navigated_domain.domain_and_registry, matched_engaged_domain); |
| if (!matched_engaged_domain.empty()) { |
| *matched_domain = matched_engaged_domain; |
| *match_type = LookalikeUrlMatchType::kSkeletonMatchSiteEngagement; |
| return true; |
| } |
| |
| if (!navigated_domain.idn_result.matching_top_domain.domain.empty()) { |
| // In practice, this is not possible since the top domain list does not |
| // contain IDNs, so domain_and_registry can't both have IDN and be a top |
| // domain. Still, sanity check in case the top domain list changes in the |
| // future. |
| // At this point, navigated domain should not be a top domain. |
| DCHECK_NE(navigated_domain.domain_and_registry, |
| navigated_domain.idn_result.matching_top_domain.domain); |
| *matched_domain = navigated_domain.idn_result.matching_top_domain.domain; |
| *match_type = |
| navigated_domain.idn_result.matching_top_domain.is_top_bucket |
| ? LookalikeUrlMatchType::kSkeletonMatchTop500 |
| : LookalikeUrlMatchType::kSkeletonMatchTop5k; |
| return true; |
| } |
| } |
| |
| if (url_formatter::top_domains::IsEditDistanceCandidate( |
| navigated_domain.domain_and_registry)) { |
| // If we can't find an exact top domain or an engaged site, try to find an |
| // engaged domain within an edit distance of one or a single character swap. |
| if (GetSimilarDomainFromEngagedSites(navigated_domain, engaged_sites, |
| in_target_allowlist, matched_domain, |
| match_type)) { |
| DCHECK_NE(navigated_domain.domain_and_registry, *matched_domain); |
| return true; |
| } |
| |
| // Finally, try to find a top domain within an edit distance or character |
| // swap of one. |
| if (GetSimilarDomainFromTopBucket(navigated_domain, in_target_allowlist, |
| matched_domain, match_type)) { |
| DCHECK_NE(navigated_domain.domain_and_registry, *matched_domain); |
| DCHECK(!matched_domain->empty()); |
| return true; |
| } |
| } |
| |
| TargetEmbeddingType embedding_type = |
| GetTargetEmbeddingType(navigated_domain.hostname, engaged_sites, |
| in_target_allowlist, config_proto, matched_domain); |
| if (embedding_type == TargetEmbeddingType::kSafetyTip) { |
| *match_type = LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips; |
| return true; |
| } else if (embedding_type == TargetEmbeddingType::kInterstitial) { |
| *match_type = LookalikeUrlMatchType::kTargetEmbedding; |
| return true; |
| } |
| |
| // If none of the previous heuristics work, check it for Combo Squatting. |
| ComboSquattingType combo_squatting_type = |
| GetComboSquattingType(navigated_domain, engaged_sites, matched_domain); |
| if (combo_squatting_type == ComboSquattingType::kHardCoded) { |
| *match_type = LookalikeUrlMatchType::kComboSquatting; |
| DCHECK(!matched_domain->empty()); |
| return true; |
| } else if (combo_squatting_type == ComboSquattingType::kSiteEngagement) { |
| *match_type = LookalikeUrlMatchType::kComboSquattingSiteEngagement; |
| DCHECK(!matched_domain->empty()); |
| return true; |
| } |
| |
| DCHECK(embedding_type == TargetEmbeddingType::kNone); |
| DCHECK(combo_squatting_type == ComboSquattingType::kNone); |
| return false; |
| } |
| |
| void RecordUMAFromMatchType(LookalikeUrlMatchType match_type, |
| bool is_incognito) { |
| std::optional<NavigationSuggestionEvent> event = |
| ToNavigationSuggestionEvent(match_type); |
| if (event) { |
| if (is_incognito) { |
| UMA_HISTOGRAM_ENUMERATION(lookalikes::kIncognitoInterstitialHistogramName, |
| *event); |
| } else { |
| UMA_HISTOGRAM_ENUMERATION(lookalikes::kInterstitialHistogramName, *event); |
| } |
| } |
| } |
| |
| TargetEmbeddingType GetTargetEmbeddingType( |
| const std::string& hostname, |
| const std::vector<DomainInfo>& engaged_sites, |
| const LookalikeTargetAllowlistChecker& in_target_allowlist, |
| const reputation::SafetyTipsConfig* config_proto, |
| std::string* safe_hostname) { |
| // Because of how target embeddings are detected (i.e. by sweeping the URL |
| // from back to front), we're guaranteed to find tail-embedding before other |
| // target embedding. Tail embedding triggers a safety tip, but interstitials |
| // are more important than safety tips, so if we find a safety tippable |
| // embedding with SearchForEmbeddings, go search again not permitting safety |
| // tips to see if we can also find an interstitiallable embedding. |
| auto result = SearchForEmbeddings( |
| hostname, engaged_sites, in_target_allowlist, config_proto, |
| /*safety_tips_allowed=*/true, safe_hostname); |
| if (result == TargetEmbeddingType::kSafetyTip) { |
| std::string no_st_safe_hostname; |
| auto no_st_result = SearchForEmbeddings( |
| hostname, engaged_sites, in_target_allowlist, config_proto, |
| /*safety_tips_allowed=*/false, &no_st_safe_hostname); |
| if (no_st_result == TargetEmbeddingType::kNone) { |
| return result; |
| } |
| *safe_hostname = no_st_safe_hostname; |
| return no_st_result; |
| } |
| return result; |
| } |
| |
// Searches |hostname| for an embedded target domain.
//
// The hostname is split into tokens with SplitDomainIntoTokens(). For every
// token prefix [0, end), from the longest to the shortest, three checks run
// in order:
//   1. Whether the last token of the prefix is a top domain written without
//      separators (e.g. "googlecom").
//   2. Whether any token range [start, end) equals the hostname of an engaged
//      site.
//   3. Whether the prefix matches according to
//      DoesETLDPlus1MatchTopDomainOrEngagedSite().
// A match only counts if IsAllowedToBeEmbedded() returns false for it.
//
// Returns kInterstitial for check 1 and for any match of checks 2 and 3 that
// does not end at the last token. A match ending at the last token (tail
// embedding) returns kSafetyTip when |safety_tips_allowed| is true and is
// skipped otherwise. Returns kNone when the search is exhausted.
//
// |safe_hostname| receives the matched hostname. Note that checks 2 and 3 may
// write to it on paths that keep searching, so its value is only meaningful
// when the return value is not kNone.
TargetEmbeddingType SearchForEmbeddings(
    const std::string& hostname,
    const std::vector<DomainInfo>& engaged_sites,
    const LookalikeTargetAllowlistChecker& in_target_allowlist,
    const reputation::SafetyTipsConfig* config_proto,
    bool safety_tips_allowed,
    std::string* safe_hostname) {
  // eTLD+1 of the full hostname, i.e. the domain doing the embedding. It is
  // passed to every IsAllowedToBeEmbedded() call below.
  const std::string embedding_domain = GetETLDPlusOne(hostname);
  const std::vector<std::string_view> hostname_tokens =
      SplitDomainIntoTokens(hostname);

  // There are O(n^2) potential target embeddings in a domain name. We want to
  // be comprehensive, but optimize so that usually we needn't check all of
  // them. We do that by sweeping from the back of the embedding domain, towards
  // the front, checking for a valid eTLD. If we find one, then we consider the
  // possible embedded domains that end in that eTLD (i.e. all possible start
  // points from the beginning of the string onward).
  for (size_t end = hostname_tokens.size(); end > 0; --end) {
    // View over the first |end| tokens, re-joined with "." for the eTLD check.
    base::span<const std::string_view> UNSAFE_TODO(
        etld_check_span(hostname_tokens.data(), end));
    std::string etld_check_host = base::JoinString(etld_check_span, ".");
    auto etld_check_dominfo = GetDomainInfo(etld_check_host);

    // Check if the final token is a no-separator target (e.g. "googlecom").
    // This check happens first so that we can exclude invalid eTLD+1s next.
    std::string embedded_target =
        GetMatchingTopDomainWithoutSeparators(hostname_tokens[end - 1]);
    if (!embedded_target.empty()) {
      // Extract the full possibly-spoofed domain. To get this, we take the
      // hostname up until this point, strip off the no-separator bit (e.g.
      // googlecom) and then re-add the separated version (e.g. google.com).
      auto spoofed_domain =
          etld_check_host.substr(
              0, etld_check_host.length() - hostname_tokens[end - 1].length()) +
          embedded_target;
      const auto no_separator_tokens = base::SplitStringPiece(
          spoofed_domain, kTargetEmbeddingSeparators, base::TRIM_WHITESPACE,
          base::SPLIT_WANT_NONEMPTY);
      auto no_separator_dominfo = GetDomainInfo(embedded_target);

      // Only flag on domains that are long enough, don't use common words, and
      // aren't target-allowlisted.
      if (no_separator_dominfo.domain_without_registry.length() >
              kMinE2LDLengthForTargetEmbedding &&
          !IsAllowedToBeEmbedded(no_separator_dominfo, no_separator_tokens,
                                 in_target_allowlist, embedding_domain,
                                 config_proto)) {
        *safe_hostname = embedded_target;
        return TargetEmbeddingType::kInterstitial;
      }
    }

    // Exclude otherwise-invalid eTLDs.
    if (etld_check_dominfo.domain_without_registry.empty()) {
      continue;
    }

    // Exclude e2LDs that are too short. <= because domain_without_registry has
    // a trailing ".".
    if (etld_check_dominfo.domain_without_registry.length() <=
        kMinE2LDLengthForTargetEmbedding) {
      continue;
    }

    // Check for exact matches against engaged sites, among all possible
    // subdomains ending at |end|.
    for (size_t start = 0; start < end - 1; ++start) {
      // View over tokens [start, end).
      const base::span<const std::string_view> UNSAFE_TODO(
          span(hostname_tokens.data() + start, end - start));
      auto embedded_hostname = base::JoinString(span, ".");
      auto embedded_dominfo = GetDomainInfo(embedded_hostname);

      for (auto& engaged_site : engaged_sites) {
        if (engaged_site.hostname == embedded_dominfo.hostname &&
            !IsAllowedToBeEmbedded(embedded_dominfo, span, in_target_allowlist,
                                   embedding_domain, config_proto)) {
          // Written before deciding whether to return, so it also happens on
          // the "keep searching" path below.
          *safe_hostname = engaged_site.hostname;
          // Tail-embedding (e.g. evil-google.com, where the embedding happens
          // at the very end of the hostname) is a safety tip, but only when
          // safety tips are allowed. If it's tail embedding but we can't create
          // a safety tip, keep looking. Non-tail-embeddings are interstitials.
          if (end != hostname_tokens.size()) {
            return TargetEmbeddingType::kInterstitial;
          } else if (safety_tips_allowed) {
            return TargetEmbeddingType::kSafetyTip;
          }  // else keep searching.
        }
      }
    }

    // There were no exact engaged site matches, but there may yet still be a
    // match against the eTLD+1 of an engaged or top site.
    if (DoesETLDPlus1MatchTopDomainOrEngagedSite(
            etld_check_dominfo, engaged_sites, safe_hostname) &&
        !IsAllowedToBeEmbedded(etld_check_dominfo, etld_check_span,
                               in_target_allowlist, embedding_domain,
                               config_proto)) {
      // Tail-embedding (e.g. evil-google.com, where the embedding happens at
      // the very end of the hostname) is a safety tip, but only when safety
      // tips are allowed. If it's tail embedding but we can't create a safety
      // tip, keep looking. Non-tail-embeddings are interstitials.
      if (end != hostname_tokens.size()) {
        return TargetEmbeddingType::kInterstitial;
      } else if (safety_tips_allowed) {
        return TargetEmbeddingType::kSafetyTip;
      }  // else keep searching.
    }
  }
  return TargetEmbeddingType::kNone;
}
| |
| bool IsASCII(UChar32 codepoint) { |
| return !(codepoint & ~0x7F); |
| } |
| |
| // Returns true if |codepoint| has emoji related properties. |
| bool IsEmojiRelatedCodepoint(UChar32 codepoint) { |
| return u_hasBinaryProperty(codepoint, UCHAR_EMOJI) || |
| // Characters that have emoji presentation by default (e.g. hourglass) |
| u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION) || |
| // Characters displayed as country flags when used as a valid pair. |
| // E.g. Regional Indicator Symbol Letter B used once in a string |
| // is rendered as 🇧, used twice is rendered as the flag of Barbados |
| // (with country code BB). It's therefore possible to come up with |
| // a spoof using regional indicator characters as text, but these |
| // domain names will be readily punycoded and detecting pairs isn't |
| // easy so we keep the code simple here. |
| u_hasBinaryProperty(codepoint, UCHAR_REGIONAL_INDICATOR) || |
| // Pictographs such as Black Cross On Shield (U+26E8). |
| u_hasBinaryProperty(codepoint, UCHAR_EXTENDED_PICTOGRAPHIC); |
| } |
| |
| // Returns true if |text| contains only ASCII characters, pictographs |
| // or emojis. This check is only used to determine if a domain that already |
| // failed spoof checks should be blocked by an interstitial. Ideally, we would |
| // check this for non-ASCII scripts as well (e.g. Cyrillic + emoji), but such |
| // usage isn't common. |
| bool IsASCIIAndEmojiOnly(std::u16string_view text) { |
| for (base::i18n::UTF16CharIterator iter(text); !iter.end(); iter.Advance()) { |
| const UChar32 codepoint = iter.get(); |
| if (!IsASCII(codepoint) && !IsEmojiRelatedCodepoint(codepoint)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| // Returns true if the e2LD of domain is long enough to display a punycode |
| // interstitial. |
| bool IsPunycodeInterstitialCandidate(const DomainInfo& domain) { |
| const url_formatter::IDNConversionResult idn_result = |
| url_formatter::UnsafeIDNToUnicodeWithDetails( |
| domain.domain_without_registry); |
| return idn_result.result.size() >= |
| kMinimumE2LDLengthToShowPunycodeInterstitial; |
| } |
| |
| bool ShouldBlockBySpoofCheckResult(const DomainInfo& navigated_domain) { |
| if (IsUnsafeLigature(navigated_domain)) { |
| return true; |
| } |
| |
| // Here, only a subset of spoof checks that cause an IDN to fallback to |
| // punycode are configured to show an interstitial. |
| switch (navigated_domain.idn_result.spoof_check_result) { |
| case url_formatter::IDNSpoofCheckerResult::kNone: |
| case url_formatter::IDNSpoofCheckerResult::kSafe: |
| return false; |
| |
| case url_formatter::IDNSpoofCheckerResult::kICUSpoofChecks: |
| // If the eTLD+1 contains only a mix of ASCII + Emoji, allow. |
| return !IsASCIIAndEmojiOnly(navigated_domain.idn_result.result) && |
| IsPunycodeInterstitialCandidate(navigated_domain); |
| |
| case url_formatter::IDNSpoofCheckerResult::kDeviationCharacters: |
| // Failures because of deviation characters, especially ß, is common. |
| return false; |
| |
| case url_formatter::IDNSpoofCheckerResult::kTLDSpecificCharacters: |
| case url_formatter::IDNSpoofCheckerResult::kUnsafeMiddleDot: |
| case url_formatter::IDNSpoofCheckerResult::kWholeScriptConfusable: |
| case url_formatter::IDNSpoofCheckerResult::kDigitLookalikes: |
| case url_formatter::IDNSpoofCheckerResult:: |
| kNonAsciiLatinCharMixedWithNonLatin: |
| case url_formatter::IDNSpoofCheckerResult::kDangerousPattern: |
| return IsPunycodeInterstitialCandidate(navigated_domain); |
| } |
| } |
| |
| bool IsAllowedByEnterprisePolicy(const PrefService* pref_service, |
| const GURL& url) { |
| const base::Value::List& list = |
| pref_service->GetList(prefs::kLookalikeWarningAllowlistDomains); |
| |
| for (const auto& domain_val : list) { |
| const std::string& domain = domain_val.GetString(); |
| if (url.DomainIs(domain)) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| void SetEnterpriseAllowlistForTesting(PrefService* pref_service, |
| const std::vector<std::string>& hosts) { |
| base::Value::List list; |
| for (const auto& host : hosts) { |
| list.Append(host); |
| } |
| pref_service->SetList(prefs::kLookalikeWarningAllowlistDomains, |
| std::move(list)); |
| } |
| |
// Returns true if |str2| can be obtained from |str1| by swapping exactly one
// pair of adjacent characters (e.g. "google" and "googel"). Returns false for
// identical strings, strings of different lengths, and strings that differ in
// any other way (including more than one swap).
bool HasOneCharacterSwap(const std::u16string& str1,
                         const std::u16string& str2) {
  const size_t length = str1.size();
  if (length != str2.size() || str1 == str2) {
    return false;
  }

  bool has_swap = false;
  size_t pos = 0;
  while (pos < length) {
    if (str1[pos] == str2[pos]) {
      ++pos;
      continue;
    }

    // Mismatch at |pos|. It can only be part of a swap if this is the first
    // one seen and there is a following character to swap with. Checking the
    // bound first avoids reading past the end of the strings when the
    // mismatch is at the last character.
    if (has_swap || pos + 1 >= length) {
      return false;
    }

    const bool is_adjacent_swap =
        str1[pos] == str2[pos + 1] && str2[pos] == str1[pos + 1];
    if (!is_adjacent_swap) {
      // The strings have completely different characters here.
      return false;
    }

    has_swap = true;
    // Skip both characters of the swapped pair.
    pos += 2;
  }
  return has_swap;
}
| |
| void SetTopBucketDomainsParamsForTesting(const TopBucketDomainsParams& params) { |
| *GetTopDomainParams() = params; |
| } |
| |
| void ResetTopBucketDomainsParamsForTesting() { |
| TopBucketDomainsParams* params = GetTopDomainParams(); |
| *params = {top_bucket_domains::kTopBucketEditDistanceSkeletons, |
| top_bucket_domains::kNumTopBucketEditDistanceSkeletons}; |
| } |
| |
| bool IsHeuristicEnabledForHostname( |
| const reputation::SafetyTipsConfig* config_proto, |
| const reputation::HeuristicLaunchConfig::Heuristic heuristic, |
| const std::string& lookalike_etld_plus_one, |
| version_info::Channel channel) { |
| DCHECK(!lookalike_etld_plus_one.empty()); |
| if (!config_proto) { |
| return false; |
| } |
| base::SHA1Digest hash = |
| base::SHA1Hash(base::as_byte_span(lookalike_etld_plus_one)); |
| float cohort = hash[0u] / 2.56; |
| for (const reputation::HeuristicLaunchConfig& config : |
| config_proto->launch_config()) { |
| if (heuristic == config.heuristic()) { |
| switch (channel) { |
| // Enable by default on local builds. |
| case version_info::Channel::UNKNOWN: |
| return true; |
| |
| // Use pre-defined launch percentages for Canary/Dev and Beta. Use the |
| // launch percentage from config for Stable. |
| case version_info::Channel::CANARY: |
| case version_info::Channel::DEV: |
| return kDefaultLaunchPercentageOnCanaryDev > cohort; |
| |
| case version_info::Channel::BETA: |
| return kDefaultLaunchPercentageOnBeta > cohort; |
| |
| case version_info::Channel::STABLE: |
| return config.launch_percentage() > cohort; |
| } |
| } |
| } |
| return false; |
| } |
| |
| void SetComboSquattingParamsForTesting(const ComboSquattingParams& params) { |
| *GetComboSquattingParams() = params; |
| } |
| |
| void ResetComboSquattingParamsForTesting() { |
| ComboSquattingParams* params = GetComboSquattingParams(); |
| *params = {kBrandNamesForCSQ, kSkeletonsOfPopularKeywordsForCSQ}; |
| } |
| |
| ComboSquattingType GetComboSquattingType( |
| const DomainInfo& navigated_domain, |
| const std::vector<DomainInfo>& engaged_sites, |
| std::string* matched_domain) { |
| const ComboSquattingParams* combo_squatting_params = |
| GetComboSquattingParams(); |
| |
| // First check Combo Squatting with hard coded brand names. |
| std::vector<std::pair<std::string, std::string>> brand_names; |
| for (auto* it : combo_squatting_params->brand_names) { |
| brand_names.emplace_back(std::string(it[0]), |
| std::string(UNSAFE_TODO(it[1]))); |
| } |
| if (IsComboSquatting(brand_names, *combo_squatting_params, navigated_domain, |
| engaged_sites, matched_domain, |
| /*is_hard_coded=*/true)) { |
| return ComboSquattingType::kHardCoded; |
| } |
| |
| // Then check Combo Squatting with brand names in engaged sites. |
| brand_names = GetBrandNamesFromEngagedSites(engaged_sites); |
| if (IsComboSquatting(brand_names, *combo_squatting_params, navigated_domain, |
| engaged_sites, matched_domain, |
| /*is_hard_coded=*/false)) { |
| return ComboSquattingType::kSiteEngagement; |
| } |
| |
| return ComboSquattingType::kNone; |
| } |
| |
| bool IsSafeTLD(const std::string& hostname) { |
| // This is intentionally kept simple and currently ignores hostnames with |
| // ccTLDs (e.g. gov.in). |
| return base::EndsWith(hostname, ".gov") || base::EndsWith(hostname, ".mil"); |
| } |
| |
| LookalikeActionType GetActionForMatchType( |
| const reputation::SafetyTipsConfig* config, |
| version_info::Channel channel, |
| const std::string& etld_plus_one, |
| LookalikeUrlMatchType match_type) { |
| switch (match_type) { |
| case LookalikeUrlMatchType::kEditDistance: |
| // Edit distance is too noisy, just record metrics. |
| return LookalikeActionType::kRecordMetrics; |
| |
| case LookalikeUrlMatchType::kEditDistanceSiteEngagement: |
| return LookalikeActionType::kShowSafetyTip; |
| |
| case LookalikeUrlMatchType::kTargetEmbedding: |
| #if BUILDFLAG(IS_IOS) |
| // TODO(crbug.com/40705070): Only enable target embedding on iOS once we |
| // can |
| // check engaged sites. Otherwise, false positives are too high. |
| return LookalikeActionType::kRecordMetrics; |
| #else |
| return LookalikeActionType::kShowInterstitial; |
| #endif |
| |
| case LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips: |
| return LookalikeActionType::kShowSafetyTip; |
| |
| case LookalikeUrlMatchType::kSkeletonMatchTop5k: |
| return LookalikeActionType::kShowSafetyTip; |
| |
| case LookalikeUrlMatchType::kFailedSpoofChecks: |
| return LookalikeActionType::kShowInterstitial; |
| |
| case LookalikeUrlMatchType::kSkeletonMatchSiteEngagement: |
| case LookalikeUrlMatchType::kSkeletonMatchTop500: |
| return LookalikeActionType::kShowInterstitial; |
| |
| case LookalikeUrlMatchType::kCharacterSwapSiteEngagement: |
| case LookalikeUrlMatchType::kCharacterSwapTop500: |
| return LookalikeActionType::kShowSafetyTip; |
| |
| case LookalikeUrlMatchType::kComboSquatting: |
| return IsHeuristicEnabledForHostname( |
| config, |
| reputation::HeuristicLaunchConfig:: |
| HEURISTIC_COMBO_SQUATTING_TOP_DOMAINS, |
| etld_plus_one, channel) |
| ? LookalikeActionType::kShowSafetyTip |
| : LookalikeActionType::kRecordMetrics; |
| |
| case LookalikeUrlMatchType::kComboSquattingSiteEngagement: |
| return IsHeuristicEnabledForHostname( |
| config, |
| reputation::HeuristicLaunchConfig:: |
| HEURISTIC_COMBO_SQUATTING_ENGAGED_SITES, |
| etld_plus_one, channel) |
| ? LookalikeActionType::kShowSafetyTip |
| : LookalikeActionType::kRecordMetrics; |
| |
| case LookalikeUrlMatchType::kNone: |
| NOTREACHED(); |
| } |
| |
| NOTREACHED(); |
| } |
| |
| GURL GetSuggestedURL(LookalikeUrlMatchType match_type, |
| const GURL& navigated_url, |
| const std::string& matched_hostname) { |
| // matched_hostname can be a top domain or an engaged domain. Simply use its |
| // eTLD+1 as the suggested domain. |
| // 1. If matched_hostname is a top domain: Top domain list already contains |
| // eTLD+1s only so this works well. |
| // 2. If matched_hostname is an engaged domain and is not an eTLD+1, don't |
| // suggest it. Otherwise, navigating to googlé.com and having engaged with |
| // docs.google.com would suggest docs.google.com. |
| // |
| // When the navigated and matched domains are not eTLD+1s (e.g. |
| // docs.googlé.com and docs.google.com), this will suggest google.com |
| // instead of docs.google.com. This is less than ideal, but has two |
| // benefits: |
| // - Simpler code |
| // - Fewer suggestions to non-existent domains. E.g. When the navigated |
| // domain is nonexistent.googlé.com and the matched domain is |
| // docs.google.com, we will suggest google.com instead of |
| // nonexistent.google.com. |
| std::string suggested_domain = GetETLDPlusOne(matched_hostname); |
| DCHECK(!suggested_domain.empty()); |
| // Drop everything but the parts of the origin. |
| GURL::Replacements replace_host; |
| replace_host.SetHostStr(suggested_domain); |
| GURL suggested_url = |
| navigated_url.ReplaceComponents(replace_host).GetWithEmptyPath(); |
| |
| // Use https for top domain matches. |
| // TODO(crbug.com/40755923): If the match is against an engaged site, use the |
| // scheme of the engaged site instead. |
| if (suggested_url.SchemeIs(url::kHttpScheme) && |
| suggested_url.IntPort() == url::PORT_UNSPECIFIED && |
| (match_type == LookalikeUrlMatchType::kEditDistance || |
| match_type == LookalikeUrlMatchType::kSkeletonMatchTop500 || |
| match_type == LookalikeUrlMatchType::kSkeletonMatchTop5k)) { |
| GURL::Replacements replace_scheme; |
| replace_scheme.SetSchemeStr(url::kHttpsScheme); |
| suggested_url = suggested_url.ReplaceComponents(replace_scheme); |
| } |
| return suggested_url; |
| } |
| |
| } // namespace lookalikes |