components/lookalikes/core/lookalike_url_util.cc - chromium/src.git - Git at Google

 // Copyright 2020 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "components/lookalikes/core/lookalike_url_util.h"

 #include <algorithm>
 #include <string_view>
 #include <utility>

 #include "base/compiler_specific.h"
 #include "base/containers/contains.h"
 #include "base/functional/callback.h"
 #include "base/hash/sha1.h"
 #include "base/i18n/char_iterator.h"
 #include "base/metrics/histogram_macros.h"
 #include "base/no_destructor.h"
 #include "base/strings/strcat.h"
 #include "base/strings/string_split.h"
 #include "base/strings/string_util.h"
 #include "base/strings/utf_string_conversions.h"
 #include "base/trace_event/trace_event.h"
 #include "base/values.h"
 #include "build/build_config.h"
 #include "components/lookalikes/core/safety_tips_config.h"
 #include "components/security_interstitials/core/pref_names.h"
 #include "components/url_formatter/spoof_checks/common_words/common_words_util.h"
 #include "components/url_formatter/spoof_checks/top_domains/top_bucket_domains.h"
 #include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h"
 #include "components/url_formatter/url_formatter.h"
 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
 #include "net/base/url_util.h"
 #include "third_party/icu/source/common/unicode/uchar.h"
 #include "third_party/icu/source/common/unicode/utypes.h"

 using lookalikes::ComboSquattingParams;
 using lookalikes::DomainInfo;
 using lookalikes::GetDomainInfo;
 using lookalikes::HasOneCharacterSwap;
 using lookalikes::IsEditDistanceAtMostOne;
 using lookalikes::LookalikeTargetAllowlistChecker;
 using lookalikes::LookalikeUrlMatchType;
 using lookalikes::NavigationSuggestionEvent;
 using lookalikes::TopBucketDomainsParams;

 namespace {

 // Digits. Used for trimming domains in Edit Distance heuristic matches. Domains
 // that only differ by trailing digits (e.g. a1.tld and a2.tld) are ignored.
 // NOTE(review): no use of this constant is visible in this part of the file;
 // presumably it is consumed further down -- confirm.
 const char kDigitChars[] = "0123456789";

 // Minimum length of e2LD protected against target embedding. For example,
 // foo.bar.baz.com-evil.com embeds foo.bar.baz.com, but we don't flag it since
 // "baz" is shorter than kMinE2LDLengthForTargetEmbedding.
 const size_t kMinE2LDLengthForTargetEmbedding = 4;

 // We might not protect a domain whose e2LD is a common word in target embedding
 // based on the TLD that is paired with it. This list supplements words from
 // url_formatter::common_words::IsCommonWord(). Entries are compared against
 // DomainInfo::domain_without_registry in UsesCommonWord().
 constexpr const char* kLocalAdditionalCommonWords[] = {"asahi", "hoteles",
                                                        "jharkhand", "nifty"};

 // These domains are plausible lookalike targets, but they also use common words
 // in their names. Selectively prevent flagging embeddings where the embedder
 // ends in "-DOMAIN.TLD", since these tend to have higher false positive rates.
 // Entries are full eTLD+1s; they are compared against
 // DomainInfo::domain_and_registry in UsesCommonWord() and
 // EndsWithPermittedDomains().
 constexpr const char* kDomainsPermittedInEndEmbeddings[] = {
     "office.com", "medium.com", "orange.fr"};

 // What separators can be used to separate tokens in target embedding spoofs?
 // e.g. www-google.com.example.com uses "-" (www-google) and "." (google.com).
 // Each character of this string is treated as a separator by
 // SplitDomainIntoTokens().
 const char kTargetEmbeddingSeparators[] = "-.";

 // A small subset of private registries on the PSL that act like public
 // registries AND are a common source of false positives in lookalike checks. We
 // treat them as public for the purposes of lookalike checks. Entries are
 // matched as suffixes of an eTLD+1 in GetE2LDWithDeFactoPublicRegistries().
 constexpr const char* kPrivateRegistriesTreatedAsPublic[] = {"com.de",
                                                              "com.se"};

 // Returns the edit-distance skeleton table of the top bucket domains, wrapped
 // in a TopBucketDomainsParams. The object is a function-local static, so it is
 // built on first use and every call returns a pointer to the same instance.
 TopBucketDomainsParams* GetTopDomainParams() {
   static TopBucketDomainsParams top_bucket_params{
       top_bucket_domains::kTopBucketEditDistanceSkeletons,
       top_bucket_domains::kNumTopBucketEditDistanceSkeletons};
   TopBucketDomainsParams* const shared_instance = &top_bucket_params;
   return shared_instance;
 }

 // Minimum length of the eTLD+1 without registry needed to show the punycode
 // interstitial. IDN whose eTLD+1 without registry is shorter than this are
 // still displayed in punycode, but don't show an interstitial.
 // NOTE(review): the comparison that uses this value is not visible in this
 // part of the file, so whether the bound is inclusive cannot be confirmed here.
 const size_t kMinimumE2LDLengthToShowPunycodeInterstitial = 2;

 // Default launch percentage of a new heuristic on Canary/Dev and Beta. These
 // are used if there is a launch config for the heuristic in the proto.
 // Values are percentages (0-100).
 const int kDefaultLaunchPercentageOnCanaryDev = 90;
 const int kDefaultLaunchPercentageOnBeta = 50;

 // Define skeletons of brand names and popular keywords for using in Combo
 // Squatting heuristic. These lists are manually curated using Chrome metrics.
 // We will check combinations of brand names and popular keywords,
 // e.g. google-login.com or youtubesecure.com.
 // For every brand name, brand_name[.]com should be checked to be valid. If
 // no matched domain is found in top domains, brand_name[.]com will be
 // suggested to the user for navigation.
 // If brand_name[.]com is not valid for any brand name, each brand name should
 // be mapped to a valid url manually and the data structure of
 // `kBrandNamesForCSQ` should be changed accordingly.
 //
 // In each element of `kBrandNamesForCSQ`, the FIRST string is the original
 // brand name (it is used to build the suggested domain, so it must be the real
 // spelling) and the SECOND string is its skeleton (it is what gets searched
 // for inside the navigated domain's skeleton; e.g. 'm' becomes "rn"). If you
 // are adding a brand name here, you can generate its skeleton using the
 // format_url binary (components/url_formatter/tools/format_url.cc).
 // TODO(crbug.com/40855941): Generate skeletons of hard coded brand names in
 // Chrome initialization and remove manual adding of skeletons to this list.
 constexpr std::string_view kBrandNamesForCSQ[][2] = {
     {"adobe", "adobe"},
     {"airbnb", "airbnb"},
     {"alibaba", "alibaba"},
     {"aliexpress", "aliexpress"},
     {"amazon", "arnazon"},
     {"baidu", "baidu"},
     {"bestbuy", "bestbuy"},
     {"blogspot", "blogspot"},
     {"costco", "costco"},
     {"craigslist", "craigslist"},
     {"dropbox", "dropbox"},
     {"expedia", "expedia"},
     {"facebook", "facebook"},
     {"fedex", "fedex"},
     {"flickr", "flickr"},
     {"github", "github"},
     {"glassdoor", "glassdoor"},
     {"gofundme", "gofundrne"},
     {"google", "google"},
     {"homedepot", "hornedepot"},
     {"icloud", "icloud"},
     {"indeed", "indeed"},
     {"instagram", "instagrarn"},
     {"intuit", "intuit"},
     {"microsoft", "rnicrosoft"},
     {"nbcnews", "nbcnews"},
     {"netflix", "netflix"},
     {"norton", "norton"},
     {"nytimes", "nytirnes"},
     {"office365", "office365"},
     {"paypal", "paypal"},
     {"pinterest", "pinterest"},
     {"playstation", "playstation"},
     {"quora", "quora"},
     {"reddit", "reddit"},
     {"reuters", "reuters"},
     {"samsung", "sarnsung"},
     {"spotify", "spotify"},
     {"stackexchange", "stackexchange"},
     {"stackoverflow", "stackoverflow"},
     {"trello", "trello"},
     {"twitch", "twitch"},
     {"twitter", "twitter"},
     // Brand first, skeleton second: the suggested URL must be udemy.com, and
     // the string matched against domain skeletons must be "uderny".
     {"udemy", "uderny"},
     {"wikipedia", "wikipedia"},
     {"wordpress", "wordpress"},
     {"xfinity", "xfinity"},
     {"yahoo", "yahoo"},
     {"youtube", "youtube"},
     {"zillow", "zillow"}};

 // Each element in `kSkeletonsOfPopularKeywordsForCSQ` is the skeleton of a
 // popular keyword (note "adrnin", the skeleton of "admin"). In contrast to
 // `kBrandNamesForCSQ`, the original keywords are not included: original brand
 // names are needed to generate the matched domain, whereas original keywords
 // play no part in that process. If you are adding a keyword here, you can
 // generate its skeleton using the format_url binary
 // (components/url_formatter/tools/format_url.cc).
 constexpr std::string_view kSkeletonsOfPopularKeywordsForCSQ[] = {
     // Security
     "account",  "activate", "adrnin",   "coin",   "crypto",  "login", "logout",
     "password", "secure",   "security", "signin", "signout", "wallet"};

 // Length threshold for brands checked for Combo Squatting.
 // IsComboSquattingCandidate() compares with a strict `>`, so a brand name must
 // be longer than this value (i.e. at least 5 characters) to be checked.
 const size_t kMinBrandNameLengthForComboSquatting = 4;

 // Returns the Combo Squatting parameters built from the hard-coded brand list
 // (`kBrandNamesForCSQ`) and keyword skeleton list
 // (`kSkeletonsOfPopularKeywordsForCSQ`). The object lives in a function-local
 // base::NoDestructor, so every call returns the same pointer.
 ComboSquattingParams* GetComboSquattingParams() {
   static base::NoDestructor<ComboSquattingParams> hard_coded_params(
       {kBrandNamesForCSQ, kSkeletonsOfPopularKeywordsForCSQ});
   ComboSquattingParams* const shared_instance = hard_coded_params.get();
   return shared_instance;
 }

 // Returns true when the two skeleton sets have at least one skeleton in
 // common. Both sets are expected to be non-empty (DCHECKed).
 bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1,
                     const url_formatter::Skeletons& skeletons2) {
   DCHECK(!skeletons1.empty());
   DCHECK(!skeletons2.empty());
   return std::any_of(skeletons1.begin(), skeletons1.end(),
                      [&skeletons2](const std::string& candidate) {
                        return base::Contains(skeletons2, candidate);
                      });
 }

 // Returns a site that the user has used before that the eTLD+1 in
 // |domain_and_registry| may be attempting to spoof, based on skeleton
 // comparison.
 std::string GetMatchingSiteEngagementDomain(
     const std::vector<DomainInfo>& engaged_sites,
     const DomainInfo& navigated_domain) {
   DCHECK(!navigated_domain.domain_and_registry.empty());
   for (const DomainInfo& engaged_site : engaged_sites) {
     DCHECK(!engaged_site.domain_and_registry.empty());
     if (SkeletonsMatch(navigated_domain.skeletons, engaged_site.skeletons)) {
       return engaged_site.domain_and_registry;
     }
   }
   return std::string();
 }

 // Scans the top sites list and returns true if it finds a domain with an edit
 // distance or character swap of one to |domain_and_registry|. This search is
 // done in lexicographic order on the top 500 suitable domains, instead of in
 // order by popularity. This means that the resulting "similar" domain may not
 // be the most popular domain that matches.
 bool GetSimilarDomainFromTopBucket(
     const DomainInfo& navigated_domain,
     const LookalikeTargetAllowlistChecker& target_allowlisted,
     std::string* matched_domain,
     LookalikeUrlMatchType* match_type) {
   TopBucketDomainsParams* top_bucket_domain_params = GetTopDomainParams();
   for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
     for (size_t i = 0;
          i < top_bucket_domain_params->num_edit_distance_skeletons; i++) {
       const char* const top_domain_skeleton =
           UNSAFE_TODO(top_bucket_domain_params->edit_distance_skeletons[i]);
       DCHECK(strlen(top_domain_skeleton));
       // Check edit distance on skeletons.
       if (IsEditDistanceAtMostOne(base::UTF8ToUTF16(navigated_skeleton),
                                   base::UTF8ToUTF16(top_domain_skeleton))) {
         const std::string top_domain =
             url_formatter::LookupSkeletonInTopDomains(
                 top_domain_skeleton, url_formatter::SkeletonType::kFull)
                 .domain;
         DCHECK(!top_domain.empty());

         if (!IsLikelyEditDistanceFalsePositive(navigated_domain,
                                                GetDomainInfo(top_domain)) &&
             !target_allowlisted.Run(top_domain)) {
           *matched_domain = top_domain;
           *match_type = LookalikeUrlMatchType::kEditDistance;
           return true;
         }
       }

       // Check character swap on skeletons.
       // TODO(crbug.com/40707797): Also check character swap on actual hostnames
       // with diacritics etc removed. This is because some characters have two
       // character skeletons such as m -> rn, and this prevents us from
       // detecting character swaps between example.com and exapmle.com.
       if (HasOneCharacterSwap(base::UTF8ToUTF16(navigated_skeleton),
                               base::UTF8ToUTF16(top_domain_skeleton))) {
         const std::string top_domain =
             url_formatter::LookupSkeletonInTopDomains(
                 top_domain_skeleton, url_formatter::SkeletonType::kFull)
                 .domain;
         DCHECK(!top_domain.empty());
         if (!IsLikelyCharacterSwapFalsePositive(navigated_domain,
                                                 GetDomainInfo(top_domain)) &&
             !target_allowlisted.Run(top_domain)) {
           *matched_domain = top_domain;
           *match_type = LookalikeUrlMatchType::kCharacterSwapTop500;
           return true;
         }
       }
     }
   }
   return false;
 }

 // Scans the engaged site list for edit distance and character swap matches.
 // Returns true if there is a match and fills |matched_domain| with the first
 // matching engaged domain and |match_type| with the matching heuristic type.
 bool GetSimilarDomainFromEngagedSites(
     const DomainInfo& navigated_domain,
     const std::vector<DomainInfo>& engaged_sites,
     const LookalikeTargetAllowlistChecker& target_allowlisted,
     std::string* matched_domain,
     LookalikeUrlMatchType* match_type) {
   for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
     for (const DomainInfo& engaged_site : engaged_sites) {
       DCHECK_NE(navigated_domain.domain_and_registry,
                 engaged_site.domain_and_registry);

       if (!url_formatter::top_domains::IsEditDistanceCandidate(
               engaged_site.domain_and_registry)) {
         continue;
       }
       // Skip past domains that are allowed to be spoofed.
       if (target_allowlisted.Run(engaged_site.domain_and_registry)) {
         continue;
       }
       for (const std::string& engaged_skeleton : engaged_site.skeletons) {
         // Check edit distance on skeletons.
         if (IsEditDistanceAtMostOne(base::UTF8ToUTF16(navigated_skeleton),
                                     base::UTF8ToUTF16(engaged_skeleton)) &&
             !IsLikelyEditDistanceFalsePositive(navigated_domain,
                                                engaged_site)) {
           *matched_domain = engaged_site.domain_and_registry;
           *match_type = LookalikeUrlMatchType::kEditDistanceSiteEngagement;
           return true;
         }
         // Check character swap on skeletons.
         if (HasOneCharacterSwap(base::UTF8ToUTF16(navigated_skeleton),
                                 base::UTF8ToUTF16(engaged_skeleton)) &&
             !IsLikelyCharacterSwapFalsePositive(navigated_domain,
                                                 engaged_site)) {
           *matched_domain = engaged_site.domain_and_registry;
           *match_type = LookalikeUrlMatchType::kCharacterSwapSiteEngagement;
           return true;
         }
       }
     }
   }

   // Also check character swap on actual hostnames with diacritics etc removed.
   // This is because some characters have two character skeletons such as m ->
   // rn, and this prevents us from detecting character swaps between example.com
   // and exapmle.com.
   const std::u16string navigated_hostname_without_diacritics =
       url_formatter::MaybeRemoveDiacritics(navigated_domain.idn_result.result);
   if (navigated_hostname_without_diacritics !=
       navigated_domain.idn_result.result) {
     for (const DomainInfo& engaged_site : engaged_sites) {
       DCHECK_NE(navigated_domain.domain_and_registry,
                 engaged_site.domain_and_registry);
       const std::u16string engaged_hostname_without_diacritics =
           url_formatter::MaybeRemoveDiacritics(engaged_site.idn_result.result);

       if (HasOneCharacterSwap(navigated_hostname_without_diacritics,
                               engaged_hostname_without_diacritics)) {
         *matched_domain = engaged_site.domain_and_registry;
         *match_type = LookalikeUrlMatchType::kCharacterSwapSiteEngagement;
         return true;
       }
     }
   }
   return false;
 }

 // Translates a lookalike match type into the NavigationSuggestionEvent that
 // represents it. kNone has no corresponding event and yields std::nullopt. The
 // switch deliberately has no default case so that every enumerator is listed
 // explicitly; cases are grouped by heuristic family.
 std::optional<NavigationSuggestionEvent> ToNavigationSuggestionEvent(
     LookalikeUrlMatchType match_type) {
   switch (match_type) {
     // No match: nothing to report.
     case LookalikeUrlMatchType::kNone:
       return std::nullopt;

     // Skeleton matches.
     case LookalikeUrlMatchType::kSkeletonMatchSiteEngagement:
       return NavigationSuggestionEvent::kMatchSiteEngagement;
     case LookalikeUrlMatchType::kSkeletonMatchTop500:
       return NavigationSuggestionEvent::kMatchSkeletonTop500;
     case LookalikeUrlMatchType::kSkeletonMatchTop5k:
       return NavigationSuggestionEvent::kMatchSkeletonTop5k;

     // Edit distance matches.
     case LookalikeUrlMatchType::kEditDistance:
       return NavigationSuggestionEvent::kMatchEditDistance;
     case LookalikeUrlMatchType::kEditDistanceSiteEngagement:
       return NavigationSuggestionEvent::kMatchEditDistanceSiteEngagement;

     // Character swap matches.
     case LookalikeUrlMatchType::kCharacterSwapSiteEngagement:
       return NavigationSuggestionEvent::kMatchCharacterSwapSiteEngagement;
     case LookalikeUrlMatchType::kCharacterSwapTop500:
       return NavigationSuggestionEvent::kMatchCharacterSwapTop500;

     // Target embedding matches.
     case LookalikeUrlMatchType::kTargetEmbedding:
       return NavigationSuggestionEvent::kMatchTargetEmbedding;
     case LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips:
       return NavigationSuggestionEvent::kMatchTargetEmbeddingForSafetyTips;

     // Combo squatting matches.
     case LookalikeUrlMatchType::kComboSquatting:
       return NavigationSuggestionEvent::kComboSquatting;
     case LookalikeUrlMatchType::kComboSquattingSiteEngagement:
       return NavigationSuggestionEvent::kComboSquattingSiteEngagement;

     // Spoof check failures.
     case LookalikeUrlMatchType::kFailedSpoofChecks:
       return NavigationSuggestionEvent::kFailedSpoofChecks;
   }
 }

 // Splits |hostname| on every target embedding separator ("." and "-"),
 // trimming whitespace and dropping empty pieces.
 //
 // The returned std::string_views point into |hostname|, so |hostname| must
 // outlive the return value.
 std::vector<std::string_view> SplitDomainIntoTokens(
     const std::string& hostname) {
   std::vector<std::string_view> tokens = base::SplitStringPiece(
       hostname, kTargetEmbeddingSeparators, base::TRIM_WHITESPACE,
       base::SPLIT_WANT_NONEMPTY);
   return tokens;
 }

 // Returns whether any hostname formed by a suffix of |domain_labels| (with at
 // least two labels) is allowlisted according to |in_target_allowlist|. e.g.
 // for domain_labels = {foo,scholar,google,com} the allowlist is queried, in
 // this order, for google.com, scholar.google.com and foo.scholar.google.com.
 // |domain_labels| must contain more than one label (CHECKed).
 bool ASubdomainIsAllowlisted(
     const base::span<const std::string_view>& domain_labels,
     const LookalikeTargetAllowlistChecker& in_target_allowlist) {
   CHECK_GT(domain_labels.size(), 1u);

   // Start from the last label and grow the hostname leftwards one label at a
   // time, querying the allowlist after each step.
   std::string candidate_host(domain_labels.back());
   size_t labels_left = domain_labels.size() - 1;
   while (labels_left > 0) {
     const std::string_view next_label = domain_labels[labels_left - 1];
     candidate_host = base::StrCat({next_label, ".", candidate_host});
     if (in_target_allowlist.Run(candidate_host)) {
       return true;
     }
     --labels_left;
   }
   return false;
 }

 // Returns the top domain if the top domain without its separators matches the
 // |potential_target| (e.g. googlecom). The matching is a skeleton matching.
 std::string GetMatchingTopDomainWithoutSeparators(
     std::string_view potential_target) {
   const url_formatter::Skeletons skeletons =
       url_formatter::GetSkeletons(base::UTF8ToUTF16(potential_target));

   for (const auto& skeleton : skeletons) {
     url_formatter::TopDomainEntry matched_domain =
         url_formatter::LookupSkeletonInTopDomains(
             skeleton, url_formatter::SkeletonType::kSeparatorsRemoved);
     if (!matched_domain.domain.empty() &&
         matched_domain.skeleton_type ==
             url_formatter::SkeletonType::kSeparatorsRemoved) {
       return matched_domain.domain;
     }
   }
   return std::string();
 }

 // Returns true when |host|'s hostname is exactly its eTLD+1 (e.g.
 // 'google.com') or the eTLD+1 prefixed with "www." (e.g. 'www.google.com').
 bool IsETLDPlusOneOrTrivialSubdomain(const DomainInfo& host) {
   if (host.hostname == host.domain_and_registry) {
     return true;
   }
   return host.hostname == "www." + host.domain_and_registry;
 }

 // Returns if |etld_plus_one| shares the skeleton of an eTLD+1 with an engaged
 // site or a top bucket domain. |embedded_target| is set to matching eTLD+1.
 bool DoesETLDPlus1MatchTopDomainOrEngagedSite(
     const DomainInfo& domain,
     const std::vector<DomainInfo>& engaged_sites,
     std::string* embedded_target) {
   for (const auto& skeleton : domain.skeletons) {
     for (const auto& engaged_site : engaged_sites) {
       // Skeleton matching only calculates skeletons of the eTLD+1, so only
       // consider engaged sites that are bare eTLD+1s (or a trivial subdomain)
       // and are a skeleton match.
       if (IsETLDPlusOneOrTrivialSubdomain(engaged_site) &&
           base::Contains(engaged_site.skeletons, skeleton)) {
         *embedded_target = engaged_site.domain_and_registry;
         return true;
       }
     }
   }
   for (const auto& skeleton : domain.skeletons) {
     const url_formatter::TopDomainEntry top_domain =
         url_formatter::LookupSkeletonInTopDomains(
             skeleton, url_formatter::SkeletonType::kFull);
     if (!top_domain.domain.empty() && top_domain.is_top_bucket) {
       *embedded_target = top_domain.domain;
       return true;
     }
   }
   return false;
 }

 // Returns whether the e2LD of |domain| is a common word (e.g. weather.com,
 // ask.com). Target embeddings of such domains are often false positives (e.g.
 // "super-best-fancy-hotels.com" isn't spoofing "hotels.com").
 //
 // Three word sources are consulted in order: the big built-in common word
 // list, the word list in |config_proto|, and kLocalAdditionalCommonWords.
 // Domains listed in kDomainsPermittedInEndEmbeddings always return false here,
 // because they are handled by a separate, later check.
 bool UsesCommonWord(const reputation::SafetyTipsConfig* config_proto,
                     const DomainInfo& domain) {
   const bool is_permitted_ending = std::any_of(
       std::begin(kDomainsPermittedInEndEmbeddings),
       std::end(kDomainsPermittedInEndEmbeddings),
       [&domain](const char* permitted_ending) {
         return domain.domain_and_registry == permitted_ending;
       });
   if (is_permitted_ending) {
     return false;
   }

   const auto& e2ld = domain.domain_without_registry;

   // Big common word list.
   if (url_formatter::common_words::IsCommonWord(e2ld)) {
     return true;
   }

   // Component-provided word list.
   if (lookalikes::IsCommonWordInConfigProto(config_proto, e2ld)) {
     return true;
   }

   // Local word list.
   return std::any_of(
       std::begin(kLocalAdditionalCommonWords),
       std::end(kLocalAdditionalCommonWords),
       [&e2ld](const char* common_word) { return e2ld == common_word; });
 }

 // Returns whether a hostname formed by a suffix of |domain_labels| (with at
 // least two labels) equals |embedding_domain|.
 // e.g. IsEmbeddingItself(["foo", "example", "com"], "example.com") -> true
 //  since "example.com" is compared first (and "foo.example.com" would be
 //  compared next). |domain_labels| needs at least two labels (DCHECKed).
 bool IsEmbeddingItself(const base::span<const std::string_view>& domain_labels,
                        const std::string& embedding_domain) {
   DCHECK(domain_labels.size() >= 2);

   // Grow the hostname leftwards one label at a time, starting from the last
   // label, and compare after each step.
   std::string candidate_host(domain_labels.back());
   size_t labels_left = domain_labels.size() - 1;
   while (labels_left > 0) {
     const std::string_view next_label = domain_labels[labels_left - 1];
     candidate_host = base::StrCat({next_label, ".", candidate_host});
     if (candidate_host == embedding_domain) {
       return true;
     }
     --labels_left;
   }
   return false;
 }

 // Identical to url_formatter::top_domains::HostnameWithoutRegistry(), but
 // respects de-facto public registries like .com.de using similar logic to
 // GetETLDPlusOne. See kPrivateRegistriesTreatedAsPublic definition for more
 // details. e.g. "google.com.de" returns "google". Call with an eTLD+1, not a
 // full hostname.
 std::string GetE2LDWithDeFactoPublicRegistries(
     const std::string& domain_and_registry) {
   if (domain_and_registry.empty()) {
     return std::string();
   }

   size_t registry_size =
       net::registry_controlled_domains::PermissiveGetHostRegistryLength(
           domain_and_registry.c_str(),
           net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
           net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
   const size_t private_registry_size =
       net::registry_controlled_domains::PermissiveGetHostRegistryLength(
           domain_and_registry.c_str(),
           net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
           net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);

   // If the registry lengths are the same using public and private registries,
   // than this is just a public registry domain. Otherwise, we need to check if
   // the registry ends with one of our anointed registries.
   if (registry_size != private_registry_size) {
     for (const auto* private_registry : kPrivateRegistriesTreatedAsPublic) {
       if (base::EndsWith(domain_and_registry, private_registry)) {
         registry_size = private_registry_size;
       }
     }
   }

   std::string out =
       domain_and_registry.substr(0, domain_and_registry.size() - registry_size);
   base::TrimString(out, ".", &out);
   return out;
 }

 // Returns whether |embedded_target| and |embedding_domain| have the same e2LD
 // and so differ only in their registry (as in, e.g., google.com and
 // google.org, or airbnb.com.br and airbnb.com). |embedding_domain| is assumed
 // to be an eTLD+1. De-facto public eTLDs are respected.
 bool IsCrossTLDMatch(const DomainInfo& embedded_target,
                      const std::string& embedding_domain) {
   const std::string target_e2ld =
       GetE2LDWithDeFactoPublicRegistries(embedded_target.domain_and_registry);
   const std::string embedder_e2ld =
       GetE2LDWithDeFactoPublicRegistries(embedding_domain);
   return target_e2ld == embedder_e2ld;
 }

 // Returns whether |embedded_target| is one of kDomainsPermittedInEndEmbeddings
 // and |embedding_domain| ends with "-" followed by that domain, e.g.
 // "evil-office.com" is permitted, as "office.com" is in
 // kDomainsPermittedInEndEmbeddings. Only impacts Target Embedding matches.
 bool EndsWithPermittedDomains(const DomainInfo& embedded_target,
                               const std::string& embedding_domain) {
   return std::any_of(
       std::begin(kDomainsPermittedInEndEmbeddings),
       std::end(kDomainsPermittedInEndEmbeddings),
       [&embedded_target, &embedding_domain](const char* permitted_ending) {
         if (embedded_target.domain_and_registry != permitted_ending) {
           return false;
         }
         const std::string dashed_ending =
             base::StrCat({"-", permitted_ending});
         return base::EndsWith(embedding_domain, dashed_ending);
       });
 }

 // Decides whether embedding |embedded_target| should NOT be flagged. A domain
 // is allowed to be embedded if any of the following holds (checked in this
 // order, stopping at the first that does):
 //  - its e2LD is a common word,
 //  - any valid partial subdomain in |subdomain_span| is allowlisted,
 //  - it is embedding itself,
 //  - it is a cross-TLD match (e.g. google.com vs google.com.mx),
 //  - the embedder ends in one of the permitted "-DOMAIN.TLD" endings.
 bool IsAllowedToBeEmbedded(
     const DomainInfo& embedded_target,
     const base::span<const std::string_view>& subdomain_span,
     const LookalikeTargetAllowlistChecker& in_target_allowlist,
     const std::string& embedding_domain,
     const reputation::SafetyTipsConfig* config_proto) {
   if (UsesCommonWord(config_proto, embedded_target)) {
     return true;
   }
   if (ASubdomainIsAllowlisted(subdomain_span, in_target_allowlist)) {
     return true;
   }
   if (IsEmbeddingItself(subdomain_span, embedding_domain)) {
     return true;
   }
   if (IsCrossTLDMatch(embedded_target, embedding_domain)) {
     return true;
   }
   return EndsWithPermittedDomains(embedded_target, embedding_domain);
 }

 // Returns the character of |str1| at the first position where |str1| and
 // |str2| differ. The strings are expected to differ somewhere within their
 // common length; reaching the end of either string without finding a
 // difference hits NOTREACHED().
 char GetFirstDifferentChar(const std::string& str1, const std::string& str2) {
   const auto first_difference =
       std::mismatch(str1.begin(), str1.end(), str2.begin(), str2.end());
   const bool found_difference = first_difference.first != str1.end() &&
                                 first_difference.second != str2.end();
   if (found_difference) {
     return *first_difference.first;
   }
   NOTREACHED();
 }

 // Returns whether |brand| is long enough to be checked for Combo Squatting.
 // Brand names of kMinBrandNameLengthForComboSquatting characters or fewer are
 // rejected, since short brand names can cause false positives in results.
 bool IsComboSquattingCandidate(const std::string& brand) {
   const size_t brand_length = brand.size();
   return kMinBrandNameLengthForComboSquatting < brand_length;
 }

 // Builds the (brand name, brand skeleton) list used for Combo Squatting checks
 // against engaged sites. For every engaged site whose e2LD is a Combo
 // Squatting candidate, one pair is emitted per skeleton of that e2LD, with the
 // e2LD itself as the brand name. Sites are visited in |engaged_sites| order.
 std::vector<std::pair<std::string, std::string>> GetBrandNamesFromEngagedSites(
     const std::vector<DomainInfo>& engaged_sites) {
   std::vector<std::pair<std::string, std::string>> brands;

   for (const DomainInfo& site : engaged_sites) {
     // The candidate test depends only on the site, not on the skeleton.
     if (!IsComboSquattingCandidate(site.domain_without_registry)) {
       continue;
     }
     for (const std::string& brand_skeleton :
          site.domain_without_registry_skeletons) {
       brands.emplace_back(site.domain_without_registry, brand_skeleton);
     }
   }
   return brands;
 }

 // Returns the registry part of |navigated_domain|'s eTLD+1, i.e. everything
 // after "<domain_without_registry>.". The registry is needed to find
 // matched_domain in Combo Squatting domains. For example, registry of
 // `google-login[.]co[.]br` is `co[.]br`.
 //
 // Returns an empty string when domain_and_registry is too short to hold
 // "<domain_without_registry>." plus a registry (e.g. both fields are empty),
 // instead of underflowing the size computation.
 std::string GetRegistry(const DomainInfo& navigated_domain) {
   const std::string& domain_and_registry =
       navigated_domain.domain_and_registry;
   // Offset of the first registry character: skip the e2LD and the dot that
   // follows it.
   const size_t registry_offset =
       navigated_domain.domain_without_registry.size() + 1;
   if (domain_and_registry.size() <= registry_offset) {
     return std::string();
   }
   // substr(pos) takes everything from |pos| to the end of the string.
   return domain_and_registry.substr(registry_offset);
 }

 // If a matched domain including the brand name and TLD of
 // navigated domain is found in top domains, |matched_domain|
 // is set to the found top domain. Otherwise, |matched_domain| will
 // be set to brand_name[.]com. Hard coded brand names should be checked to have
 // valid brand_name[.]com url.
 std::string FindMatchedDomainForHardCodedComboSquatting(
     const std::string& brand_name,
     const DomainInfo& navigated_domain) {
   DomainInfo suggested_matched_domain =
       GetDomainInfo(brand_name + '.' + GetRegistry(navigated_domain));
   if (url_formatter::IsDomainAndRegistryATopDomain(
           suggested_matched_domain.domain_and_registry)) {
     return suggested_matched_domain.hostname;
   } else {
     return brand_name + ".com";
   }
 }

 // Engaged sites are sorted based on engagement score, so |matched_domain|
 // will be set to the first domain in the engaged sites lists that includes
 // the brand name of the navigated domain.
 std::string FindMatchedDomainForSiteEngagementComboSquatting(
     const std::string& brand_name,
     const DomainInfo& navigated_domain,
     const std::vector<DomainInfo>& engaged_sites) {
   for (auto& engaged_site : engaged_sites) {
     if (brand_name == engaged_site.domain_without_registry) {
       return engaged_site.hostname;
     }
   }
   return std::string();
 }

 // Returns true if |navigated_domain| is flagged as Combo Squatting, i.e. one
 // of its e2LD skeletons contains both a brand skeleton from |brand_names| and
 // a keyword from |combo_squatting_params|, without the two overlapping and
 // without one being a substring of the other. A skeleton that has the same
 // length as the brand skeleton is never flagged.
 //
 // On success |matched_domain| receives the domain to show to the user instead
 // of the navigated domain in the warning UI: for hard-coded brands
 // (|is_hard_coded|) it is derived from the brand name, otherwise it is looked
 // up in |engaged_sites|. Each entry of |brand_names| is {brand name, brand
 // skeleton}.
 bool IsComboSquatting(
     const std::vector<std::pair<std::string, std::string>>& brand_names,
     const ComboSquattingParams& combo_squatting_params,
     const DomainInfo& navigated_domain,
     const std::vector<DomainInfo>& engaged_sites,
     std::string* matched_domain,
     bool is_hard_coded) {
   for (const auto& brand_entry : brand_names) {
     const std::string& name = brand_entry.first;
     const std::string& brand_skel = brand_entry.second;
     DCHECK(IsComboSquattingCandidate(name));

     for (const auto& domain_skel :
          navigated_domain.domain_without_registry_skeletons) {
       const size_t brand_begin = domain_skel.find(brand_skel);
       const bool brand_missing = brand_begin == std::string::npos;
       const bool same_length = domain_skel.size() == brand_skel.size();
       if (brand_missing || same_length) {
         continue;
       }
       const size_t brand_end = brand_begin + brand_skel.size();

       for (auto keyword : combo_squatting_params.popular_keywords) {
         const size_t keyword_begin = domain_skel.find(keyword);
         if (keyword_begin == std::string::npos) {
           // Keyword not found, ignore.
           continue;
         }

         const bool keyword_inside_brand =
             std::string(brand_skel).find(keyword) != std::string::npos;
         const bool brand_inside_keyword =
             std::string(keyword).find(brand_skel) != std::string::npos;
         if (keyword_inside_brand || brand_inside_keyword) {
           // Keyword is a substring of brand or vice versa, ignore.
           continue;
         }

         const size_t keyword_end = keyword_begin + keyword.size();
         const bool keyword_starts_inside_brand =
             brand_begin < keyword_begin && keyword_begin < brand_end;
         const bool brand_starts_inside_keyword =
             keyword_begin < brand_begin && brand_begin < keyword_end;
         if (keyword_starts_inside_brand || brand_starts_inside_keyword) {
           // Keyword and brand overlap, ignore.
           continue;
         }

         *matched_domain =
             is_hard_coded
                 ? FindMatchedDomainForHardCodedComboSquatting(name,
                                                               navigated_domain)
                 : FindMatchedDomainForSiteEngagementComboSquatting(
                       name, navigated_domain, engaged_sites);
         return true;
       }
     }
   }
   return false;
 }

 // Substrings that make a hostname unsafe because some fonts render them as
 // ligatures. Longer unsafe names that contain one of these entries are not
 // listed separately: a substring search for the shorter entry already catches
 // them.
 constexpr const char* kUnsafeLigatures[] = {
     "g_logo",  // Also covers super_g_logo.
     "o_logo",
     "l_logo",
     "e_logo",  // Also covers google_logo.
     "google_g",
     "glogoligature",
     "ologoligature",
     "llogoligature",
     "elogoligature",  // Also covers googlelogoligature.
 };

 bool IsUnsafeLigature(const DomainInfo& domain) {
   for (const char* unsafe_ligature : kUnsafeLigatures) {
     if (domain.hostname.find(unsafe_ligature) != std::string::npos) {
       return true;
     }
   }
   return false;
 }

 }  // namespace

 namespace lookalikes {

 // Names of the UMA histograms that RecordUMAFromMatchType() records
 // NavigationSuggestionEvent values to: the first is used when is_incognito is
 // false, the second when it is true.
 const char kInterstitialHistogramName[] = "NavigationSuggestion.Event2";
 const char kIncognitoInterstitialHistogramName[] =
     "NavigationSuggestion.Event2.Incognito";

 // Registers the list pref prefs::kLookalikeWarningAllowlistDomains with
 // |registry|. IsAllowedByEnterprisePolicy() reads this pref.
 void RegisterProfilePrefs(user_prefs::PrefRegistrySyncable* registry) {
   registry->RegisterListPref(prefs::kLookalikeWarningAllowlistDomains);
 }

 // Builds the console message shown for |lookalike_url|. When
 // |is_new_heuristic| is true, an extra sentence announces that future Chrome
 // versions will warn on this domain name.
 std::string GetConsoleMessage(const GURL& lookalike_url,
                               bool is_new_heuristic) {
   // Stays empty unless the heuristic is new.
   std::string_view new_heuristic_note;
   if (is_new_heuristic) {
     new_heuristic_note =
         "Future Chrome versions will show a warning on this domain name.\n";
   }
   return base::StrCat({"Chrome has determined that ", lookalike_url.host(),
                        " could be fake or fraudulent.\n\n", new_heuristic_note,
                        "If you believe this is shown in error please visit "
                        "https://g.co/chrome/lookalike-warnings"});
 }

 // Builds a DomainInfo by copying each argument into the member of the
 // corresponding name (arg_hostname -> hostname, and so on).
 DomainInfo::DomainInfo(
     const std::string& arg_hostname,
     const std::string& arg_domain_and_registry,
     const std::string& arg_domain_without_registry,
     const url_formatter::IDNConversionResult& arg_idn_result,
     const url_formatter::Skeletons& arg_skeletons,
     const url_formatter::Skeletons& arg_domain_without_registry_skeletons)
     : hostname(arg_hostname),
       domain_and_registry(arg_domain_and_registry),
       domain_without_registry(arg_domain_without_registry),
       idn_result(arg_idn_result),
       skeletons(arg_skeletons),
       domain_without_registry_skeletons(arg_domain_without_registry_skeletons) {
 }

 // Destruction and copy construction use the compiler-generated versions.
 DomainInfo::~DomainInfo() = default;

 DomainInfo::DomainInfo(const DomainInfo&) = default;

 // Computes the DomainInfo (eTLD+1, e2LD, IDN conversion result and skeletons)
 // for |hostname|. Localhost and non-unique hostnames yield an entirely empty
 // DomainInfo; hostnames without an eTLD+1 yield one that only carries the
 // hostname.
 DomainInfo GetDomainInfo(const std::string& hostname) {
   TRACE_EVENT0("navigation", "GetDomainInfo");
   const bool is_local_or_non_unique = net::HostStringIsLocalhost(hostname) ||
                                       net::IsHostnameNonUnique(hostname);
   if (is_local_or_non_unique) {
     return DomainInfo(std::string(), std::string(), std::string(),
                       url_formatter::IDNConversionResult(),
                       url_formatter::Skeletons(), url_formatter::Skeletons());
   }

   const std::string etld_plus_one = GetETLDPlusOne(hostname);
   // eTLD+1 can be empty for private domains. There is then no registry to
   // strip and nothing to compute skeletons from.
   if (etld_plus_one.empty()) {
     return DomainInfo(hostname, std::string(), std::string(),
                       url_formatter::IDNConversionResult(),
                       url_formatter::Skeletons(), url_formatter::Skeletons());
   }

   const std::string e2ld =
       url_formatter::top_domains::HostnameWithoutRegistry(etld_plus_one);

   // Skeletons are computed from the eTLD+1 with all spoofing checks skipped.
   // The spoofing checks in url_formatter can turn the converted result into
   // punycode, which would not give an accurate skeleton for the unicode
   // version of the domain.
   const url_formatter::IDNConversionResult idn_result =
       url_formatter::UnsafeIDNToUnicodeWithDetails(etld_plus_one);
   const url_formatter::Skeletons skeletons =
       url_formatter::GetSkeletons(idn_result.result);

   // Same computation, restricted to the e2LD.
   const url_formatter::IDNConversionResult e2ld_idn_result =
       url_formatter::UnsafeIDNToUnicodeWithDetails(e2ld);
   const url_formatter::Skeletons e2ld_skeletons =
       url_formatter::GetSkeletons(e2ld_idn_result.result);

   return DomainInfo(hostname, etld_plus_one, e2ld, idn_result, skeletons,
                     e2ld_skeletons);
 }

 // Convenience overload: computes the DomainInfo for the host of |url| by
 // forwarding to the hostname-based overload.
 DomainInfo GetDomainInfo(const GURL& url) {
   return GetDomainInfo(url.GetHost());
 }

 std::string GetETLDPlusOne(const std::string& hostname) {
   auto pub = net::registry_controlled_domains::GetDomainAndRegistry(
       hostname, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
   auto priv = net::registry_controlled_domains::GetDomainAndRegistry(
       hostname, net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
   // If there is no difference in eTLD+1 with/without private registries, then
   // the domain uses a public registry and we can return the eTLD+1 safely.
   if (pub == priv) {
     return pub;
   }
   // Otherwise, the domain uses a private registry and |pub| is that private
   // registry. If it's a de-facto-public registry, return the private eTLD+1.
   for (auto* private_registry : kPrivateRegistriesTreatedAsPublic) {
     if (private_registry == pub) {
       return priv;
     }
   }
   // Otherwise, ignore the normal private registry and return the public eTLD+1.
   return pub;
 }

 // Returns true if |str1| can be turned into |str2| with at most one insertion,
 // deletion or substitution of a single character. Equal strings count as a
 // match.
 bool IsEditDistanceAtMostOne(const std::u16string& str1,
                              const std::u16string& str2) {
   const size_t len1 = str1.size();
   const size_t len2 = str2.size();
   // A length difference of two or more needs at least two edits.
   if (len1 > len2 + 1 || len2 > len1 + 1) {
     return false;
   }

   // Skip the common prefix.
   const auto [diff1, diff2] =
       std::mismatch(str1.begin(), str1.end(), str2.begin(), str2.end());
   if (diff1 == str1.end() || diff2 == str2.end()) {
     // One string is a prefix of the other. Given the length check above, at
     // most one trailing character is left over.
     return true;
   }

   // Spend the single allowed edit on the first difference: step over the
   // extra character of the longer string, or over the substituted character
   // in both strings when the lengths are equal.
   auto rest1 = diff1;
   auto rest2 = diff2;
   if (len1 >= len2) {
     ++rest1;
   }
   if (len2 >= len1) {
     ++rest2;
   }

   // No edits are left, so the remainders must be identical.
   return std::equal(rest1, str1.end(), rest2, str2.end());
 }

 // Returns true if an edit distance match between |navigated_domain| and
 // |matched_domain| is unlikely to be a spoofing attempt and should be ignored.
 bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain,
                                        const DomainInfo& matched_domain) {
   DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
       matched_domain.domain_and_registry));
   DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
       navigated_domain.domain_and_registry));

   const auto& nav_e2ld = navigated_domain.domain_without_registry;
   const auto& matched_e2ld = matched_domain.domain_without_registry;

   // Case 1: only the registry differs, e.g. google.com.tw vs google.com.tr.
   if (nav_e2ld == matched_e2ld) {
     return true;
   }

   // Case 2: the e2LDs only differ by a numeric suffix, e.g. site45.tld vs
   // site35.tld.
   const auto nav_trimmed =
       base::TrimString(nav_e2ld, kDigitChars, base::TRIM_TRAILING);
   const auto matched_trimmed =
       base::TrimString(matched_e2ld, kDigitChars, base::TRIM_TRAILING);
   DCHECK_NE(navigated_domain.domain_without_registry,
             matched_domain.domain_without_registry);
   // The untrimmed e2LDs are known to differ here, so equality after trimming
   // can only come from the trailing digits.
   if (nav_trimmed == matched_trimmed) {
     return true;
   }

   // Case 3: the domains only differ by an insertion or substitution at the
   // very start. These are usually different words rather than lookalikes
   // (hank vs tank, oodle vs poodle). Drop the first character from the longer
   // domain, or from both when they are equally long, and compare what is left.
   const auto& nav_dom = navigated_domain.domain_and_registry;
   const auto& matched_dom = matched_domain.domain_and_registry;
   const auto nav_dom_len = nav_dom.length();
   const auto matched_dom_len = matched_dom.length();
   const std::string_view nav_view(nav_dom);
   const std::string_view matched_view(matched_dom);
   const std::string_view nav_rest =
       nav_dom_len >= matched_dom_len ? nav_view.substr(1) : nav_view;
   const std::string_view matched_rest =
       matched_dom_len >= nav_dom_len ? matched_view.substr(1) : matched_view;
   if (nav_rest == matched_rest) {
     return true;
   }

   // Case 4: the longer domain is the shorter one with a "-" inserted.
   if (nav_dom_len < matched_dom_len) {
     return GetFirstDifferentChar(matched_dom, nav_dom) == '-';
   }
   if (nav_dom_len > matched_dom_len) {
     return GetFirstDifferentChar(nav_dom, matched_dom) == '-';
   }
   return false;
 }

 // Returns true if a character swap match between |navigated_domain| and
 // |matched_domain| should be ignored.
 bool IsLikelyCharacterSwapFalsePositive(const DomainInfo& navigated_domain,
                                         const DomainInfo& matched_domain) {
   DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
       matched_domain.domain_and_registry));
   DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
       navigated_domain.domain_and_registry));
   // When the e2LDs are identical the swap can only be in the registry part
   // (e.g. google.sr vs google.rs), which is unlikely to be a spoofing attempt.
   const auto& nav_e2ld = navigated_domain.domain_without_registry;
   const auto& matched_e2ld = matched_domain.domain_without_registry;
   const bool only_registry_differs = nav_e2ld == matched_e2ld;
   return only_registry_differs;
 }

 // Runs the lookalike heuristics on |navigated_domain| in a fixed order:
 // skeleton matching (IDNs only), edit distance / character swap, target
 // embedding, and finally combo squatting. On the first hit, stores the matched
 // domain in |matched_domain| and the kind of match in |match_type| and returns
 // true. Returns false when no heuristic matches.
 bool GetMatchingDomain(
     const DomainInfo& navigated_domain,
     const std::vector<DomainInfo>& engaged_sites,
     const LookalikeTargetAllowlistChecker& in_target_allowlist,
     const reputation::SafetyTipsConfig* config_proto,
     std::string* matched_domain,
     LookalikeUrlMatchType* match_type) {
   DCHECK(!navigated_domain.domain_and_registry.empty());
   DCHECK(matched_domain);
   DCHECK(match_type);

   const auto& nav_etld_plus_one = navigated_domain.domain_and_registry;
   const auto& idn_result = navigated_domain.idn_result;

   if (idn_result.has_idn_component) {
     // The navigated domain is an IDN: compare its skeleton with engaged sites
     // first, then with top domains.
     const std::string engaged_match =
         GetMatchingSiteEngagementDomain(engaged_sites, navigated_domain);
     DCHECK_NE(nav_etld_plus_one, engaged_match);
     if (!engaged_match.empty()) {
       *matched_domain = engaged_match;
       *match_type = LookalikeUrlMatchType::kSkeletonMatchSiteEngagement;
       return true;
     }

     const auto& top_domain_match = idn_result.matching_top_domain;
     if (!top_domain_match.domain.empty()) {
       // The top domain list holds no IDNs, so in practice an IDN can't also
       // be a top domain. The check below is a sanity check in case the list
       // changes in the future: at this point the navigated domain should not
       // be a top domain itself.
       DCHECK_NE(nav_etld_plus_one, top_domain_match.domain);
       *matched_domain = top_domain_match.domain;
       if (top_domain_match.is_top_bucket) {
         *match_type = LookalikeUrlMatchType::kSkeletonMatchTop500;
       } else {
         *match_type = LookalikeUrlMatchType::kSkeletonMatchTop5k;
       }
       return true;
     }
   }

   if (url_formatter::top_domains::IsEditDistanceCandidate(
           nav_etld_plus_one)) {
     // No exact skeleton match: look for an engaged domain within an edit
     // distance of one or a single character swap.
     if (GetSimilarDomainFromEngagedSites(navigated_domain, engaged_sites,
                                          in_target_allowlist, matched_domain,
                                          match_type)) {
       DCHECK_NE(nav_etld_plus_one, *matched_domain);
       return true;
     }

     // Then look for a top domain within an edit distance or character swap of
     // one.
     if (GetSimilarDomainFromTopBucket(navigated_domain, in_target_allowlist,
                                       matched_domain, match_type)) {
       DCHECK_NE(nav_etld_plus_one, *matched_domain);
       DCHECK(!matched_domain->empty());
       return true;
     }
   }

   const TargetEmbeddingType embedding_type =
       GetTargetEmbeddingType(navigated_domain.hostname, engaged_sites,
                              in_target_allowlist, config_proto, matched_domain);
   if (embedding_type == TargetEmbeddingType::kSafetyTip) {
     *match_type = LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips;
     return true;
   }
   if (embedding_type == TargetEmbeddingType::kInterstitial) {
     *match_type = LookalikeUrlMatchType::kTargetEmbedding;
     return true;
   }

   // Last resort once every other heuristic has failed: Combo Squatting.
   const ComboSquattingType combo_squatting_type =
       GetComboSquattingType(navigated_domain, engaged_sites, matched_domain);
   if (combo_squatting_type == ComboSquattingType::kHardCoded) {
     *match_type = LookalikeUrlMatchType::kComboSquatting;
     DCHECK(!matched_domain->empty());
     return true;
   }
   if (combo_squatting_type == ComboSquattingType::kSiteEngagement) {
     *match_type = LookalikeUrlMatchType::kComboSquattingSiteEngagement;
     DCHECK(!matched_domain->empty());
     return true;
   }

   DCHECK(embedding_type == TargetEmbeddingType::kNone);
   DCHECK(combo_squatting_type == ComboSquattingType::kNone);
   return false;
 }

 // Records the NavigationSuggestionEvent corresponding to |match_type| to the
 // incognito or regular interstitial histogram, depending on |is_incognito|.
 // Nothing is recorded when |match_type| has no corresponding event.
 void RecordUMAFromMatchType(LookalikeUrlMatchType match_type,
                             bool is_incognito) {
   std::optional<NavigationSuggestionEvent> suggestion_event =
       ToNavigationSuggestionEvent(match_type);
   if (!suggestion_event) {
     return;
   }
   if (is_incognito) {
     UMA_HISTOGRAM_ENUMERATION(lookalikes::kIncognitoInterstitialHistogramName,
                               *suggestion_event);
     return;
   }
   UMA_HISTOGRAM_ENUMERATION(lookalikes::kInterstitialHistogramName,
                             *suggestion_event);
 }

 // Returns the strongest target embedding found in |hostname| and stores the
 // embedded target in |safe_hostname|.
 TargetEmbeddingType GetTargetEmbeddingType(
     const std::string& hostname,
     const std::vector<DomainInfo>& engaged_sites,
     const LookalikeTargetAllowlistChecker& in_target_allowlist,
     const reputation::SafetyTipsConfig* config_proto,
     std::string* safe_hostname) {
   // SearchForEmbeddings() sweeps the hostname from back to front, so a tail
   // embedding (which only warrants a safety tip) is always found before any
   // other embedding. Interstitials matter more than safety tips, so a safety
   // tip result triggers a second search below with safety tips disallowed.
   const TargetEmbeddingType first_pass = SearchForEmbeddings(
       hostname, engaged_sites, in_target_allowlist, config_proto,
       /*safety_tips_allowed=*/true, safe_hostname);
   if (first_pass != TargetEmbeddingType::kSafetyTip) {
     return first_pass;
   }

   // Second pass: look for an embedding that warrants an interstitial. It
   // writes to a scratch string so that the safety tip target survives when
   // nothing stronger is found.
   std::string second_pass_hostname;
   const TargetEmbeddingType second_pass = SearchForEmbeddings(
       hostname, engaged_sites, in_target_allowlist, config_proto,
       /*safety_tips_allowed=*/false, &second_pass_hostname);
   if (second_pass == TargetEmbeddingType::kNone) {
     return first_pass;
   }
   *safe_hostname = second_pass_hostname;
   return second_pass;
 }

 // Scans |hostname| for an embedded target domain and returns the kind of
 // warning the first qualifying embedding calls for, or
 // TargetEmbeddingType::kNone when there is none.
 //
 // For every candidate end position, three kinds of embedding are checked in
 // this order:
 //  1. The last token is a top domain written without separators (e.g.
 //     "googlecom"). This always yields kInterstitial.
 //  2. A run of tokens ending there equals the hostname of an engaged site.
 //  3. The eTLD+1 ending there matches a top domain or an engaged site.
 // For 2 and 3, an embedding that ends at the last token of |hostname| (tail
 // embedding) yields kSafetyTip and is skipped when |safety_tips_allowed| is
 // false; any other embedding yields kInterstitial.
 //
 // |safe_hostname| is set to the matched target. NOTE: it can also be written
 // for a tail embedding that is then skipped, so callers should only rely on
 // it when the result is not kNone.
 TargetEmbeddingType SearchForEmbeddings(
     const std::string& hostname,
     const std::vector<DomainInfo>& engaged_sites,
     const LookalikeTargetAllowlistChecker& in_target_allowlist,
     const reputation::SafetyTipsConfig* config_proto,
     bool safety_tips_allowed,
     std::string* safe_hostname) {
   const std::string embedding_domain = GetETLDPlusOne(hostname);
   const std::vector<std::string_view> hostname_tokens =
       SplitDomainIntoTokens(hostname);
   // Bounds-checked view over all tokens. Sub-ranges are taken from it with
   // first()/subspan() instead of building spans from a raw pointer and length.
   const base::span<const std::string_view> all_tokens(hostname_tokens);

   // There are O(n^2) potential target embeddings in a domain name. We want to
   // be comprehensive, but optimize so that usually we needn't check all of
   // them. We do that by sweeping from the back of the embedding domain, towards
   // the front, checking for a valid eTLD. If we find one, then we consider the
   // possible embedded domains that end in that eTLD (i.e. all possible start
   // points from the beginning of the string onward).
   for (size_t end = hostname_tokens.size(); end > 0; --end) {
     // Tokens [0, end): the part of the hostname that ends at this position.
     const base::span<const std::string_view> etld_check_span =
         all_tokens.first(end);
     std::string etld_check_host = base::JoinString(etld_check_span, ".");
     auto etld_check_dominfo = GetDomainInfo(etld_check_host);

     // Check if the final token is a no-separator target (e.g. "googlecom").
     // This check happens first so that we can exclude invalid eTLD+1s next.
     std::string embedded_target =
         GetMatchingTopDomainWithoutSeparators(hostname_tokens[end - 1]);
     if (!embedded_target.empty()) {
       // Extract the full possibly-spoofed domain. To get this, we take the
       // hostname up until this point, strip off the no-separator bit (e.g.
       // googlecom) and then re-add the separated version (e.g. google.com).
       auto spoofed_domain =
           etld_check_host.substr(
               0, etld_check_host.length() - hostname_tokens[end - 1].length()) +
           embedded_target;
       const auto no_separator_tokens = base::SplitStringPiece(
           spoofed_domain, kTargetEmbeddingSeparators, base::TRIM_WHITESPACE,
           base::SPLIT_WANT_NONEMPTY);
       auto no_separator_dominfo = GetDomainInfo(embedded_target);

       // Only flag on domains that are long enough, don't use common words, and
       // aren't target-allowlisted.
       if (no_separator_dominfo.domain_without_registry.length() >
               kMinE2LDLengthForTargetEmbedding &&
           !IsAllowedToBeEmbedded(no_separator_dominfo, no_separator_tokens,
                                  in_target_allowlist, embedding_domain,
                                  config_proto)) {
         *safe_hostname = embedded_target;
         return TargetEmbeddingType::kInterstitial;
       }
     }

     // Exclude otherwise-invalid eTLDs.
     if (etld_check_dominfo.domain_without_registry.empty()) {
       continue;
     }

     // Exclude e2LDs that are too short. <= because domain_without_registry has
     // a trailing ".".
     if (etld_check_dominfo.domain_without_registry.length() <=
         kMinE2LDLengthForTargetEmbedding) {
       continue;
     }

     // Check for exact matches against engaged sites, among all possible
     // subdomains ending at |end|.
     for (size_t start = 0; start < end - 1; ++start) {
       // Tokens [start, end).
       const base::span<const std::string_view> embedded_span =
           all_tokens.subspan(start, end - start);
       auto embedded_hostname = base::JoinString(embedded_span, ".");
       auto embedded_dominfo = GetDomainInfo(embedded_hostname);

       for (auto& engaged_site : engaged_sites) {
         if (engaged_site.hostname == embedded_dominfo.hostname &&
             !IsAllowedToBeEmbedded(embedded_dominfo, embedded_span,
                                    in_target_allowlist, embedding_domain,
                                    config_proto)) {
           *safe_hostname = engaged_site.hostname;
           // Tail-embedding (e.g. evil-google.com, where the embedding happens
           // at the very end of the hostname) is a safety tip, but only when
           // safety tips are allowed. If it's tail embedding but we can't create
           // a safety tip, keep looking.  Non-tail-embeddings are interstitials.
           if (end != hostname_tokens.size()) {
             return TargetEmbeddingType::kInterstitial;
           } else if (safety_tips_allowed) {
             return TargetEmbeddingType::kSafetyTip;
           }  // else keep searching.
         }
       }
     }

     // There were no exact engaged site matches, but there may yet still be a
     // match against the eTLD+1 of an engaged or top site.
     if (DoesETLDPlus1MatchTopDomainOrEngagedSite(
             etld_check_dominfo, engaged_sites, safe_hostname) &&
         !IsAllowedToBeEmbedded(etld_check_dominfo, etld_check_span,
                                in_target_allowlist, embedding_domain,
                                config_proto)) {
       // Tail-embedding (e.g. evil-google.com, where the embedding happens at
       // the very end of the hostname) is a safety tip, but only when safety
       // tips are allowed. If it's tail embedding but we can't create a safety
       // tip, keep looking.  Non-tail-embeddings are interstitials.
       if (end != hostname_tokens.size()) {
         return TargetEmbeddingType::kInterstitial;
       } else if (safety_tips_allowed) {
         return TargetEmbeddingType::kSafetyTip;
       }  // else keep searching.
     }
   }
   return TargetEmbeddingType::kNone;
 }

 // Returns true if |codepoint| has no bits set outside the low seven, i.e. it
 // lies in the ASCII range 0x00-0x7F.
 bool IsASCII(UChar32 codepoint) {
   return (codepoint & ~0x7F) == 0;
 }

 // Returns true if |codepoint| carries at least one emoji related Unicode
 // binary property.
 bool IsEmojiRelatedCodepoint(UChar32 codepoint) {
   if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI)) {
     return true;
   }
   // Characters that have emoji presentation by default (e.g. hourglass).
   if (u_hasBinaryProperty(codepoint, UCHAR_EMOJI_PRESENTATION)) {
     return true;
   }
   // Regional indicators are displayed as country flags when used as a valid
   // pair. E.g. Regional Indicator Symbol Letter B used once in a string is
   // rendered as 🇧, used twice is rendered as the flag of Barbados (with
   // country code BB). A spoof using regional indicator characters as text is
   // therefore possible, but such domain names will be readily punycoded and
   // detecting pairs isn't easy, so the check is kept simple.
   if (u_hasBinaryProperty(codepoint, UCHAR_REGIONAL_INDICATOR)) {
     return true;
   }
   // Pictographs such as Black Cross On Shield (U+26E8).
   return u_hasBinaryProperty(codepoint, UCHAR_EXTENDED_PICTOGRAPHIC);
 }

 // Returns true if |text| contains only ASCII characters, pictographs
 // or emojis. This check is only used to determine if a domain that already
 // failed spoof checks should be blocked by an interstitial. Ideally, we would
 // check this for non-ASCII scripts as well (e.g. Cyrillic + emoji), but such
 // usage isn't common.
 bool IsASCIIAndEmojiOnly(std::u16string_view text) {
   for (base::i18n::UTF16CharIterator iter(text); !iter.end(); iter.Advance()) {
     const UChar32 codepoint = iter.get();
     if (!IsASCII(codepoint) && !IsEmojiRelatedCodepoint(codepoint)) {
       return false;
     }
   }
   return true;
 }

 // Returns true if the e2LD of domain is long enough to display a punycode
 // interstitial.
 bool IsPunycodeInterstitialCandidate(const DomainInfo& domain) {
   const url_formatter::IDNConversionResult idn_result =
       url_formatter::UnsafeIDNToUnicodeWithDetails(
           domain.domain_without_registry);
   return idn_result.result.size() >=
          kMinimumE2LDLengthToShowPunycodeInterstitial;
 }

 bool ShouldBlockBySpoofCheckResult(const DomainInfo& navigated_domain) {
   if (IsUnsafeLigature(navigated_domain)) {
     return true;
   }

   // Here, only a subset of spoof checks that cause an IDN to fallback to
   // punycode are configured to show an interstitial.
   switch (navigated_domain.idn_result.spoof_check_result) {
     case url_formatter::IDNSpoofCheckerResult::kNone:
     case url_formatter::IDNSpoofCheckerResult::kSafe:
       return false;

     case url_formatter::IDNSpoofCheckerResult::kICUSpoofChecks:
       // If the eTLD+1 contains only a mix of ASCII + Emoji, allow.
       return !IsASCIIAndEmojiOnly(navigated_domain.idn_result.result) &&
              IsPunycodeInterstitialCandidate(navigated_domain);

     case url_formatter::IDNSpoofCheckerResult::kDeviationCharacters:
       // Failures because of deviation characters, especially ß, is common.
       return false;

     case url_formatter::IDNSpoofCheckerResult::kTLDSpecificCharacters:
     case url_formatter::IDNSpoofCheckerResult::kUnsafeMiddleDot:
     case url_formatter::IDNSpoofCheckerResult::kWholeScriptConfusable:
     case url_formatter::IDNSpoofCheckerResult::kDigitLookalikes:
     case url_formatter::IDNSpoofCheckerResult::
         kNonAsciiLatinCharMixedWithNonLatin:
     case url_formatter::IDNSpoofCheckerResult::kDangerousPattern:
       return IsPunycodeInterstitialCandidate(navigated_domain);
   }
 }

 bool IsAllowedByEnterprisePolicy(const PrefService* pref_service,
                                  const GURL& url) {
   const base::Value::List& list =
       pref_service->GetList(prefs::kLookalikeWarningAllowlistDomains);

   for (const auto& domain_val : list) {
     const std::string& domain = domain_val.GetString();
     if (url.DomainIs(domain)) {
       return true;
     }
   }
   return false;
 }

 // Test helper: replaces the prefs::kLookalikeWarningAllowlistDomains pref of
 // |pref_service| with |hosts|, in the same order.
 void SetEnterpriseAllowlistForTesting(PrefService* pref_service,
                                       const std::vector<std::string>& hosts) {
   base::Value::List allowlist;
   for (const std::string& allowed_host : hosts) {
     allowlist.Append(allowed_host);
   }
   pref_service->SetList(prefs::kLookalikeWarningAllowlistDomains,
                         std::move(allowlist));
 }

 // Returns true if |str2| can be obtained from |str1| by swapping exactly one
 // pair of adjacent characters. Returns false for strings of different length,
 // identical strings, and strings that differ in any other way (including more
 // than one swap).
 bool HasOneCharacterSwap(const std::u16string& str1,
                          const std::u16string& str2) {
   if (str1.size() != str2.size() || str1 == str2) {
     return false;
   }

   // Both strings have the same length, so one index walks them in lockstep.
   const size_t length = str1.size();
   bool has_swap = false;
   size_t pos = 0;
   while (pos < length) {
     if (str1[pos] == str2[pos]) {
       ++pos;
       continue;
     }
     // A mismatch is only acceptable as the first half of the single allowed
     // swap. That needs a following character, so a mismatch on the last
     // character can never qualify. The bounds check also keeps the lookahead
     // from reading past the end of the strings.
     const bool is_adjacent_swap = !has_swap && pos + 1 < length &&
                                   str1[pos] == str2[pos + 1] &&
                                   str2[pos] == str1[pos + 1];
     if (!is_adjacent_swap) {
       // Either there are multiple swaps, or strings have completely different
       // characters.
       return false;
     }
     has_swap = true;
     // Step over both characters of the swapped pair.
     pos += 2;
   }
   return has_swap;
 }

 // Test helper: overwrites the object returned by GetTopDomainParams() with a
 // copy of |params|.
 void SetTopBucketDomainsParamsForTesting(const TopBucketDomainsParams& params) {
   *GetTopDomainParams() = params;
 }

 // Test helper: resets the object returned by GetTopDomainParams() to the
 // top_bucket_domains edit distance skeleton list and its element count.
 void ResetTopBucketDomainsParamsForTesting() {
   TopBucketDomainsParams* params = GetTopDomainParams();
   *params = {top_bucket_domains::kTopBucketEditDistanceSkeletons,
              top_bucket_domains::kNumTopBucketEditDistanceSkeletons};
 }

 // Returns true if |heuristic| is enabled for |lookalike_etld_plus_one| on
 // |channel|. The heuristic must have a launch config in |config_proto|;
 // without a config proto or a matching launch config the result is false.
 bool IsHeuristicEnabledForHostname(
     const reputation::SafetyTipsConfig* config_proto,
     const reputation::HeuristicLaunchConfig::Heuristic heuristic,
     const std::string& lookalike_etld_plus_one,
     version_info::Channel channel) {
   DCHECK(!lookalike_etld_plus_one.empty());
   if (!config_proto) {
     return false;
   }
   // Derive a stable cohort for the domain from the first byte of its SHA1
   // hash, scaled down by 2.56 so it can be compared with launch percentages.
   base::SHA1Digest hash =
       base::SHA1Hash(base::as_byte_span(lookalike_etld_plus_one));
   float cohort = hash[0u] / 2.56;
   for (const reputation::HeuristicLaunchConfig& launch_config :
        config_proto->launch_config()) {
     if (heuristic != launch_config.heuristic()) {
       continue;
     }
     switch (channel) {
       // Local builds always have the heuristic enabled.
       case version_info::Channel::UNKNOWN:
         return true;

       // Canary/Dev and Beta use pre-defined launch percentages. Stable uses
       // the launch percentage from the config.
       case version_info::Channel::CANARY:
       case version_info::Channel::DEV:
         return kDefaultLaunchPercentageOnCanaryDev > cohort;

       case version_info::Channel::BETA:
         return kDefaultLaunchPercentageOnBeta > cohort;

       case version_info::Channel::STABLE:
         return launch_config.launch_percentage() > cohort;
     }
   }
   return false;
 }

 // Test helper: overwrites the object returned by GetComboSquattingParams()
 // with a copy of |params|.
 void SetComboSquattingParamsForTesting(const ComboSquattingParams& params) {
   *GetComboSquattingParams() = params;
 }

 // Test helper: resets the object returned by GetComboSquattingParams() to
 // kBrandNamesForCSQ and kSkeletonsOfPopularKeywordsForCSQ.
 void ResetComboSquattingParamsForTesting() {
   ComboSquattingParams* params = GetComboSquattingParams();
   *params = {kBrandNamesForCSQ, kSkeletonsOfPopularKeywordsForCSQ};
 }

 // Checks |navigated_domain| for Combo Squatting, first against the brand names
 // in the combo squatting params and then against brand names derived from
 // |engaged_sites|. Returns which of the two matched (or kNone) and, on a
 // match, stores the suggested domain in |matched_domain|.
 ComboSquattingType GetComboSquattingType(
     const DomainInfo& navigated_domain,
     const std::vector<DomainInfo>& engaged_sites,
     std::string* matched_domain) {
   const ComboSquattingParams* params = GetComboSquattingParams();

   // Pass 1: hard coded brand names. Each entry provides the brand name
   // followed by its skeleton.
   std::vector<std::pair<std::string, std::string>> hard_coded_brands;
   for (auto* entry : params->brand_names) {
     hard_coded_brands.emplace_back(std::string(entry[0]),
                                    std::string(UNSAFE_TODO(entry[1])));
   }
   if (IsComboSquatting(hard_coded_brands, *params, navigated_domain,
                        engaged_sites, matched_domain,
                        /*is_hard_coded=*/true)) {
     return ComboSquattingType::kHardCoded;
   }

   // Pass 2: brand names taken from engaged sites.
   const std::vector<std::pair<std::string, std::string>> engaged_brands =
       GetBrandNamesFromEngagedSites(engaged_sites);
   const bool matches_engaged_brand =
       IsComboSquatting(engaged_brands, *params, navigated_domain,
                        engaged_sites, matched_domain,
                        /*is_hard_coded=*/false);
   return matches_engaged_brand ? ComboSquattingType::kSiteEngagement
                                : ComboSquattingType::kNone;
 }

 // Returns true if |hostname| ends in ".gov" or ".mil".
 bool IsSafeTLD(const std::string& hostname) {
   // Deliberately simple: hostnames under ccTLDs (e.g. gov.in) are currently
   // not recognized.
   if (base::EndsWith(hostname, ".gov")) {
     return true;
   }
   return base::EndsWith(hostname, ".mil");
 }

 // Maps |match_type| to the action to take for it: record metrics only, show a
 // safety tip, or show an interstitial. |config|, |channel| and |etld_plus_one|
 // are only consulted for the Combo Squatting match types. Must not be called
 // with LookalikeUrlMatchType::kNone.
 LookalikeActionType GetActionForMatchType(
     const reputation::SafetyTipsConfig* config,
     version_info::Channel channel,
     const std::string& etld_plus_one,
     LookalikeUrlMatchType match_type) {
   // Combo Squatting shows a safety tip only when the given launch heuristic
   // is enabled for this hostname; otherwise it just records metrics.
   const auto combo_squatting_action =
       [&](reputation::HeuristicLaunchConfig::Heuristic heuristic) {
         return IsHeuristicEnabledForHostname(config, heuristic, etld_plus_one,
                                              channel)
                    ? LookalikeActionType::kShowSafetyTip
                    : LookalikeActionType::kRecordMetrics;
       };

   switch (match_type) {
     case LookalikeUrlMatchType::kEditDistance:
       // Edit distance is too noisy, just record metrics.
       return LookalikeActionType::kRecordMetrics;

     case LookalikeUrlMatchType::kEditDistanceSiteEngagement:
     case LookalikeUrlMatchType::kTargetEmbeddingForSafetyTips:
     case LookalikeUrlMatchType::kSkeletonMatchTop5k:
     case LookalikeUrlMatchType::kCharacterSwapSiteEngagement:
     case LookalikeUrlMatchType::kCharacterSwapTop500:
       return LookalikeActionType::kShowSafetyTip;

     case LookalikeUrlMatchType::kFailedSpoofChecks:
     case LookalikeUrlMatchType::kSkeletonMatchSiteEngagement:
     case LookalikeUrlMatchType::kSkeletonMatchTop500:
       return LookalikeActionType::kShowInterstitial;

     case LookalikeUrlMatchType::kTargetEmbedding:
 #if BUILDFLAG(IS_IOS)
       // TODO(crbug.com/40705070): Only enable target embedding on iOS once we
       // can check engaged sites. Otherwise, false positives are too high.
       return LookalikeActionType::kRecordMetrics;
 #else
       return LookalikeActionType::kShowInterstitial;
 #endif

     case LookalikeUrlMatchType::kComboSquatting:
       return combo_squatting_action(
           reputation::HeuristicLaunchConfig::
               HEURISTIC_COMBO_SQUATTING_TOP_DOMAINS);

     case LookalikeUrlMatchType::kComboSquattingSiteEngagement:
       return combo_squatting_action(
           reputation::HeuristicLaunchConfig::
               HEURISTIC_COMBO_SQUATTING_ENGAGED_SITES);

     case LookalikeUrlMatchType::kNone:
       NOTREACHED();
   }

   NOTREACHED();
 }

 // Builds the URL suggested to the user in place of |navigated_url|, given the
 // |matched_hostname| that the navigation was found to resemble. The result
 // keeps the origin parts of |navigated_url| with the host swapped for the
 // eTLD+1 of |matched_hostname| and the path emptied. For certain top domain
 // match types, an http URL with no port specified is upgraded to https.
 GURL GetSuggestedURL(LookalikeUrlMatchType match_type,
                      const GURL& navigated_url,
                      const std::string& matched_hostname) {
   // |matched_hostname| is either a top domain or an engaged domain. In both
   // cases only its eTLD+1 is suggested:
   // 1. Top domains: the top domain list holds eTLD+1s only, so nothing is
   //    lost.
   // 2. Engaged domains that are not eTLD+1s are not suggested as-is.
   //    Otherwise, navigating to googlé.com after engaging with
   //    docs.google.com would suggest docs.google.com.
   //
   // A consequence is that when neither the navigated nor the matched domain
   // is an eTLD+1 (e.g. docs.googlé.com vs. docs.google.com), google.com is
   // suggested rather than docs.google.com. That is not ideal, but it buys:
   // - Simpler code.
   // - Fewer suggestions pointing at non-existent domains. E.g. for a
   //   navigation to nonexistent.googlé.com matched against docs.google.com,
   //   the suggestion is google.com and not nonexistent.google.com.
   const std::string target_domain = GetETLDPlusOne(matched_hostname);
   DCHECK(!target_domain.empty());

   // Swap in the suggested host and strip the URL down to its origin parts.
   GURL::Replacements host_swap;
   host_swap.SetHostStr(target_domain);
   GURL result = navigated_url.ReplaceComponents(host_swap).GetWithEmptyPath();

   // Only top domain matches are upgraded to https.
   // TODO(crbug.com/40755923): If the match is against an engaged site, use the
   // scheme of the engaged site instead.
   const bool is_upgradable_match_type =
       match_type == LookalikeUrlMatchType::kEditDistance ||
       match_type == LookalikeUrlMatchType::kSkeletonMatchTop500 ||
       match_type == LookalikeUrlMatchType::kSkeletonMatchTop5k;
   const bool is_http_without_port =
       result.SchemeIs(url::kHttpScheme) &&
       result.IntPort() == url::PORT_UNSPECIFIED;
   if (!is_upgradable_match_type || !is_http_without_port) {
     return result;
   }

   GURL::Replacements scheme_swap;
   scheme_swap.SetSchemeStr(url::kHttpsScheme);
   return result.ReplaceComponents(scheme_swap);
 }

 }  // namespace lookalikes