blob: f89be4df763b8bf373f55b6d3cb85398de35a77d [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/lookalikes/lookalike_url_navigation_observer.h"
#include "base/bind.h"
#include "base/metrics/field_trial_params.h"
#include "base/metrics/histogram_macros.h"
#include "base/stl_util.h"
#include "base/strings/utf_string_conversions.h"
#include "chrome/browser/engagement/site_engagement_service.h"
#include "chrome/browser/lookalikes/lookalike_url_service.h"
#include "chrome/browser/profiles/profile.h"
#include "chrome/browser/ui/omnibox/alternate_nav_infobar_delegate.h"
#include "chrome/common/chrome_features.h"
#include "components/omnibox/browser/autocomplete_match.h"
#include "components/ukm/content/source_url_recorder.h"
#include "components/url_formatter/idn_spoof_checker.h"
#include "components/url_formatter/top_domains/top_domain_util.h"
#include "content/public/browser/navigation_handle.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "services/metrics/public/cpp/ukm_builders.h"
#include "services/metrics/public/cpp/ukm_recorder.h"
namespace {
#include "components/url_formatter/top_domains/top500-domains-inc.cc"
using MatchType = LookalikeUrlNavigationObserver::MatchType;
using NavigationSuggestionEvent =
LookalikeUrlNavigationObserver::NavigationSuggestionEvent;
void RecordEvent(
LookalikeUrlNavigationObserver::NavigationSuggestionEvent event) {
UMA_HISTOGRAM_ENUMERATION(LookalikeUrlNavigationObserver::kHistogramName,
event);
}
bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1,
const url_formatter::Skeletons& skeletons2) {
DCHECK(!skeletons1.empty());
DCHECK(!skeletons2.empty());
for (const std::string& skeleton1 : skeletons1) {
if (base::ContainsKey(skeletons2, skeleton1))
return true;
}
return false;
}
// Returns true if the domain given by |domain_info| is a top domain.
bool IsTopDomain(
const LookalikeUrlNavigationObserver::DomainInfo& domain_info) {
// Top domains are only accessible through their skeletons, so query the top
// domains trie for each skeleton of this domain.
for (const std::string& skeleton : domain_info.skeletons) {
const std::string top_domain =
url_formatter::LookupSkeletonInTopDomains(skeleton);
if (domain_info.domain_and_registry == top_domain) {
return true;
}
}
return false;
}
// Returns the eTLD+1 of |hostname|. This excludes private registries so that
// GetETLDPlusOne("test.blogspot.com") returns "blogspot.com" (blogspot.com is
// listed as a private registry). We do this to be consistent with
// url_formatter's top domain list which doesn't have a notion of private
// registries.
std::string GetETLDPlusOne(const GURL& url) {
return net::registry_controlled_domains::GetDomainAndRegistry(
url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
}
// Returns a site that the user has used before that the eTLD+1 in
// |domain_and_registry| may be attempting to spoof, based on skeleton
// comparison.
std::string GetMatchingSiteEngagementDomain(
const std::vector<GURL>& engaged_sites,
const LookalikeUrlNavigationObserver::DomainInfo& navigated_domain) {
DCHECK(!navigated_domain.domain_and_registry.empty());
std::map<std::string, url_formatter::Skeletons>
domain_and_registry_to_skeleton;
for (const GURL& engaged_site : engaged_sites) {
DCHECK(engaged_site.SchemeIsHTTPOrHTTPS());
// If the user has engaged with eTLD+1 of this site, don't show any
// lookalike navigation suggestions. This ignores the scheme. That's okay as
// it's the more conservative option: If the user is engaged with
// http://domain.test, not showing the warning on https://domain.test is
// acceptable.
const std::string engaged_domain_and_registry =
GetETLDPlusOne(engaged_site);
// eTLD+1 can be empty for private domains (e.g. http://test).
if (engaged_domain_and_registry.empty())
continue;
if (navigated_domain.domain_and_registry == engaged_domain_and_registry)
return std::string();
// Multiple domains can map to the same eTLD+1, avoid skeleton generation
// when possible.
auto it = domain_and_registry_to_skeleton.find(engaged_domain_and_registry);
url_formatter::Skeletons skeletons;
if (it == domain_and_registry_to_skeleton.end()) {
// Engaged site can be IDN. Decode as unicode and compute the skeleton
// from that. At this point, top domain checks have already been done, so
// if the site is IDN, it'll always be decoded as unicode (i.e. IDN spoof
// checker will not find a matching top domain and fall back to punycode
// for it).
url_formatter::IDNConversionResult conversion_result =
url_formatter::IDNToUnicodeWithDetails(engaged_domain_and_registry);
skeletons = url_formatter::GetSkeletons(conversion_result.result);
domain_and_registry_to_skeleton[engaged_domain_and_registry] = skeletons;
} else {
skeletons = it->second;
}
if (SkeletonsMatch(navigated_domain.skeletons, skeletons))
return engaged_site.host();
}
return std::string();
}
} // namespace
// static
const char LookalikeUrlNavigationObserver::kHistogramName[] =
"NavigationSuggestion.Event";
LookalikeUrlNavigationObserver::DomainInfo::DomainInfo(
const std::string& arg_domain_and_registry,
const url_formatter::IDNConversionResult& arg_idn_result,
const url_formatter::Skeletons& arg_skeletons)
: domain_and_registry(arg_domain_and_registry),
idn_result(arg_idn_result),
skeletons(arg_skeletons) {}
LookalikeUrlNavigationObserver::DomainInfo::~DomainInfo() = default;
LookalikeUrlNavigationObserver::DomainInfo::DomainInfo(const DomainInfo&) =
default;
LookalikeUrlNavigationObserver::LookalikeUrlNavigationObserver(
content::WebContents* web_contents)
: WebContentsObserver(web_contents),
profile_(Profile::FromBrowserContext(web_contents->GetBrowserContext())),
weak_factory_(this) {}
LookalikeUrlNavigationObserver::~LookalikeUrlNavigationObserver() {}
void LookalikeUrlNavigationObserver::DidFinishNavigation(
content::NavigationHandle* navigation_handle) {
// Ignore subframe and same document navigations.
if (!navigation_handle->IsInMainFrame() ||
navigation_handle->IsSameDocument())
return;
// If the navigation was not committed, it means either the page was a
// download or error 204/205, or the navigation never left the previous
// URL. Basically, this isn't a problem since we stayed at the existing URL.
// Also ignore any navigation errors.
if (!navigation_handle->HasCommitted() ||
navigation_handle->GetNetErrorCode() != net::OK)
return;
const GURL url = navigation_handle->GetURL();
if (!url.SchemeIsHTTPOrHTTPS())
return;
// If the user has engaged with this site, don't show any lookalike
// navigation suggestions.
if (SiteEngagementService::Get(profile_)->IsEngagementAtLeast(
url, blink::mojom::EngagementLevel::MEDIUM))
return;
const DomainInfo navigated_domain = GetDomainInfo(url);
if (navigated_domain.domain_and_registry.empty() ||
IsTopDomain(navigated_domain))
return;
LookalikeUrlService::Get(profile_)->GetEngagedSites(
base::BindOnce(&LookalikeUrlNavigationObserver::PerformChecks,
weak_factory_.GetWeakPtr(), url, navigated_domain));
}
LookalikeUrlNavigationObserver::DomainInfo
LookalikeUrlNavigationObserver::GetDomainInfo(const GURL& url) {
// Perform all computations on eTLD+1.
const std::string domain_and_registry = GetETLDPlusOne(url);
// eTLD+1 can be empty for private domains.
if (domain_and_registry.empty()) {
return DomainInfo(domain_and_registry, url_formatter::IDNConversionResult(),
url_formatter::Skeletons());
}
// Compute skeletons using eTLD+1.
url_formatter::IDNConversionResult idn_result =
url_formatter::IDNToUnicodeWithDetails(domain_and_registry);
url_formatter::Skeletons skeletons =
url_formatter::GetSkeletons(idn_result.result);
return DomainInfo(domain_and_registry, idn_result, skeletons);
}
void LookalikeUrlNavigationObserver::PerformChecks(
const GURL& url,
const DomainInfo& navigated_domain,
const std::vector<GURL>& engaged_sites) {
std::string matched_domain;
MatchType match_type;
if (!GetMatchingDomain(navigated_domain, engaged_sites, &matched_domain,
&match_type)) {
return;
}
DCHECK(!matched_domain.empty());
GURL::Replacements replace_host;
replace_host.SetHostStr(matched_domain);
const GURL suggested_url = url.ReplaceComponents(replace_host);
ukm::UkmRecorder* ukm_recorder = ukm::UkmRecorder::Get();
CHECK(ukm_recorder);
ukm::SourceId source_id =
ukm::GetSourceIdForWebContentsDocument(web_contents());
ukm::builders::LookalikeUrl_NavigationSuggestion(source_id)
.SetMatchType(static_cast<int>(match_type))
.Record(ukm_recorder);
if (base::FeatureList::IsEnabled(
features::kLookalikeUrlNavigationSuggestionsUI)) {
RecordEvent(NavigationSuggestionEvent::kInfobarShown);
AlternateNavInfoBarDelegate::CreateForLookalikeUrlNavigation(
web_contents(), base::UTF8ToUTF16(matched_domain), suggested_url, url,
base::BindOnce(RecordEvent, NavigationSuggestionEvent::kLinkClicked));
}
}
bool LookalikeUrlNavigationObserver::GetMatchingDomain(
const DomainInfo& navigated_domain,
const std::vector<GURL>& engaged_sites,
std::string* matched_domain,
MatchType* match_type) {
DCHECK(!navigated_domain.domain_and_registry.empty());
if (navigated_domain.idn_result.has_idn_component) {
// If the navigated domain is IDN, check its skeleton against top domains
// and engaged sites.
if (!navigated_domain.idn_result.matching_top_domain.empty()) {
// In practice, this is not possible since the top domain list does not
// contain IDNs, so domain_and_registry can't both have IDN and be a top
// domain. Still, sanity check in case the top domain list changes in the
// future.
// At this point, navigated domain should not be a top domain.
DCHECK_NE(navigated_domain.domain_and_registry,
navigated_domain.idn_result.matching_top_domain);
RecordEvent(NavigationSuggestionEvent::kMatchTopSite);
*matched_domain = navigated_domain.idn_result.matching_top_domain;
*match_type = MatchType::kTopSite;
return true;
}
const std::string matched_engaged_domain =
GetMatchingSiteEngagementDomain(engaged_sites, navigated_domain);
if (!matched_engaged_domain.empty()) {
RecordEvent(NavigationSuggestionEvent::kMatchSiteEngagement);
*matched_domain = matched_engaged_domain;
*match_type = MatchType::kSiteEngagement;
return true;
}
}
// If we can't find an exact top domain or an engaged site, try to find a top
// domain within an edit distance of one.
const std::string similar_domain =
GetSimilarDomainFromTop500(navigated_domain);
if (!similar_domain.empty() &&
navigated_domain.domain_and_registry != similar_domain) {
RecordEvent(NavigationSuggestionEvent::kMatchEditDistance);
*matched_domain = similar_domain;
*match_type = MatchType::kEditDistance;
return true;
}
return false;
}
// static
bool LookalikeUrlNavigationObserver::IsEditDistanceAtMostOne(
const base::string16& str1,
const base::string16& str2) {
if (str1.size() > str2.size() + 1 || str2.size() > str1.size() + 1) {
return false;
}
base::string16::const_iterator i = str1.begin();
base::string16::const_iterator j = str2.begin();
size_t edit_count = 0;
while (i != str1.end() && j != str2.end()) {
if (*i == *j) {
i++;
j++;
} else {
edit_count++;
if (edit_count > 1) {
return false;
}
if (str1.size() > str2.size()) {
// First string is longer than the second. This can only happen if the
// first string has an extra character.
i++;
} else if (str2.size() > str1.size()) {
// Second string is longer than the first. This can only happen if the
// second string has an extra character.
j++;
} else {
// Both strings are the same length. This can only happen if the two
// strings differ by a single character.
i++;
j++;
}
}
}
if (i != str1.end() || j != str2.end()) {
// A character at the end did not match.
edit_count++;
}
return edit_count <= 1;
}
// static
std::string LookalikeUrlNavigationObserver::GetSimilarDomainFromTop500(
const DomainInfo& navigated_domain) {
if (!url_formatter::top_domains::IsEditDistanceCandidate(
navigated_domain.domain_and_registry)) {
return std::string();
}
const std::string domain_without_registry =
url_formatter::top_domains::HostnameWithoutRegistry(
navigated_domain.domain_and_registry);
for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
for (const char* const top_domain_skeleton : kTop500) {
if (IsEditDistanceAtMostOne(base::UTF8ToUTF16(navigated_skeleton),
base::UTF8ToUTF16(top_domain_skeleton))) {
const std::string top_domain =
url_formatter::LookupSkeletonInTopDomains(top_domain_skeleton);
DCHECK(!top_domain.empty());
// If the only difference between the navigated and top
// domains is the registry part, this is unlikely to be a spoofing
// attempt. Ignore this match and continue. E.g. If the navigated domain
// is google.com.tw and the top domain is google.com.tr, this won't
// produce a match.
const std::string top_domain_without_registry =
url_formatter::top_domains::HostnameWithoutRegistry(top_domain);
if (domain_without_registry != top_domain_without_registry) {
return top_domain;
}
}
}
}
return std::string();
}
// static
void LookalikeUrlNavigationObserver::CreateForWebContents(
content::WebContents* web_contents) {
DCHECK(web_contents);
if (!FromWebContents(web_contents)) {
web_contents->SetUserData(
UserDataKey(),
std::make_unique<LookalikeUrlNavigationObserver>(web_contents));
}
}
WEB_CONTENTS_USER_DATA_KEY_IMPL(LookalikeUrlNavigationObserver)