blob: eb701f28f4e5099d62a006d5e922662da24a87e5 [file] [log] [blame]
// Copyright 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/lookalikes/lookalike_url_navigation_throttle.h"

#include <algorithm>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "base/bind.h"
#include "base/metrics/field_trial_params.h"
#include "base/metrics/histogram_macros.h"
#include "base/stl_util.h"
#include "base/strings/utf_string_conversions.h"
#include "chrome/browser/engagement/site_engagement_service.h"
#include "chrome/browser/lookalikes/lookalike_url_allowlist.h"
#include "chrome/browser/lookalikes/lookalike_url_controller_client.h"
#include "chrome/browser/lookalikes/lookalike_url_interstitial_page.h"
#include "chrome/browser/lookalikes/lookalike_url_service.h"
#include "chrome/browser/prerender/prerender_contents.h"
#include "chrome/browser/profiles/profile.h"
#include "chrome/common/chrome_features.h"
#include "components/security_interstitials/content/security_interstitial_tab_helper.h"
#include "components/ukm/content/source_url_recorder.h"
#include "components/url_formatter/top_domains/top500_domains.h"
#include "components/url_formatter/top_domains/top_domain_util.h"
#include "content/public/browser/navigation_handle.h"
namespace {
// Boolean field trial parameter "topsites" of the
// kLookalikeUrlNavigationSuggestionsUI feature (default: false). It gates
// whether a MatchType::kTopSite match is allowed to show an interstitial.
const base::FeatureParam<bool> kEnableInterstitialForTopSites{
    &features::kLookalikeUrlNavigationSuggestionsUI, "topsites", false};

// Shorthand aliases for the types used throughout this file.
using lookalikes::LookalikeUrlNavigationThrottle;
using MatchType = LookalikeUrlInterstitialPage::MatchType;
using UserAction = LookalikeUrlInterstitialPage::UserAction;
using NavigationSuggestionEvent =
    lookalikes::LookalikeUrlNavigationThrottle::NavigationSuggestionEvent;
using DomainInfo = lookalikes::DomainInfo;
using ThrottleCheckResult = content::NavigationThrottle::ThrottleCheckResult;
// Logs |event| as a sample of the enumeration histogram whose name is
// LookalikeUrlNavigationThrottle::kHistogramName.
void RecordEvent(
LookalikeUrlNavigationThrottle::NavigationSuggestionEvent event) {
UMA_HISTOGRAM_ENUMERATION(LookalikeUrlNavigationThrottle::kHistogramName,
event);
}
// Returns true when |skeletons1| and |skeletons2| have at least one skeleton
// in common. Both sets are expected to be non-empty.
bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1,
                    const url_formatter::Skeletons& skeletons2) {
  DCHECK(!skeletons1.empty());
  DCHECK(!skeletons2.empty());

  // Walk the first set and stop at the first skeleton also present in the
  // second set.
  bool shares_skeleton = false;
  for (auto it = skeletons1.begin();
       it != skeletons1.end() && !shares_skeleton; ++it) {
    shares_skeleton = base::ContainsKey(skeletons2, *it);
  }
  return shares_skeleton;
}
// Reports whether the domain described by |domain_info| is itself a top
// domain.
bool IsTopDomain(const DomainInfo& domain_info) {
  // The top domain list can only be queried by skeleton, so look up every
  // skeleton of this domain and compare the result with the domain itself.
  const auto& own_skeletons = domain_info.skeletons;
  for (auto it = own_skeletons.begin(); it != own_skeletons.end(); ++it) {
    const std::string listed_domain =
        url_formatter::LookupSkeletonInTopDomains(*it);
    if (listed_domain != domain_info.domain_and_registry) {
      continue;
    }
    return true;
  }
  return false;
}
// Looks for an engaged site that |navigated_domain| may be attempting to
// spoof, i.e. one sharing at least one skeleton with it. Returns the
// domain_and_registry of the first such entry in |engaged_sites|, or an empty
// string when none matches. |navigated_domain| itself is expected not to be
// in |engaged_sites|.
std::string GetMatchingSiteEngagementDomain(
    const std::vector<DomainInfo>& engaged_sites,
    const DomainInfo& navigated_domain) {
  DCHECK(!navigated_domain.domain_and_registry.empty());

  std::string spoofed_site;
  for (const DomainInfo& candidate : engaged_sites) {
    DCHECK(!candidate.domain_and_registry.empty());
    DCHECK_NE(navigated_domain.domain_and_registry,
              candidate.domain_and_registry);
    if (!SkeletonsMatch(navigated_domain.skeletons, candidate.skeletons)) {
      continue;
    }
    spoofed_site = candidate.domain_and_registry;
    break;
  }
  return spoofed_site;
}
// Returns the first matching top domain with an edit distance of at most one
// to |domain_and_registry|.
std::string GetSimilarDomainFromTop500(const DomainInfo& navigated_domain) {
for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
for (const char* const top_domain_skeleton : top500_domains::kTop500) {
if (lookalikes::IsEditDistanceAtMostOne(
base::UTF8ToUTF16(navigated_skeleton),
base::UTF8ToUTF16(top_domain_skeleton))) {
const std::string top_domain =
url_formatter::LookupSkeletonInTopDomains(top_domain_skeleton);
DCHECK(!top_domain.empty());
// If the only difference between the navigated and top
// domains is the registry part, this is unlikely to be a spoofing
// attempt. Ignore this match and continue. E.g. If the navigated domain
// is google.com.tw and the top domain is google.com.tr, this won't
// produce a match.
const std::string top_domain_without_registry =
url_formatter::top_domains::HostnameWithoutRegistry(top_domain);
DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
top_domain_without_registry));
if (navigated_domain.domain_without_registry !=
top_domain_without_registry) {
return top_domain;
}
}
}
}
return std::string();
}
// Returns the first matching engaged domain with an edit distance of at most
// one to |domain_and_registry|.
std::string GetSimilarDomainFromEngagedSites(
const DomainInfo& navigated_domain,
const std::vector<DomainInfo>& engaged_sites) {
for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
for (const DomainInfo& engaged_site : engaged_sites) {
if (!url_formatter::top_domains::IsEditDistanceCandidate(
engaged_site.domain_and_registry)) {
continue;
}
for (const std::string& engaged_skeleton : engaged_site.skeletons) {
if (lookalikes::IsEditDistanceAtMostOne(
base::UTF8ToUTF16(navigated_skeleton),
base::UTF8ToUTF16(engaged_skeleton))) {
// If the only difference between the navigated and engaged
// domain is the registry part, this is unlikely to be a spoofing
// attempt. Ignore this match and continue. E.g. If the navigated
// domain is google.com.tw and the top domain is google.com.tr, this
// won't produce a match.
if (navigated_domain.domain_without_registry !=
engaged_site.domain_without_registry) {
return engaged_site.domain_and_registry;
}
}
}
}
}
return std::string();
}
} // namespace
namespace lookalikes {
// static
// Name of the enumeration histogram that RecordEvent() in this file logs
// NavigationSuggestionEvent samples to.
const char LookalikeUrlNavigationThrottle::kHistogramName[] =
"NavigationSuggestion.Event";
// Returns true if |str1| can be turned into |str2| with at most one edit: a
// single character insertion, deletion or substitution. Identical strings
// also return true.
bool IsEditDistanceAtMostOne(const base::string16& str1,
                             const base::string16& str2) {
  const size_t len1 = str1.size();
  const size_t len2 = str2.size();
  // Lengths differing by two or more need at least two edits.
  if (len1 > len2 + 1 || len2 > len1 + 1) {
    return false;
  }

  size_t pos1 = 0;
  size_t pos2 = 0;
  bool edit_used = false;
  while (pos1 != len1 && pos2 != len2) {
    if (str1[pos1] == str2[pos2]) {
      ++pos1;
      ++pos2;
      continue;
    }
    // A second mismatch means more than one edit is required.
    if (edit_used) {
      return false;
    }
    edit_used = true;
    if (len1 > len2) {
      // |str1| is the longer string: treat its current character as the
      // extra one and skip it.
      ++pos1;
    } else if (len2 > len1) {
      // |str2| is the longer string: skip its extra character instead.
      ++pos2;
    } else {
      // Equal lengths: the mismatch is a substitution, step over it in both.
      ++pos1;
      ++pos2;
    }
  }

  // A character left over at the end of either string costs one more edit,
  // which is only affordable if no edit has been spent so far.
  const bool has_leftover = pos1 != len1 || pos2 != len2;
  return !(edit_used && has_leftover);
}
// Builds a throttle for |navigation_handle|. Caches whether the
// kLookalikeUrlNavigationSuggestionsUI feature is enabled and the Profile
// obtained from the browser context of the handle's WebContents.
LookalikeUrlNavigationThrottle::LookalikeUrlNavigationThrottle(
content::NavigationHandle* navigation_handle)
: content::NavigationThrottle(navigation_handle),
interstitials_enabled_(base::FeatureList::IsEnabled(
features::kLookalikeUrlNavigationSuggestionsUI)),
profile_(Profile::FromBrowserContext(
navigation_handle->GetWebContents()->GetBrowserContext())),
weak_factory_(this) {}
// Nothing to release explicitly; members clean up after themselves.
LookalikeUrlNavigationThrottle::~LookalikeUrlNavigationThrottle() = default;
// Shared implementation of WillStartRequest() and WillRedirectRequest().
// Decides whether the navigation to |url| may proceed, must be deferred until
// the engaged sites list has been updated, or should be checked right away.
ThrottleCheckResult LookalikeUrlNavigationThrottle::HandleThrottleRequest(
    const GURL& url) {
  content::NavigationHandle* const nav = navigation_handle();

  // Only main frame, cross-document navigations to http(s) URLs are checked.
  const bool is_checkable = nav->IsInMainFrame() && !nav->IsSameDocument() &&
                            url.SchemeIsHTTPOrHTTPS();
  if (!is_checkable) {
    return content::NavigationThrottle::PROCEED;
  }

  // Hosts on the allowlist never trigger a warning.
  LookalikeUrlAllowlist* const allowed_hosts =
      LookalikeUrlAllowlist::GetOrCreateAllowlist(nav->GetWebContents());
  if (allowed_hosts->IsDomainInList(url.host())) {
    return content::NavigationThrottle::PROCEED;
  }

  // Nothing to compare when the domain can't be determined, and top domains
  // themselves are never flagged.
  const DomainInfo navigated_domain = GetDomainInfo(url);
  if (navigated_domain.domain_and_registry.empty() ||
      IsTopDomain(navigated_domain)) {
    return content::NavigationThrottle::PROCEED;
  }

  LookalikeUrlService* const engaged_sites_service =
      LookalikeUrlService::Get(profile_);
  const bool update_pending = engaged_sites_service->UpdateEngagedSites(
      base::BindOnce(&LookalikeUrlNavigationThrottle::PerformChecksDeferred,
                     weak_factory_.GetWeakPtr(), url, navigated_domain));
  if (!update_pending) {
    // No update is pending, so run the checks synchronously against the
    // latest known engaged sites.
    return PerformChecks(url, navigated_domain,
                         engaged_sites_service->GetLatestEngagedSites());
  }

  // The checks will run later from PerformChecksDeferred(). If we're not going
  // to show an interstitial, there's no reason to delay the navigation any
  // further.
  return interstitials_enabled_ ? content::NavigationThrottle::DEFER
                                : content::NavigationThrottle::PROCEED;
}
// Runs the lookalike checks against the URL currently reported by the
// navigation handle and returns their verdict.
ThrottleCheckResult LookalikeUrlNavigationThrottle::WillStartRequest() {
return HandleThrottleRequest(navigation_handle()->GetURL());
}
// Handled exactly like WillStartRequest(): the checks run against the URL
// currently reported by the navigation handle.
ThrottleCheckResult LookalikeUrlNavigationThrottle::WillRedirectRequest() {
return HandleThrottleRequest(navigation_handle()->GetURL());
}
// Returns a fixed string naming this throttle (presumably consumed by
// navigation logging, going by the method name).
const char* LookalikeUrlNavigationThrottle::GetNameForLogging() {
return "LookalikeUrlNavigationThrottle";
}
// Builds the lookalike interstitial for the current navigation and returns a
// result that cancels the navigation with the interstitial's HTML as the
// error page contents.
//
// |safe_url| is the URL suggested in place of the navigated one, |url| is the
// navigated URL, and |source_id| and |match_type| are forwarded to the
// interstitial page.
//
// Returns a CANCEL result carrying net::ERR_BLOCKED_BY_CLIENT.
ThrottleCheckResult LookalikeUrlNavigationThrottle::ShowInterstitial(
    const GURL& safe_url,
    const GURL& url,
    ukm::SourceId source_id,
    MatchType match_type) {
  content::NavigationHandle* handle = navigation_handle();
  content::WebContents* web_contents = handle->GetWebContents();

  auto controller = std::make_unique<LookalikeUrlControllerClient>(
      web_contents, url, safe_url);
  auto blocking_page = std::make_unique<LookalikeUrlInterstitialPage>(
      web_contents, safe_url, source_id, match_type, std::move(controller));

  // Grab the HTML before ownership of the page is handed to the tab helper.
  base::Optional<std::string> error_page_contents =
      blocking_page->GetHTMLContents();

  security_interstitials::SecurityInterstitialTabHelper::AssociateBlockingPage(
      web_contents, handle->GetNavigationId(), std::move(blocking_page));

  return ThrottleCheckResult(content::NavigationThrottle::CANCEL,
                             net::ERR_BLOCKED_BY_CLIENT, error_page_contents);
}
// Factory for the throttle. Returns nullptr when the WebContents of
// |navigation_handle| is a prerender; otherwise a new throttle is returned.
std::unique_ptr<LookalikeUrlNavigationThrottle>
LookalikeUrlNavigationThrottle::MaybeCreateNavigationThrottle(
    content::NavigationHandle* navigation_handle) {
  const bool is_prerendering =
      prerender::PrerenderContents::FromWebContents(
          navigation_handle->GetWebContents()) != nullptr;
  if (!is_prerendering) {
    // Always insert the throttle for regular tabs so that metrics get
    // recorded.
    return std::make_unique<LookalikeUrlNavigationThrottle>(navigation_handle);
  }
  // Bail out for prerendered tabs before they break metrics.
  return nullptr;
}
void LookalikeUrlNavigationThrottle::PerformChecksDeferred(
const GURL& url,
const DomainInfo& navigated_domain,
const std::vector<DomainInfo>& engaged_sites) {
ThrottleCheckResult result =
PerformChecks(url, navigated_domain, engaged_sites);
if (!interstitials_enabled_) {
return;
}
if (result.action() == content::NavigationThrottle::PROCEED) {
Resume();
return;
}
CancelDeferredNavigation(result);
}
// Checks |navigated_domain| (the domain of |url|) against |engaged_sites| and
// the top domain list.
//
// Returns PROCEED when the domain is already engaged, when no lookalike match
// is found, or when a match is found but no interstitial should be displayed
// for its type (in which case a kInterstitialNotShown UKM event is recorded).
// Otherwise returns the result of ShowInterstitial().
ThrottleCheckResult LookalikeUrlNavigationThrottle::PerformChecks(
    const GURL& url,
    const DomainInfo& navigated_domain,
    const std::vector<DomainInfo>& engaged_sites) {
  // Ensure that this URL is not already engaged. We can't use the synchronous
  // SiteEngagementService::IsEngagementAtLeast as it has side effects. We check
  // in PerformChecks to ensure we have up-to-date engaged_sites.
  // This check ignores the scheme which is okay since it's more conservative:
  // If the user is engaged with http://domain.test, not showing the warning on
  // https://domain.test is acceptable.
  // The predicate only runs within this statement, so |navigated_domain| is
  // captured by reference to avoid copying the whole DomainInfo.
  const bool already_engaged = std::any_of(
      engaged_sites.begin(), engaged_sites.end(),
      [&navigated_domain](const DomainInfo& engaged_domain) {
        return navigated_domain.domain_and_registry ==
               engaged_domain.domain_and_registry;
      });
  if (already_engaged) {
    return content::NavigationThrottle::PROCEED;
  }

  // Both are filled in by GetMatchingDomain() when it returns true.
  std::string matched_domain;
  MatchType match_type;
  if (!GetMatchingDomain(navigated_domain, engaged_sites, &matched_domain,
                         &match_type)) {
    return content::NavigationThrottle::PROCEED;
  }
  DCHECK(!matched_domain.empty());

  ukm::SourceId source_id = ukm::ConvertToSourceId(
      navigation_handle()->GetNavigationId(), ukm::SourceIdType::NAVIGATION_ID);

  if (ShouldDisplayInterstitial(match_type)) {
    // matched_domain can be a top domain or an engaged domain. Simply use its
    // eTLD+1 as the suggested domain.
    // 1. If matched_domain is a top domain: Top domain list already contains
    // eTLD+1s only so this works well.
    // 2. If matched_domain is an engaged domain and is not an eTLD+1, don't
    // suggest it. Otherwise, navigating to googlé.com and having engaged with
    // docs.google.com would suggest docs.google.com.
    //
    // When the navigated and matched domains are not eTLD+1s (e.g.
    // docs.googlé.com and docs.google.com), this will suggest google.com
    // instead of docs.google.com. This is less than ideal, but has two
    // benefits:
    // - Simpler code
    // - Fewer suggestions to non-existent domains. E.g. When the navigated
    // domain is nonexistent.googlé.com and the matched domain is
    // docs.google.com, we will suggest google.com instead of
    // nonexistent.google.com.
    const std::string suggested_domain = GetETLDPlusOne(matched_domain);
    DCHECK(!suggested_domain.empty());
    // Drop everything but the parts of the origin.
    GURL::Replacements replace_host;
    replace_host.SetHostStr(suggested_domain);
    const GURL suggested_url =
        url.ReplaceComponents(replace_host).GetWithEmptyPath();
    return ShowInterstitial(suggested_url, url, source_id, match_type);
  }

  // Interstitial normally records UKM, but still record when it's not shown.
  LookalikeUrlInterstitialPage::RecordUkmEvent(
      source_id, match_type, UserAction::kInterstitialNotShown);
  return content::NavigationThrottle::PROCEED;
}
// Decides whether a match of type |match_type| should show an interstitial.
// Never true when interstitials are disabled. Site engagement matches always
// qualify; top site matches qualify only when the "topsites" feature param is
// set; every other match type does not.
bool LookalikeUrlNavigationThrottle::ShouldDisplayInterstitial(
    MatchType match_type) const {
  if (!interstitials_enabled_) {
    return false;
  }
  switch (match_type) {
    case MatchType::kSiteEngagement:
      return true;
    case MatchType::kTopSite:
      return kEnableInterstitialForTopSites.Get();
    default:
      return false;
  }
}
// Searches for a domain that |navigated_domain| looks like. The checks run in
// this order, stopping at the first hit:
//   1. (IDN only) skeleton match against |engaged_sites|;
//   2. (IDN only) the top domain already matched in the IDN result;
//   3. an engaged site within an edit distance of one;
//   4. a top 500 domain within an edit distance of one.
// On a hit, the matching event is recorded, |matched_domain| and |match_type|
// are filled in and true is returned. Otherwise false is returned and the
// out-params are left untouched.
bool LookalikeUrlNavigationThrottle::GetMatchingDomain(
    const DomainInfo& navigated_domain,
    const std::vector<DomainInfo>& engaged_sites,
    std::string* matched_domain,
    MatchType* match_type) {
  DCHECK(!navigated_domain.domain_and_registry.empty());
  DCHECK(matched_domain);
  DCHECK(match_type);

  // Records |event|, stores the match in the out-params and yields true so
  // that each hit below can simply return its result.
  const auto report_match = [matched_domain, match_type](
                                NavigationSuggestionEvent event,
                                const std::string& domain, MatchType type) {
    RecordEvent(event);
    *matched_domain = domain;
    *match_type = type;
    return true;
  };

  const auto& idn_result = navigated_domain.idn_result;
  if (idn_result.has_idn_component) {
    // IDN domains are compared by skeleton, first against engaged sites and
    // then against top domains.
    const std::string engaged_match =
        GetMatchingSiteEngagementDomain(engaged_sites, navigated_domain);
    if (!engaged_match.empty()) {
      return report_match(NavigationSuggestionEvent::kMatchSiteEngagement,
                          engaged_match, MatchType::kSiteEngagement);
    }

    if (!idn_result.matching_top_domain.empty()) {
      // In practice, this is not possible since the top domain list does not
      // contain IDNs, so domain_and_registry can't both have IDN and be a top
      // domain. Still, sanity check in case the top domain list changes in
      // the future.
      // At this point, navigated domain should not be a top domain.
      DCHECK_NE(navigated_domain.domain_and_registry,
                idn_result.matching_top_domain);
      return report_match(NavigationSuggestionEvent::kMatchTopSite,
                          idn_result.matching_top_domain, MatchType::kTopSite);
    }
  }

  // The remaining checks are edit distance based and only apply to candidate
  // domains.
  if (!url_formatter::top_domains::IsEditDistanceCandidate(
          navigated_domain.domain_and_registry)) {
    return false;
  }

  // No exact skeleton match was found, so look for an engaged domain within
  // an edit distance of one.
  const std::string nearby_engaged_domain =
      GetSimilarDomainFromEngagedSites(navigated_domain, engaged_sites);
  if (!nearby_engaged_domain.empty() &&
      navigated_domain.domain_and_registry != nearby_engaged_domain) {
    return report_match(
        NavigationSuggestionEvent::kMatchEditDistanceSiteEngagement,
        nearby_engaged_domain, MatchType::kEditDistanceSiteEngagement);
  }

  // Last resort: a top domain within an edit distance of one.
  const std::string nearby_top_domain =
      GetSimilarDomainFromTop500(navigated_domain);
  if (!nearby_top_domain.empty() &&
      navigated_domain.domain_and_registry != nearby_top_domain) {
    return report_match(NavigationSuggestionEvent::kMatchEditDistance,
                        nearby_top_domain, MatchType::kEditDistance);
  }
  return false;
}
} // namespace lookalikes