// blob: 7663551e203ac55b0cff8b0b35394373b5973c2a [file] [log] [blame]
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/safe_browsing/browser_feature_extractor.h"
#include <map>
#include <utility>
#include "base/bind.h"
#include "base/bind_helpers.h"
#include "base/format_macros.h"
#include "base/stl_util.h"
#include "base/stringprintf.h"
#include "base/task.h"
#include "base/time.h"
#include "chrome/common/safe_browsing/csd.pb.h"
#include "chrome/browser/history/history.h"
#include "chrome/browser/history/history_types.h"
#include "chrome/browser/profiles/profile.h"
#include "chrome/browser/safe_browsing/browser_features.h"
#include "chrome/browser/safe_browsing/client_side_detection_service.h"
#include "content/browser/browser_thread.h"
#include "content/browser/cancelable_request.h"
#include "content/browser/tab_contents/tab_contents.h"
#include "content/public/common/page_transition_types.h"
#include "googleurl/src/gurl.h"
namespace safe_browsing {
// Out-of-line constructor and destructor for BrowseInfo.  Both bodies are
// empty: they do nothing beyond the implicit construction and destruction of
// the struct's members.
BrowseInfo::BrowseInfo() {}
BrowseInfo::~BrowseInfo() {}
static void AddFeature(const std::string& feature_name,
double feature_value,
ClientPhishingRequest* request) {
DCHECK(request);
ClientPhishingRequest::Feature* feature =
request->add_non_model_feature_map();
feature->set_name(feature_name);
feature->set_value(feature_value);
VLOG(2) << "Browser feature: " << feature->name() << " " << feature->value();
}
static void AddNavigationFeatures(const std::string& feature_prefix,
const NavigationController& controller,
int index,
const std::vector<GURL>& redirect_chain,
ClientPhishingRequest* request) {
NavigationEntry* entry = controller.GetEntryAtIndex(index);
bool is_secure_referrer = entry->referrer().SchemeIsSecure();
if (!is_secure_referrer) {
AddFeature(StringPrintf("%s%s=%s",
feature_prefix.c_str(),
features::kReferrer,
entry->referrer().spec().c_str()),
1.0,
request);
}
AddFeature(feature_prefix + features::kHasSSLReferrer,
is_secure_referrer ? 1.0 : 0.0,
request);
AddFeature(feature_prefix + features::kPageTransitionType,
static_cast<double>(
content::PageTransitionStripQualifier(
entry->transition_type())),
request);
AddFeature(feature_prefix + features::kIsFirstNavigation,
index == 0 ? 1.0 : 0.0,
request);
// Redirect chain should always be at least of size one, as the rendered
// url is the last element in the chain.
if (redirect_chain.empty()) {
NOTREACHED();
return;
}
if (redirect_chain.back() != entry->url()) {
// I originally had this as a DCHECK but I saw a failure once that I
// can't reproduce. It looks like it might be related to the
// navigation controller only keeping a limited number of navigation
// events. For now we'll just attach a feature specifying that this is
// a mismatch and try and figure out what to do with it on the server.
DLOG(WARNING) << "Expected:" << entry->url()
<< " Actual:" << redirect_chain.back();
AddFeature(feature_prefix + features::kRedirectUrlMismatch,
1.0,
request);
return;
}
// We skip the last element since it should just be the current url.
for (size_t i = 0; i < redirect_chain.size() - 1; i++) {
std::string printable_redirect = redirect_chain[i].spec();
if (redirect_chain[i].SchemeIsSecure()) {
printable_redirect = features::kSecureRedirectValue;
}
AddFeature(StringPrintf("%s%s[%"PRIuS"]=%s",
feature_prefix.c_str(),
features::kRedirect,
i,
printable_redirect.c_str()),
1.0,
request);
}
}
// Stores |tab| and |service| for later use and binds |method_factory_| to this
// object (ExtractFeatures uses that factory to post StartExtractFeatures).
//
// |tab|      must not be NULL (DCHECKed below).
// |service|  is NULL-checked before use in ExtractBrowseInfoFeatures, so a NULL
//            value only disables the bad-IP features there.
BrowserFeatureExtractor::BrowserFeatureExtractor(
TabContents* tab,
ClientSideDetectionService* service)
: tab_(tab),
service_(service),
// The macro wraps the use of |this| inside the initializer list.
ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
DCHECK(tab);
}
// Tears down all in-flight work: revokes the tasks created through
// |method_factory_|, frees the request/callback pairs that were still waiting
// for StartExtractFeatures, and cancels and frees every outstanding history
// query.
BrowserFeatureExtractor::~BrowserFeatureExtractor() {
  method_factory_.RevokeAll();

  // Each pending extraction holds a request and a callback; delete both
  // halves of every pair.
  STLDeleteContainerPairPointers(pending_extractions_.begin(),
                                 pending_extractions_.end());

  // Outstanding history lookups can only exist if a history service was
  // available to issue them.
  HistoryService* history;
  bool success = GetHistoryService(&history);
  DCHECK(success || pending_queries_.size() == 0);

  // Cancel each lookup (when the service is still reachable) and release the
  // request and callback that were parked alongside its handle.
  PendingQueriesMap::iterator query = pending_queries_.begin();
  while (query != pending_queries_.end()) {
    if (history)
      history->CancelRequest(query->first);
    delete query->second.first;   // request
    delete query->second.second;  // callback
    ++query;
  }
  pending_queries_.clear();
}
// Entry point for feature extraction; must be called on the UI thread.
//
// Synchronously adds the navigation and browse-info features to |request|,
// then records the (request, callback) pair as a pending extraction and posts
// StartExtractFeatures to the current message loop to continue the work.
//
// |info|      browse information for the page; must not be NULL.
// |request|   request to populate; must not be NULL and its URL is expected
//             to start with "http:".
// |callback|  must not be NULL.  If it is NULL anyway, an error is logged and
//             nothing else happens.
void BrowserFeatureExtractor::ExtractFeatures(const BrowseInfo* info,
                                              ClientPhishingRequest* request,
                                              DoneCallback* callback) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  DCHECK(request);
  DCHECK(info);
  DCHECK_EQ(0U, request->url().find("http:"));
  DCHECK(callback);
  if (!callback) {
    DLOG(ERROR) << "ExtractFeatures called without a callback object";
    return;
  }

  // Walk the navigation history backwards from the current entry looking for
  //   (a) the most recent entry whose URL equals the request URL, and
  //   (b) the earliest entry before (a) in the unbroken run of entries that
  //       share the request URL's host.
  const NavigationController& controller = tab_->controller();
  GURL request_url(request->url());
  int url_index = -1;
  int first_host_index = -1;
  int index = controller.GetCurrentEntryIndex();
  // The URL that features are extracted for should already be committed.
  DCHECK_NE(index, -1);
  while (index >= 0) {
    NavigationEntry* entry = controller.GetEntryAtIndex(index);
    if (url_index == -1 && entry->url() == request_url) {
      // The possibly phishy URL may have been visited in this tab before, so
      // the latest matching navigation is the one used for features.  It may
      // well be that this is always the newest entry, but because of possible
      // races during navigation and transient entries (i.e. interstitials)
      // the search stays cautious.
      url_index = index;
    } else if (index < url_index) {
      // Already past the candidate URL: keep going only while still on the
      // same host.
      if (entry->url().host() != request_url.host())
        break;
      first_host_index = index;
    }
    --index;
  }

  // Describe how the user arrived at
  //   1) the candidate URL, and
  //   2) the first URL on the candidate's host (when one was found before the
  //      candidate entry).
  if (url_index != -1) {
    AddNavigationFeatures("",
                          controller,
                          url_index,
                          info->url_redirects,
                          request);
  }
  if (first_host_index != -1) {
    AddNavigationFeatures(features::kHostPrefix,
                          controller,
                          first_host_index,
                          info->host_redirects,
                          request);
  }

  ExtractBrowseInfoFeatures(*info, request);

  // Remember the pair so that the destructor can free it if the posted task
  // never runs, then continue in StartExtractFeatures.
  pending_extractions_.insert(std::make_pair(request, callback));
  MessageLoop::current()->PostTask(
      FROM_HERE,
      method_factory_.NewRunnableMethod(
          &BrowserFeatureExtractor::StartExtractFeatures,
          request, callback));
}
// Adds the features derived from |info| to |request|:
//   - one feature per IP in |info.ips| that |service_| reports as bad
//     (skipped entirely when |service_| is NULL), and
//   - four SafeBrowsing features when |info.unsafe_resource| is set.
void BrowserFeatureExtractor::ExtractBrowseInfoFeatures(
    const BrowseInfo& info,
    ClientPhishingRequest* request) {
  if (service_) {
    std::set<std::string>::const_iterator ip = info.ips.begin();
    while (ip != info.ips.end()) {
      if (service_->IsBadIpAddress(*ip))
        AddFeature(features::kBadIpFetch + *ip, 1.0, request);
      ++ip;
    }
  }

  // Nothing more to add unless an unsafe resource was recorded.
  if (!info.unsafe_resource.get())
    return;

  // A SafeBrowsing interstitial was shown for the current URL.
  const std::string malicious_url_feature =
      features::kSafeBrowsingMaliciousUrl + info.unsafe_resource->url.spec();
  AddFeature(malicious_url_feature, 1.0, request);

  const std::string original_url_feature =
      features::kSafeBrowsingOriginalUrl +
      info.unsafe_resource->original_url.spec();
  AddFeature(original_url_feature, 1.0, request);

  double is_subresource = 0.0;
  if (info.unsafe_resource->is_subresource)
    is_subresource = 1.0;
  AddFeature(features::kSafeBrowsingIsSubresource, is_subresource, request);

  const double threat_type =
      static_cast<double>(info.unsafe_resource->threat_type);
  AddFeature(features::kSafeBrowsingThreatType, threat_type, request);
}
// Task posted by ExtractFeatures; runs on the UI thread.
//
// Removes the (request, callback) pair from the pending extractions and
// starts the first history lookup (a URL query that also asks for visits).
// If the request is NULL or not initialized, or no history service is
// available, the callback is run with |false| and deleted instead.
void BrowserFeatureExtractor::StartExtractFeatures(
    ClientPhishingRequest* request,
    DoneCallback* callback) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  size_t removed =
      pending_extractions_.erase(ExtractionData(request, callback));
  DCHECK_EQ(1U, removed);

  // |history| is only read when every check passed, i.e. after
  // GetHistoryService has assigned it.
  HistoryService* history;
  const bool can_query =
      request && request->IsInitialized() && GetHistoryService(&history);
  if (!can_query) {
    callback->Run(false, request);
    delete callback;
    return;
  }

  const GURL url(request->url());
  CancelableRequestProvider::Handle handle = history->QueryURL(
      url,
      true /* wants_visits */,
      &request_consumer_,
      base::Bind(&BrowserFeatureExtractor::QueryUrlHistoryDone,
                 base::Unretained(this)));
  // Park the pair under the query handle until QueryUrlHistoryDone claims it.
  StorePendingQuery(handle, request, callback);
}
// Completion handler for the URL history query issued by
// StartExtractFeatures; runs on the UI thread.
//
// On success it adds the URL visit-count features (total, older than 24
// hours, typed, link) and issues the host visit-count lookup for the request
// URL.  On failure, or if no history service is available for the next
// lookup, the callback is run with |false| and deleted.
//
// |handle|   identifies the pending query whose request/callback to resume.
// |success|  whether the URL was found in history.
// |row|      URL row; read only when |success| is true.
// |visits|   visits for the URL; read only when |success| is true.
void BrowserFeatureExtractor::QueryUrlHistoryDone(
    CancelableRequestProvider::Handle handle,
    bool success,
    const history::URLRow* row,
    history::VisitVector* visits) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  ClientPhishingRequest* request;
  DoneCallback* callback;
  if (!GetPendingQuery(handle, &request, &callback)) {
    DLOG(FATAL) << "No pending history query found";
    return;
  }
  DCHECK(request);
  DCHECK(callback);
  if (!success) {
    // The URL was not found in history.  In practice this should not happen
    // (barring a real error) because that URL was just visited.
    callback->Run(false, request);
    delete callback;
    return;
  }

  AddFeature(features::kUrlHistoryVisitCount,
             static_cast<double>(row->visit_count()),
             request);

  // Tally main-frame visits only.
  base::Time threshold = base::Time::Now() - base::TimeDelta::FromDays(1);
  int num_visits_24h_ago = 0;
  int num_visits_typed = 0;
  int num_visits_link = 0;
  for (history::VisitVector::const_iterator visit = visits->begin();
       visit != visits->end(); ++visit) {
    if (content::PageTransitionIsMainFrame(visit->transition)) {
      if (visit->visit_time < threshold)
        ++num_visits_24h_ago;
      // Compare on the core transition type, without qualifier bits.
      const content::PageTransition core_transition =
          content::PageTransitionStripQualifier(visit->transition);
      if (core_transition == content::PAGE_TRANSITION_TYPED) {
        ++num_visits_typed;
      } else if (core_transition == content::PAGE_TRANSITION_LINK) {
        ++num_visits_link;
      }
    }
  }
  AddFeature(features::kUrlHistoryVisitCountMoreThan24hAgo,
             static_cast<double>(num_visits_24h_ago),
             request);
  AddFeature(features::kUrlHistoryTypedCount,
             static_cast<double>(num_visits_typed),
             request);
  AddFeature(features::kUrlHistoryLinkCount,
             static_cast<double>(num_visits_link),
             request);

  // Next step: look up the visit count for the request URL's host.
  HistoryService* history;
  if (!GetHistoryService(&history)) {
    callback->Run(false, request);
    delete callback;
    return;
  }
  const GURL url(request->url());
  CancelableRequestProvider::Handle next_handle =
      history->GetVisibleVisitCountToHost(
          url,
          &request_consumer_,
          base::Bind(&BrowserFeatureExtractor::QueryHttpHostVisitsDone,
                     base::Unretained(this)));
  StorePendingQuery(next_handle, request, callback);
}
// Completion handler for the host visit-count lookup on the request URL;
// runs on the UI thread.
//
// On success it adds the HTTP host-visit features and issues the same lookup
// for the HTTPS form of the URL.  On failure, or if no history service is
// available, the callback is run with |false| and deleted.
//
// |handle|       identifies the pending query whose request/callback to
//                resume.
// |success|      whether the lookup succeeded.
// |num_visits|   visit count reported by the lookup.
// |first_visit|  first-visit time reported by the lookup.
void BrowserFeatureExtractor::QueryHttpHostVisitsDone(
    CancelableRequestProvider::Handle handle,
    bool success,
    int num_visits,
    base::Time first_visit) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  ClientPhishingRequest* request;
  DoneCallback* callback;
  if (!GetPendingQuery(handle, &request, &callback)) {
    DLOG(FATAL) << "No pending history query found";
    return;
  }
  DCHECK(request);
  DCHECK(callback);
  if (!success) {
    callback->Run(false, request);
    delete callback;
    return;
  }

  SetHostVisitsFeatures(num_visits, first_visit, true, request);

  // Repeat the lookup for the HTTPS URL.
  HistoryService* history;
  if (!GetHistoryService(&history)) {
    callback->Run(false, request);
    delete callback;
    return;
  }
  // Swap the first five characters of the URL (expected to be "http:", see
  // the DCHECK in ExtractFeatures) for "https:".
  std::string https_url(request->url());
  https_url.replace(0, 5, "https:");
  const GURL https_gurl(https_url);
  CancelableRequestProvider::Handle next_handle =
      history->GetVisibleVisitCountToHost(
          https_gurl,
          &request_consumer_,
          base::Bind(&BrowserFeatureExtractor::QueryHttpsHostVisitsDone,
                     base::Unretained(this)));
  StorePendingQuery(next_handle, request, callback);
}
// Completion handler for the HTTPS host visit-count lookup, the last step of
// the history lookups; runs on the UI thread.
//
// Adds the HTTPS host-visit features when the lookup succeeded, then runs the
// callback with the lookup's outcome and deletes it.
//
// |handle|       identifies the pending query whose request/callback to
//                resume.
// |success|      whether the lookup succeeded.
// |num_visits|   visit count reported by the lookup.
// |first_visit|  first-visit time reported by the lookup.
void BrowserFeatureExtractor::QueryHttpsHostVisitsDone(
    CancelableRequestProvider::Handle handle,
    bool success,
    int num_visits,
    base::Time first_visit) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  ClientPhishingRequest* request;
  DoneCallback* callback;
  if (!GetPendingQuery(handle, &request, &callback)) {
    DLOG(FATAL) << "No pending history query found";
    return;
  }
  DCHECK(request);
  DCHECK(callback);

  if (success)
    SetHostVisitsFeatures(num_visits, first_visit, false, request);

  // All history lookups are finished; report the outcome either way.
  callback->Run(success, request);
  delete callback;
}
void BrowserFeatureExtractor::SetHostVisitsFeatures(
int num_visits,
base::Time first_visit,
bool is_http_query,
ClientPhishingRequest* request) {
DCHECK(request);
AddFeature(is_http_query ?
features::kHttpHostVisitCount : features::kHttpsHostVisitCount,
static_cast<double>(num_visits),
request);
if (num_visits > 0) {
AddFeature(
is_http_query ?
features::kFirstHttpHostVisitMoreThan24hAgo :
features::kFirstHttpsHostVisitMoreThan24hAgo,
(first_visit < (base::Time::Now() - base::TimeDelta::FromDays(1))) ?
1.0 : 0.0,
request);
}
}
// Records |request| and |callback| under |handle| in |pending_queries_| so
// that the history completion handler can retrieve them via GetPendingQuery.
// |handle| is expected not to be present in the map yet.
void BrowserFeatureExtractor::StorePendingQuery(
    CancelableRequestProvider::Handle handle,
    ClientPhishingRequest* request,
    DoneCallback* callback) {
  DCHECK_EQ(0U, pending_queries_.count(handle));
  ExtractionData& slot = pending_queries_[handle];
  slot = ExtractionData(request, callback);
}
// Looks up and removes the pending query stored under |handle|.
//
// Returns true and fills |*request| and |*callback| when an entry exists.
// Returns false, leaving both out-parameters untouched, when no entry is
// stored for |handle| (which also trips the DCHECK below).
bool BrowserFeatureExtractor::GetPendingQuery(
    CancelableRequestProvider::Handle handle,
    ClientPhishingRequest** request,
    DoneCallback** callback) {
  PendingQueriesMap::iterator it = pending_queries_.find(handle);
  DCHECK(it != pending_queries_.end());
  if (it == pending_queries_.end())
    return false;

  // Copy the pair out before the map entry is erased.
  const ExtractionData found = it->second;
  pending_queries_.erase(it);
  *request = found.first;
  *callback = found.second;
  return true;
}
// Fetches the history service of the profile that owns |tab_|.
//
// |*history| is always written: it is set to the service on success and to
// NULL otherwise.  Returns true only when a non-NULL service was obtained; a
// failure is logged at verbosity level 2.
bool BrowserFeatureExtractor::GetHistoryService(HistoryService** history) {
  HistoryService* service = NULL;
  if (tab_ && tab_->browser_context()) {
    Profile* profile = Profile::FromBrowserContext(tab_->browser_context());
    service = profile->GetHistoryService(Profile::EXPLICIT_ACCESS);
  }
  *history = service;
  if (!service) {
    VLOG(2) << "Unable to query history.  No history service available.";
    return false;
  }
  return true;
}
} // namespace safe_browsing