blob: cf260bb5823d1ac449da3dae92556c442a5a3e78 [file] [log] [blame]
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/renderer/safe_browsing/phishing_classifier.h"
#include <string>
#include "base/bind.h"
#include "base/callback.h"
#include "base/compiler_specific.h"
#include "base/location.h"
#include "base/logging.h"
#include "base/metrics/histogram.h"
#include "base/single_thread_task_runner.h"
#include "base/strings/string_util.h"
#include "base/threading/thread_task_runner_handle.h"
#include "chrome/common/safe_browsing/csd.pb.h"
#include "chrome/common/url_constants.h"
#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
#include "chrome/renderer/safe_browsing/scorer.h"
#include "content/public/renderer/render_frame.h"
#include "crypto/sha2.h"
#include "third_party/WebKit/public/platform/WebURL.h"
#include "third_party/WebKit/public/platform/WebURLRequest.h"
#include "third_party/WebKit/public/web/WebDataSource.h"
#include "third_party/WebKit/public/web/WebDocument.h"
#include "third_party/WebKit/public/web/WebLocalFrame.h"
#include "third_party/WebKit/public/web/WebView.h"
#include "url/gurl.h"
namespace safe_browsing {
const float PhishingClassifier::kInvalidScore = -1.0;
const float PhishingClassifier::kPhishyThreshold = 0.5;
namespace {
// Used for UMA, do not reorder.
enum SkipClassificationReason {
CLASSIFICATION_PROCEED = 0,
SKIP_HTTPS = 1,
SKIP_NONE_GET = 2,
SKIP_REASON_MAX
};
void RecordReasonForSkippingClassificationToUMA(
SkipClassificationReason reason) {
UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.SkipClassificationReason",
reason,
SKIP_REASON_MAX);
}
} // namespace
PhishingClassifier::PhishingClassifier(content::RenderFrame* render_frame,
FeatureExtractorClock* clock)
: render_frame_(render_frame),
scorer_(NULL),
clock_(clock),
weak_factory_(this) {
Clear();
}
PhishingClassifier::~PhishingClassifier() {
// The RenderView should have called CancelPendingClassification() before
// we are destroyed.
CheckNoPendingClassification();
}
void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
CheckNoPendingClassification();
scorer_ = scorer;
if (scorer_) {
url_extractor_.reset(new PhishingUrlFeatureExtractor);
dom_extractor_.reset(new PhishingDOMFeatureExtractor(clock_.get()));
term_extractor_.reset(new PhishingTermFeatureExtractor(
&scorer_->page_terms(),
&scorer_->page_words(),
scorer_->max_words_per_term(),
scorer_->murmurhash3_seed(),
scorer_->max_shingles_per_page(),
scorer_->shingle_size(),
clock_.get()));
} else {
// We're disabling client-side phishing detection, so tear down all
// of the relevant objects.
url_extractor_.reset();
dom_extractor_.reset();
term_extractor_.reset();
}
}
bool PhishingClassifier::is_ready() const {
return scorer_ != NULL;
}
void PhishingClassifier::BeginClassification(
const base::string16* page_text,
const DoneCallback& done_callback) {
DCHECK(is_ready());
// The RenderView should have called CancelPendingClassification() before
// starting a new classification, so DCHECK this.
CheckNoPendingClassification();
// However, in an opt build, we will go ahead and clean up the pending
// classification so that we can start in a known state.
CancelPendingClassification();
page_text_ = page_text;
done_callback_ = done_callback;
// For consistency, we always want to invoke the DoneCallback
// asynchronously, rather than directly from this method. To ensure that
// this is the case, post a task to begin feature extraction on the next
// iteration of the message loop.
base::ThreadTaskRunnerHandle::Get()->PostTask(
FROM_HERE, base::Bind(&PhishingClassifier::BeginFeatureExtraction,
weak_factory_.GetWeakPtr()));
}
void PhishingClassifier::BeginFeatureExtraction() {
blink::WebLocalFrame* frame = render_frame_->GetWebFrame();
// Check whether the URL is one that we should classify.
// Currently, we only classify http: URLs that are GET requests.
GURL url(frame->document().url());
if (!url.SchemeIs(url::kHttpScheme)) {
RecordReasonForSkippingClassificationToUMA(SKIP_HTTPS);
RunFailureCallback();
return;
}
blink::WebDataSource* ds = frame->dataSource();
if (!ds ||
!base::EqualsASCII(base::StringPiece16(ds->request().httpMethod()),
"GET")) {
if (ds)
RecordReasonForSkippingClassificationToUMA(SKIP_NONE_GET);
RunFailureCallback();
return;
}
RecordReasonForSkippingClassificationToUMA(CLASSIFICATION_PROCEED);
features_.reset(new FeatureMap);
if (!url_extractor_->ExtractFeatures(url, features_.get())) {
RunFailureCallback();
return;
}
// DOM feature extraction can take awhile, so it runs asynchronously
// in several chunks of work and invokes the callback when finished.
dom_extractor_->ExtractFeatures(
frame->document(), features_.get(),
base::Bind(&PhishingClassifier::DOMExtractionFinished,
base::Unretained(this)));
}
void PhishingClassifier::CancelPendingClassification() {
// Note that cancelling the feature extractors is simply a no-op if they
// were not running.
DCHECK(is_ready());
dom_extractor_->CancelPendingExtraction();
term_extractor_->CancelPendingExtraction();
weak_factory_.InvalidateWeakPtrs();
Clear();
}
void PhishingClassifier::DOMExtractionFinished(bool success) {
shingle_hashes_.reset(new std::set<uint32_t>);
if (success) {
// Term feature extraction can take awhile, so it runs asynchronously
// in several chunks of work and invokes the callback when finished.
term_extractor_->ExtractFeatures(
page_text_,
features_.get(),
shingle_hashes_.get(),
base::Bind(&PhishingClassifier::TermExtractionFinished,
base::Unretained(this)));
} else {
RunFailureCallback();
}
}
void PhishingClassifier::TermExtractionFinished(bool success) {
if (success) {
blink::WebLocalFrame* main_frame = render_frame_->GetWebFrame();
// Hash all of the features so that they match the model, then compute
// the score.
FeatureMap hashed_features;
ClientPhishingRequest verdict;
verdict.set_model_version(scorer_->model_version());
verdict.set_url(main_frame->document().url().string().utf8());
for (base::hash_map<std::string, double>::const_iterator it =
features_->features().begin();
it != features_->features().end(); ++it) {
DVLOG(2) << "Feature: " << it->first << " = " << it->second;
bool result = hashed_features.AddRealFeature(
crypto::SHA256HashString(it->first), it->second);
DCHECK(result);
ClientPhishingRequest::Feature* feature = verdict.add_feature_map();
feature->set_name(it->first);
feature->set_value(it->second);
}
for (std::set<uint32_t>::const_iterator it = shingle_hashes_->begin();
it != shingle_hashes_->end(); ++it) {
verdict.add_shingle_hashes(*it);
}
float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
verdict.set_client_score(score);
verdict.set_is_phishing(score >= kPhishyThreshold);
RunCallback(verdict);
} else {
RunFailureCallback();
}
}
void PhishingClassifier::CheckNoPendingClassification() {
DCHECK(done_callback_.is_null());
DCHECK(!page_text_);
if (!done_callback_.is_null() || page_text_) {
LOG(ERROR) << "Classification in progress, missing call to "
<< "CancelPendingClassification";
}
}
void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) {
done_callback_.Run(verdict);
Clear();
}
void PhishingClassifier::RunFailureCallback() {
ClientPhishingRequest verdict;
// In this case we're not guaranteed to have a valid URL. Just set it
// to the empty string to make sure we have a valid protocol buffer.
verdict.set_url("");
verdict.set_client_score(kInvalidScore);
verdict.set_is_phishing(false);
RunCallback(verdict);
}
void PhishingClassifier::Clear() {
page_text_ = NULL;
done_callback_.Reset();
features_.reset(NULL);
shingle_hashes_.reset(NULL);
}
} // namespace safe_browsing