// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This class handles the process of extracting all of the features from a
// page and computing a phishiness score. The basic steps are:
// - Run each feature extractor over the page, building up a FeatureMap of
// feature -> value.
// - SHA-256 hash all of the feature names in the map so that they match the
// supplied model.
// - Hand the hashed map off to a Scorer, which computes the probability that
// the page is phishy.
// - If the page is phishy, run the supplied callback.
// For more details, see phishing_*_feature_extractor.h, scorer.h, and
// client_model.proto.
#include <stdint.h>
#include <memory>
#include <set>
#include "base/callback.h"
#include "base/macros.h"
#include "base/memory/weak_ptr.h"
#include "base/strings/string16.h"
// Forward declaration only: RenderFrame is referred to by pointer in this
// file, so its full definition is not required here. The namespace is closed
// immediately so that later declarations are not nested inside content::.
namespace content {
class RenderFrame;
}  // namespace content
namespace safe_browsing {
// Forward declarations: the PhishingClassifier declaration below refers to
// these types only through pointers, references, or std::unique_ptr, so
// their full definitions are not needed here.
class ClientPhishingRequest;
class FeatureExtractorClock;
class FeatureMap;
class PhishingDOMFeatureExtractor;
class PhishingTermFeatureExtractor;
class PhishingUrlFeatureExtractor;
class Scorer;
// Drives feature extraction over a page and computes a phishiness score,
// reporting the outcome through a DoneCallback. The classifier is not usable
// until a Scorer has been supplied via set_phishing_scorer().
class PhishingClassifier {
 public:
  // Callback to be run when phishing classification finishes. The verdict
  // is a ClientPhishingRequest which contains the verdict computed by the
  // classifier as well as the extracted features. If the verdict.is_phishing()
  // is true, the page is considered phishy by the client-side model,
  // and the browser should ping back to get a final verdict. The
  // verdict.client_score() is set to kInvalidScore if classification failed.
  using DoneCallback =
      base::Callback<void(const ClientPhishingRequest& /* verdict */)>;

  // Score placed in the verdict when classification could not be completed.
  static const float kInvalidScore;

  // Creates a new PhishingClassifier object that will operate on
  // |render_frame|. |clock| is used to time feature extractor operations, and
  // the PhishingClassifier takes ownership of this object. Note that the
  // classifier will not be 'ready' until set_phishing_scorer() is called.
  PhishingClassifier(content::RenderFrame* render_frame,
                     FeatureExtractorClock* clock);
  virtual ~PhishingClassifier();

  // The classifier exclusively owns its extractors and in-progress state
  // (std::unique_ptr members), so copying is explicitly disallowed.
  PhishingClassifier(const PhishingClassifier&) = delete;
  PhishingClassifier& operator=(const PhishingClassifier&) = delete;

  // Sets a scorer for the classifier to use in computing the phishiness score.
  // This must live at least as long as the PhishingClassifier. The caller is
  // expected to cancel any pending classification before setting a phishing
  // scorer.
  void set_phishing_scorer(const Scorer* scorer);

  // Returns true if the classifier is ready to classify pages, i.e. it
  // has had a scorer set via set_phishing_scorer().
  bool is_ready() const;

  // Called by the RenderView when a page has finished loading. This begins
  // the feature extraction and scoring process. |page_text| should contain
  // the plain text of a web page, including any subframes, as returned by
  // RenderView::CaptureText(). |page_text| is owned by the caller, and must
  // not be destroyed until either |done_callback| is run or
  // CancelPendingClassification() is called.
  //
  // To avoid blocking the render thread for too long, phishing classification
  // may run in several chunks of work, posting a task to the current
  // MessageLoop to continue processing. Once the scoring process is complete,
  // |done_callback| is run on the current thread. PhishingClassifier takes
  // ownership of the callback.
  //
  // It is an error to call BeginClassification if the classifier is not yet
  // ready.
  virtual void BeginClassification(const base::string16* page_text,
                                   const DoneCallback& callback);

  // Called by the RenderView (on the render thread) when a page is unloading
  // or the RenderView is being destroyed. This cancels any extraction that
  // is in progress. It is an error to call CancelPendingClassification if
  // the classifier is not yet ready.
  virtual void CancelPendingClassification();

 private:
  // Any score equal to or above this value is considered phishy.
  static const float kPhishyThreshold;

  // Begins the feature extraction process, by extracting URL features and
  // beginning DOM feature extraction.
  void BeginFeatureExtraction();

  // Callback to be run when DOM feature extraction is complete.
  // If it was successful, begins term feature extraction, otherwise
  // runs the DoneCallback with a non-phishy verdict.
  void DOMExtractionFinished(bool success);

  // Callback to be run when term feature extraction is complete.
  // If it was successful, computes a score and runs the DoneCallback.
  // If extraction was unsuccessful, runs the DoneCallback with a
  // non-phishy verdict.
  void TermExtractionFinished(bool success);

  // Helper to verify that there is no pending phishing classification. Dies
  // in debug builds if the state is not as expected. This is a no-op in
  // release builds.
  void CheckNoPendingClassification();

  // Helper method to run the DoneCallback and clear the state.
  void RunCallback(const ClientPhishingRequest& verdict);

  // Helper to run the DoneCallback when feature extraction has failed.
  // This always signals a non-phishy verdict for the page, with kInvalidScore.
  void RunFailureCallback();

  // Clears the current state of the PhishingClassifier.
  void Clear();

  content::RenderFrame* render_frame_;  // owns us
  const Scorer* scorer_;                // owned by the caller
  std::unique_ptr<FeatureExtractorClock> clock_;
  std::unique_ptr<PhishingUrlFeatureExtractor> url_extractor_;
  std::unique_ptr<PhishingDOMFeatureExtractor> dom_extractor_;
  std::unique_ptr<PhishingTermFeatureExtractor> term_extractor_;

  // State for any in-progress extraction.
  std::unique_ptr<FeatureMap> features_;
  std::unique_ptr<std::set<uint32_t>> shingle_hashes_;
  const base::string16* page_text_;  // owned by the caller
  DoneCallback done_callback_;

  // Used in scheduling BeginFeatureExtraction tasks.
  // These pointers are invalidated if classification is cancelled.
  base::WeakPtrFactory<PhishingClassifier> weak_factory_;
};
} // namespace safe_browsing