blob: 23ceefb610bb61c4c477c56270372c4fa708b0da [file] [log] [blame]
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/renderer/safe_browsing/scorer.h"
#include <math.h>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include "base/logging.h"
#include "base/metrics/histogram_macros.h"
#include "base/strings/string_piece.h"
#include "chrome/common/safe_browsing/client_model.pb.h"
#include "chrome/renderer/safe_browsing/features.h"
namespace {
// Outcome of an attempt to build a Scorer. RecordScorerCreationStatus() in
// this file reports these values to the
// "SBClientPhishing.ScorerCreationStatus" histogram. Retired entries are kept
// in place, presumably so the numeric values of the remaining entries stay
// stable.
enum ScorerCreationStatus {
  SCORER_SUCCESS = 0,
  SCORER_FAIL_MODEL_OPEN_FAIL = 1,       // Not used anymore
  SCORER_FAIL_MODEL_FILE_EMPTY = 2,      // Not used anymore
  SCORER_FAIL_MODEL_FILE_TOO_LARGE = 3,  // Not used anymore
  SCORER_FAIL_MODEL_PARSE_ERROR = 4,
  SCORER_FAIL_MODEL_MISSING_FIELDS = 5,
  // Always add new values before this one; it is used as the histogram
  // boundary and is implicitly one past the last real status.
  SCORER_STATUS_MAX
};
// Logs |status| to the "SBClientPhishing.ScorerCreationStatus" enumeration
// histogram, passing SCORER_STATUS_MAX as the boundary value.
void RecordScorerCreationStatus(ScorerCreationStatus status) {
  UMA_HISTOGRAM_ENUMERATION(
      "SBClientPhishing.ScorerCreationStatus", status, SCORER_STATUS_MAX);
}
} // namespace
namespace safe_browsing {
// Maps a log-odds value onto the matching probability in the range
// [0.0, 1.0], computed as odds / (odds + 1) with odds = e^log_odds.
static double LogOdds2Prob(double log_odds) {
  // 709 = floor(1023 * ln(2)), and 2**1023 is the largest finite double, so
  // exp() stays finite below this bound. Once the odds become +infinity,
  // odds / (odds + 1) would evaluate to NaN, so saturate at 1.0 instead.
  // Very small log odds need no guard: the odds simply become 0.
  const double kMaxSafeLogOdds = 709;
  if (log_odds >= kMaxSafeLogOdds) {
    return 1.0;
  }
  const double numerator = exp(log_odds);
  const double denominator = numerator + 1.0;
  return numerator / denominator;
}
// No explicit setup: every member is default-initialized. Create() fills in
// the model and lookup sets afterwards.
Scorer::Scorer() = default;
// No explicit teardown: members release their own storage.
Scorer::~Scorer() = default;
/* static */
// Builds a Scorer from |model_str|, which must hold a serialized
// ClientSideModel.
//
// Returns nullptr if the bytes cannot be parsed or the parsed model is not
// fully initialized; the reason is recorded through
// RecordScorerCreationStatus(). On success the status is recorded as
// SCORER_SUCCESS, the page term and page word lookup sets are populated from
// the model, and ownership of the returned Scorer passes to the caller.
Scorer* Scorer::Create(const base::StringPiece& model_str) {
  std::unique_ptr<Scorer> scorer(new Scorer());
  ClientSideModel& model = scorer->model_;
  if (!model.ParseFromArray(model_str.data(), model_str.size())) {
    DLOG(ERROR) << "Unable to parse phishing model. This Scorer object is "
                << "invalid.";
    RecordScorerCreationStatus(SCORER_FAIL_MODEL_PARSE_ERROR);
    return nullptr;
  }
  if (!model.IsInitialized()) {
    DLOG(ERROR) << "Unable to parse phishing model. The model is missing "
                << "some required fields. Maybe the .proto file changed?";
    RecordScorerCreationStatus(SCORER_FAIL_MODEL_MISSING_FIELDS);
    return nullptr;
  }
  RecordScorerCreationStatus(SCORER_SUCCESS);
  for (int i = 0; i < model.page_term_size(); ++i) {
    // Each page term is an index into the model's hash table. The index comes
    // straight from the parsed model, so check it before using it: an
    // out-of-range index would otherwise read past the end of |hashes|.
    const int hash_index = model.page_term(i);
    if (hash_index < 0 || hash_index >= model.hashes_size()) {
      DLOG(ERROR) << "Ignoring page term with out-of-range hash index "
                  << hash_index << ".";
      continue;
    }
    scorer->page_terms_.insert(model.hashes(hash_index));
  }
  for (int i = 0; i < model.page_word_size(); ++i) {
    scorer->page_words_.insert(model.page_word(i));
  }
  return scorer.release();
}
double Scorer::ComputeScore(const FeatureMap& features) const {
double logodds = 0.0;
for (int i = 0; i < model_.rule_size(); ++i) {
logodds += ComputeRuleScore(model_.rule(i), features);
}
return LogOdds2Prob(logodds);
}
// Returns the value of the |version| field of the model held by this Scorer.
int Scorer::model_version() const {
return model_.version();
}
// Returns, by const reference, the set of page term hashes that Create()
// collected from the model.
const std::unordered_set<std::string>& Scorer::page_terms() const {
return page_terms_;
}
// Returns, by const reference, the set of page word values that Create()
// copied from the model's |page_word| list.
const std::unordered_set<uint32_t>& Scorer::page_words() const {
return page_words_;
}
// Returns the model's |max_words_per_term| value as a size_t.
size_t Scorer::max_words_per_term() const {
  // The cast spells out any conversion to size_t that the return statement
  // would otherwise apply implicitly.
  return static_cast<size_t>(model_.max_words_per_term());
}
// Returns the value of the model's |murmur_hash_seed| field.
uint32_t Scorer::murmurhash3_seed() const {
return model_.murmur_hash_seed();
}
// Returns the model's |max_shingles_per_page| value as a size_t.
size_t Scorer::max_shingles_per_page() const {
  // The cast spells out any conversion to size_t that the return statement
  // would otherwise apply implicitly.
  return static_cast<size_t>(model_.max_shingles_per_page());
}
// Returns the model's |shingle_size| value as a size_t.
size_t Scorer::shingle_size() const {
  // The cast spells out any conversion to size_t that the return statement
  // would otherwise apply implicitly.
  return static_cast<size_t>(model_.shingle_size());
}
// Computes the contribution of a single |rule| to the overall log odds.
//
// Each entry of the rule is an index into the model's hash table; the hash at
// that index is looked up in |features|. The rule score is the product of the
// looked-up feature values multiplied by the rule's weight. A rule without
// any features therefore contributes exactly its weight.
//
// Returns 0.0 as soon as a feature is absent from |features|, has a value of
// zero, or refers to a hash index that does not exist in the model.
double Scorer::ComputeRuleScore(const ClientSideModel::Rule& rule,
                                const FeatureMap& features) const {
  const std::unordered_map<std::string, double>& feature_map =
      features.features();
  double rule_score = 1.0;
  for (int i = 0; i < rule.feature_size(); ++i) {
    // The index comes straight from the parsed model. An index outside of
    // |hashes| cannot name any feature, so treat it like a missing feature
    // instead of reading past the end of the repeated field.
    const int hash_index = rule.feature(i);
    if (hash_index < 0 || hash_index >= model_.hashes_size()) {
      return 0.0;
    }
    const auto it = feature_map.find(model_.hashes(hash_index));
    if (it == feature_map.end() || it->second == 0.0) {
      // If the feature of the rule does not exist in the given feature map the
      // feature weight is considered to be zero. If the feature weight is zero
      // we leave early since we know that the rule score will be zero.
      return 0.0;
    }
    rule_score *= it->second;
  }
  return rule_score * rule.weight();
}
} // namespace safe_browsing