blob: be48a4e663c93d8aadc737657a8e2ca8387390dc [file] [log] [blame]
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This test performs a series false positive checks using a list of URLs
// against a known set of SafeBrowsing data.
//
// It uses a normal SafeBrowsing database to create a bloom filter where it
// looks up all the URLs in the url file. A URL that has a prefix found in the
// bloom filter and found in the database is considered a hit: a valid lookup
// that will result in a gethash request. A URL that has a prefix found in the
// bloom filter but not in the database is a miss: a false positive lookup that
// will result in an unnecessary gethash request.
//
// By varying the size of the bloom filter and using a constant set of
// SafeBrowsing data, we can check a known set of URLs against the filter and
// determine the false positive rate.
//
// False positive calculation usage:
// $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.FalsePositives
// --filter-start=<integer>
// --filter-steps=<integer>
// --filter-verbose
//
// --filter-start: The filter multiplier to begin with. This represents the
// number of bits per prefix of memory to use in the filter.
// The default value is identical to the current SafeBrowsing
// database value.
// --filter-steps: The number of iterations to run, with each iteration
// increasing the filter multiplier by 1. The default value
// is 1.
// --filter-verbose: Used to print out the hit / miss results per URL.
// --filter-csv: The URL file contains information about the number of
// unique views (the popularity) of each URL. See the format
// description below.
//
//
// Hash compute time usage:
// $ ./perf_tests.exe --gtest_filter=SafeBrowsingBloomFilter.HashTime
// --filter-num-checks=<integer>
//
// --filter-num-checks: The number of hash look ups to perform on the bloom
// filter. The default is 10 million.
//
// Data files:
// chrome/test/data/safe_browsing/filter/database
// chrome/test/data/safe_browsing/filter/urls
//
// database: A normal SafeBrowsing database.
// urls: A text file containing a list of URLs, one per line. If the option
// --filter-csv is specified, the format of each line in the file is
// <url>,<weight> where weight is an integer indicating the number of
// unique views for the URL.
#include <algorithm>
#include <fstream>
#include <vector>
#include "base/command_line.h"
#include "base/file_path.h"
#include "base/file_util.h"
#include "base/logging.h"
#include "base/memory/scoped_ptr.h"
#include "base/path_service.h"
#include "base/rand_util.h"
#include "base/string_number_conversions.h"
#include "base/string_util.h"
#include "base/time.h"
#include "chrome/browser/safe_browsing/bloom_filter.h"
#include "chrome/browser/safe_browsing/safe_browsing_util.h"
#include "chrome/common/chrome_paths.h"
#include "crypto/sha2.h"
#include "googleurl/src/gurl.h"
#include "sql/statement.h"
#include "testing/gtest/include/gtest/gtest.h"
using base::Time;
using base::TimeDelta;
namespace {
// Command line flags.
const char kFilterVerbose[] = "filter-verbose";
const char kFilterStart[] = "filter-start";
const char kFilterSteps[] = "filter-steps";
const char kFilterCsv[] = "filter-csv";
const char kFilterNumChecks[] = "filter-num-checks";
// Number of hash checks to make during performance testing.
const int kNumHashChecks = 10000000;
// Returns the path to the data used in this test, relative to the top of the
// source directory.
FilePath GetFullDataPath() {
FilePath full_path;
CHECK(PathService::Get(chrome::DIR_TEST_DATA, &full_path));
full_path = full_path.Append(FILE_PATH_LITERAL("safe_browsing"));
full_path = full_path.Append(FILE_PATH_LITERAL("filter"));
CHECK(file_util::PathExists(full_path));
return full_path;
}
// Constructs a bloom filter of the appropriate size from the provided prefixes.
void BuildBloomFilter(int size_multiplier,
const std::vector<SBPrefix>& prefixes,
BloomFilter** bloom_filter) {
// Create a BloomFilter with the specified size.
const int key_count = std::max(static_cast<int>(prefixes.size()),
BloomFilter::kBloomFilterMinSize);
const int filter_size = key_count * size_multiplier;
*bloom_filter = new BloomFilter(filter_size);
// Add the prefixes to it.
for (size_t i = 0; i < prefixes.size(); ++i)
(*bloom_filter)->Insert(prefixes[i]);
std::cout << "Bloom filter with prefixes: " << prefixes.size()
<< ", multiplier: " << size_multiplier
<< ", size (bytes): " << (*bloom_filter)->size()
<< std::endl;
}
// Reads the set of add prefixes contained in a SafeBrowsing database into a
// sorted array suitable for fast searching. This takes significantly less time
// to look up a given prefix than performing SQL queries.
bool ReadDatabase(const FilePath& path, std::vector<SBPrefix>* prefixes) {
FilePath database_file = path.Append(FILE_PATH_LITERAL("database"));
sql::Connection db;
if (!db.Open(database_file))
return false;
// Get the number of items in the add_prefix table.
const char* query = "SELECT COUNT(*) FROM add_prefix";
sql::Statement count_statement(db.GetCachedStatement(SQL_FROM_HERE, query));
if (!count_statement)
return false;
if (!count_statement.Step())
return false;
const int count = count_statement.ColumnInt(0);
// Load them into a prefix vector and sort.
prefixes->reserve(count);
query = "SELECT prefix FROM add_prefix";
sql::Statement prefix_statement(db.GetCachedStatement(SQL_FROM_HERE, query));
if (!prefix_statement)
return false;
while (prefix_statement.Step())
prefixes->push_back(prefix_statement.ColumnInt(0));
DCHECK(static_cast<int>(prefixes->size()) == count);
sort(prefixes->begin(), prefixes->end());
return true;
}
// Generates all legal SafeBrowsing prefixes for the specified URL, and returns
// the set of Prefixes that exist in the bloom filter. The function returns the
// number of host + path combinations checked.
int GeneratePrefixHits(const std::string url,
BloomFilter* bloom_filter,
std::vector<SBPrefix>* prefixes) {
GURL url_check(url);
std::vector<std::string> hosts;
if (url_check.HostIsIPAddress()) {
hosts.push_back(url_check.host());
} else {
safe_browsing_util::GenerateHostsToCheck(url_check, &hosts);
}
std::vector<std::string> paths;
safe_browsing_util::GeneratePathsToCheck(url_check, &paths);
for (size_t i = 0; i < hosts.size(); ++i) {
for (size_t j = 0; j < paths.size(); ++j) {
SBPrefix prefix;
crypto::SHA256HashString(hosts[i] + paths[j], &prefix, sizeof(prefix));
if (bloom_filter->Exists(prefix))
prefixes->push_back(prefix);
}
}
return hosts.size() * paths.size();
}
// Binary search of sorted prefixes.
bool IsPrefixInDatabase(SBPrefix prefix,
const std::vector<SBPrefix>& prefixes) {
if (prefixes.empty())
return false;
int low = 0;
int high = prefixes.size() - 1;
while (low <= high) {
int mid = ((unsigned int)low + (unsigned int)high) >> 1;
SBPrefix prefix_mid = prefixes[mid];
if (prefix_mid == prefix)
return true;
if (prefix_mid < prefix)
low = mid + 1;
else
high = mid - 1;
}
return false;
}
// Construct a bloom filter with the given prefixes and multiplier, and test the
// false positive rate (misses) against a URL list.
void CalculateBloomFilterFalsePositives(
int size_multiplier,
const FilePath& data_dir,
const std::vector<SBPrefix>& prefix_list) {
BloomFilter* bloom_filter = NULL;
BuildBloomFilter(size_multiplier, prefix_list, &bloom_filter);
scoped_refptr<BloomFilter> scoped_filter(bloom_filter);
// Read in data file line at a time.
FilePath url_file = data_dir.Append(FILE_PATH_LITERAL("urls"));
std::ifstream url_stream(url_file.value().c_str());
// Keep track of stats
int hits = 0;
int misses = 0;
int weighted_hits = 0;
int weighted_misses = 0;
int url_count = 0;
int prefix_count = 0;
// Print out volumes of data (per URL hit and miss information).
bool verbose = CommandLine::ForCurrentProcess()->HasSwitch(kFilterVerbose);
bool use_weights = CommandLine::ForCurrentProcess()->HasSwitch(kFilterCsv);
std::string url;
while (std::getline(url_stream, url)) {
++url_count;
// Handle a format that contains URLs weighted by unique views.
int weight = 1;
if (use_weights) {
std::string::size_type pos = url.find_last_of(",");
if (pos != std::string::npos) {
base::StringToInt(url.begin() + pos + 1, url.end(), &weight);
url = url.substr(0, pos);
}
}
// See if the URL is in the bloom filter.
std::vector<SBPrefix> prefixes;
prefix_count += GeneratePrefixHits(url, bloom_filter, &prefixes);
// See if the prefix is actually in the database (in-memory prefix list).
for (size_t i = 0; i < prefixes.size(); ++i) {
if (IsPrefixInDatabase(prefixes[i], prefix_list)) {
++hits;
weighted_hits += weight;
if (verbose) {
std::cout << "Hit for URL: " << url
<< " (prefix = " << prefixes[i] << ")"
<< std::endl;
}
} else {
++misses;
weighted_misses += weight;
if (verbose) {
std::cout << "Miss for URL: " << url
<< " (prefix = " << prefixes[i] << ")"
<< std::endl;
}
}
}
}
// Print out the results for this test.
std::cout << "URLs checked: " << url_count
<< ", prefix compares: " << prefix_count
<< ", hits: " << hits
<< ", misses: " << misses;
if (use_weights) {
std::cout << ", weighted hits: " << weighted_hits
<< ", weighted misses: " << weighted_misses;
}
std::cout << std::endl;
}
} // namespace
// This test can take several minutes to perform its calculations, so it should
// be disabled until you need to run it.
TEST(SafeBrowsingBloomFilter, FalsePositives) {
std::vector<SBPrefix> prefix_list;
FilePath data_dir = GetFullDataPath();
ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list));
const CommandLine& cmd_line = *CommandLine::ForCurrentProcess();
int start = BloomFilter::kBloomFilterSizeRatio;
if (cmd_line.HasSwitch(kFilterStart)) {
ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterStart),
&start));
}
int steps = 1;
if (cmd_line.HasSwitch(kFilterSteps)) {
ASSERT_TRUE(base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterSteps),
&steps));
}
int stop = start + steps;
for (int multiplier = start; multiplier < stop; ++multiplier)
CalculateBloomFilterFalsePositives(multiplier, data_dir, prefix_list);
}
// Computes the time required for performing a number of look ups in a bloom
// filter. This is useful for measuring the performance of new hash functions.
TEST(SafeBrowsingBloomFilter, HashTime) {
// Read the data from the database.
std::vector<SBPrefix> prefix_list;
FilePath data_dir = GetFullDataPath();
ASSERT_TRUE(ReadDatabase(data_dir, &prefix_list));
const CommandLine& cmd_line = *CommandLine::ForCurrentProcess();
int num_checks = kNumHashChecks;
if (cmd_line.HasSwitch(kFilterNumChecks)) {
ASSERT_TRUE(
base::StringToInt(cmd_line.GetSwitchValueASCII(kFilterNumChecks),
&num_checks));
}
// Populate the bloom filter and measure the time.
BloomFilter* bloom_filter = NULL;
Time populate_before = Time::Now();
BuildBloomFilter(BloomFilter::kBloomFilterSizeRatio,
prefix_list, &bloom_filter);
TimeDelta populate = Time::Now() - populate_before;
// Check a large number of random prefixes against the filter.
int hits = 0;
Time check_before = Time::Now();
for (int i = 0; i < num_checks; ++i) {
uint32 prefix = static_cast<uint32>(base::RandUint64());
if (bloom_filter->Exists(prefix))
++hits;
}
TimeDelta check = Time::Now() - check_before;
int64 time_per_insert = populate.InMicroseconds() /
static_cast<int>(prefix_list.size());
int64 time_per_check = check.InMicroseconds() / num_checks;
std::cout << "Time results for checks: " << num_checks
<< ", prefixes: " << prefix_list.size()
<< ", populate time (ms): " << populate.InMilliseconds()
<< ", check time (ms): " << check.InMilliseconds()
<< ", hits: " << hits
<< ", per-populate (us): " << time_per_insert
<< ", per-check (us): " << time_per_check
<< std::endl;
}