// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/safe_browsing_db/prefix_set.h"
#include <limits.h>
#include <algorithm>
#include <utility>
#include "base/files/file_util.h"
#include "base/files/scoped_file.h"
#include "base/logging.h"
#include "base/md5.h"
#include "base/memory/ptr_util.h"
#include "base/metrics/histogram.h"
#include "base/metrics/histogram_macros.h"
namespace safe_browsing {
namespace {
// |kMagic| should be reasonably unique, and not match itself across
// endianness changes. I generated this value with:
// md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
static const uint32_t kMagic = 0x864088dd;
// Version history:
// Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10
// Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27
// Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05
// Version 2 layout is identical to version 1. The sort order of |index_|
// changed from |int32_t| to |uint32_t| to match the change of |SBPrefix|.
// Version 3 adds storage for full hashes.
static const uint32_t kVersion = 3;
static const uint32_t kDeprecatedVersion = 2;  // And lower.
typedef struct {
uint32_t magic;
uint32_t version;
uint32_t index_size;
uint32_t deltas_size;
uint32_t full_hashes_size;
} FileHeader;
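// On disk the header is followed by the raw |index_| pairs, then the
// |deltas_| array, then the |full_hashes_| array, and finally an MD5 digest
// of everything preceding it (see WriteFile() and LoadFile() below).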
// Common std::vector<> implementations add capacity by multiplying from the
// current size (usually either by 2 or 1.5) to satisfy push_back() running in
// amortized constant time. This is not necessary for insert() at end(), but
// AFAICT some implementations do it anyway. SBPrefix values should
// uniformly cover the 32-bit space, so the final size can be estimated given a
// subset of the input.
//
// |kEstimateThreshold| is when estimates start converging. Results are strong
// starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of
// resizing capacity from >50% to 100%.
//
// TODO(shess): I'm sure there is math in the world to describe good settings
// for estimating the size of a uniformly-distributed set of integers from a
// sorted subset. I do not have such math in me, so I assumed that my current
// organic database of prefixes was scale-free, and wrote a script to see how
// often given slop values would always suffice for given strides. At 1<<30,
// .5% slop was sufficient to cover all cases (though the code below uses 1%).
//
// TODO(shess): A smaller threshold uses less transient space in reallocation.
// 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost
// is that a smaller threshold needs more slop (locked down for the long term).
// 1<<29 worked well with 1%, 1<<27 worked well with 2%.
const SBPrefix kEstimateThreshold = 1 << 30;
size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) {
// estimated_count / current_count == estimated_max / current_prefix
// For large input sets, estimated_max of 2^32 is close enough.
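// As an illustration: if 1,000,000 deltas have been stored by the time the
// current prefix reaches roughly 1<<30 (one quarter of the 32-bit space),
// the final count is estimated as 4,000,000, plus the 1% slop added below.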
const size_t estimated_prefix_count = static_cast<size_t>(
(static_cast<uint64_t>(current_count) << 32) / current_prefix);
// The estimate has an error bar; if the final total is below the estimate, no
// harm, but if it is above, a capacity resize will happen at nearly 100%. Add
// some slop to make sure all cases are covered.
return estimated_prefix_count + estimated_prefix_count / 100;
}
} // namespace
// For |std::upper_bound()| to find a prefix w/in a vector of pairs.
// static
bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {
return a.first < b.first;
}
PrefixSet::PrefixSet() {
}
PrefixSet::PrefixSet(IndexVector* index,
std::vector<uint16_t>* deltas,
std::vector<SBFullHash>* full_hashes) {
DCHECK(index && deltas && full_hashes);
index_.swap(*index);
deltas_.swap(*deltas);
full_hashes_.swap(*full_hashes);
}
PrefixSet::~PrefixSet() {}
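// |index_| holds sorted (prefix, offset-into-|deltas_|) pairs, and |deltas_|
// holds 16-bit increments. For example, the prefixes {100, 101, 200} might be
// stored as index_ = {(100, 0)} and deltas_ = {1, 99}: starting from 100,
// adding 1 yields 101 and adding 99 yields 200.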
bool PrefixSet::PrefixExists(SBPrefix prefix) const {
if (index_.empty())
return false;
// Find the first position after |prefix| in |index_|.
IndexVector::const_iterator iter =
std::upper_bound(index_.begin(), index_.end(),
IndexPair(prefix, 0), PrefixLess);
// |prefix| comes before anything that's in the set.
if (iter == index_.begin())
return false;
// Capture the upper bound of our target entry's deltas.
const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);
// Back up to the entry our target is in.
--iter;
// All prefixes in |index_| are in the set.
SBPrefix current = iter->first;
if (current == prefix)
return true;
// Scan forward accumulating deltas while a match is possible.
for (size_t di = iter->second; di < bound && current < prefix; ++di) {
current += deltas_[di];
}
return current == prefix;
}
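// A full hash is present if it matches a stored full hash exactly, or if its
// 32-bit prefix is in the prefix set.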
bool PrefixSet::Exists(const SBFullHash& hash) const {
if (std::binary_search(full_hashes_.begin(), full_hashes_.end(),
hash, SBFullHashLess)) {
return true;
}
return PrefixExists(hash.prefix);
}
void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {
prefixes->reserve(index_.size() + deltas_.size());
for (size_t ii = 0; ii < index_.size(); ++ii) {
// The deltas for this |index_| entry run to the next index entry,
// or the end of the deltas.
const size_t deltas_end =
(ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();
SBPrefix current = index_[ii].first;
prefixes->push_back(current);
for (size_t di = index_[ii].second; di < deltas_end; ++di) {
current += deltas_[di];
prefixes->push_back(current);
}
}
}
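// Reads a PrefixSet back from disk. The file is rejected unless its size
// exactly matches the counts in the header and the trailing MD5 digest
// matches a digest recomputed over everything that precedes it.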
// static
std::unique_ptr<const PrefixSet> PrefixSet::LoadFile(
const base::FilePath& filter_name) {
int64_t size_64;
if (!base::GetFileSize(filter_name, &size_64))
return nullptr;
using base::MD5Digest;
if (size_64 < static_cast<int64_t>(sizeof(FileHeader) + sizeof(MD5Digest)))
return nullptr;
base::ScopedFILE file(base::OpenFile(filter_name, "rb"));
if (!file.get())
return nullptr;
FileHeader header;
size_t read = fread(&header, sizeof(header), 1, file.get());
if (read != 1)
return nullptr;
// The file looks valid, start building the digest.
base::MD5Context context;
base::MD5Init(&context);
base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
sizeof(header)));
if (header.magic != kMagic)
return nullptr;
// Track version read to inform removal of support for older versions.
UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version);
if (header.version <= kDeprecatedVersion) {
return nullptr;
} else if (header.version != kVersion) {
return nullptr;
}
IndexVector index;
const size_t index_bytes = sizeof(index[0]) * header.index_size;
std::vector<uint16_t> deltas;
const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;
std::vector<SBFullHash> full_hashes;
const size_t full_hashes_bytes =
sizeof(full_hashes[0]) * header.full_hashes_size;
// Check for bogus sizes before allocating any space.
const size_t expected_bytes = sizeof(header) +
index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest);
if (static_cast<int64_t>(expected_bytes) != size_64)
return nullptr;
// Read the index vector. Herb Sutter indicates that vectors are
// guaranteed to be contiguous, so reading to where element 0 lives
// is valid.
if (header.index_size) {
index.resize(header.index_size);
read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
if (read != index.size())
return nullptr;
base::MD5Update(&context,
base::StringPiece(reinterpret_cast<char*>(&(index[0])),
index_bytes));
}
// Read vector of deltas.
if (header.deltas_size) {
deltas.resize(header.deltas_size);
read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());
if (read != deltas.size())
return nullptr;
base::MD5Update(&context,
base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),
deltas_bytes));
}
// Read vector of full hashes.
if (header.full_hashes_size) {
full_hashes.resize(header.full_hashes_size);
read = fread(&(full_hashes[0]), sizeof(full_hashes[0]), full_hashes.size(),
file.get());
if (read != full_hashes.size())
return nullptr;
base::MD5Update(&context,
base::StringPiece(
reinterpret_cast<char*>(&(full_hashes[0])),
full_hashes_bytes));
}
base::MD5Digest calculated_digest;
base::MD5Final(&calculated_digest, &context);
base::MD5Digest file_digest;
read = fread(&file_digest, sizeof(file_digest), 1, file.get());
if (read != 1)
return nullptr;
if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
return nullptr;
// Steals vector contents using swap().
return base::WrapUnique(new PrefixSet(&index, &deltas, &full_hashes));
}
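// Writes the header, the raw contents of the three vectors, and a trailing
// MD5 digest computed over everything written before it.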
bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {
FileHeader header;
header.magic = kMagic;
header.version = kVersion;
header.index_size = static_cast<uint32_t>(index_.size());
header.deltas_size = static_cast<uint32_t>(deltas_.size());
header.full_hashes_size = static_cast<uint32_t>(full_hashes_.size());
// Sanity check that the 32-bit values never mess things up.
if (static_cast<size_t>(header.index_size) != index_.size() ||
static_cast<size_t>(header.deltas_size) != deltas_.size() ||
static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) {
NOTREACHED();
return false;
}
base::ScopedFILE file(base::OpenFile(filter_name, "wb"));
if (!file.get())
return false;
base::MD5Context context;
base::MD5Init(&context);
// TODO(shess): The I/O code in safe_browsing_store_file.cc would
// sure be useful about now.
size_t written = fwrite(&header, sizeof(header), 1, file.get());
if (written != 1)
return false;
base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
sizeof(header)));
// As for reads, the standard guarantees the ability to access the
// contents of the vector by a pointer to an element.
if (index_.size()) {
const size_t index_bytes = sizeof(index_[0]) * index_.size();
written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(),
file.get());
if (written != index_.size())
return false;
base::MD5Update(&context,
base::StringPiece(
reinterpret_cast<const char*>(&(index_[0])),
index_bytes));
}
if (deltas_.size()) {
const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();
written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),
file.get());
if (written != deltas_.size())
return false;
base::MD5Update(&context,
base::StringPiece(
reinterpret_cast<const char*>(&(deltas_[0])),
deltas_bytes));
}
if (full_hashes_.size()) {
const size_t elt_size = sizeof(full_hashes_[0]);
const size_t elts = full_hashes_.size();
const size_t full_hashes_bytes = elt_size * elts;
written = fwrite(&(full_hashes_[0]), elt_size, elts, file.get());
if (written != elts)
return false;
base::MD5Update(&context,
base::StringPiece(
reinterpret_cast<const char*>(&(full_hashes_[0])),
full_hashes_bytes));
}
base::MD5Digest digest;
base::MD5Final(&digest, &context);
written = fwrite(&digest, sizeof(digest), 1, file.get());
if (written != 1)
return false;
// TODO(shess): Can this code check that the close was successful?
file.reset();
return true;
}
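// Appends one |index_| entry and its run of deltas: |index_prefix| is stored
// verbatim in |index_|, and [run_begin, run_end) is appended to |deltas_|.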
void PrefixSet::AddRun(SBPrefix index_prefix,
const uint16_t* run_begin,
const uint16_t* run_end) {
// Preempt organic capacity decisions for |deltas_| once strong estimates can
// be made.
if (index_prefix > kEstimateThreshold &&
deltas_.capacity() < deltas_.size() + (run_end - run_begin)) {
deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size()));
}
index_.push_back(
std::make_pair(index_prefix, static_cast<uint32_t>(deltas_.size())));
deltas_.insert(deltas_.end(), run_begin, run_end);
}
PrefixSetBuilder::PrefixSetBuilder()
: prefix_set_(new PrefixSet()) {
}
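// |prefixes| must already be sorted in ascending order; exact duplicates are
// dropped by AddPrefix().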
PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes)
: prefix_set_(new PrefixSet()) {
for (size_t i = 0; i < prefixes.size(); ++i) {
AddPrefix(prefixes[i]);
}
}
PrefixSetBuilder::~PrefixSetBuilder() {
}
std::unique_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSet(
const std::vector<SBFullHash>& hashes) {
DCHECK(prefix_set_.get());
// Flush runs until buffered data is gone.
while (!buffer_.empty()) {
EmitRun();
}
// Precisely size |index_| for read-only. It's 50k-60k, so minor savings, but
// they're almost free.
PrefixSet::IndexVector(prefix_set_->index_).swap(prefix_set_->index_);
prefix_set_->full_hashes_ = hashes;
std::sort(prefix_set_->full_hashes_.begin(), prefix_set_->full_hashes_.end(),
SBFullHashLess);
return std::move(prefix_set_);
}
std::unique_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() {
return GetPrefixSet(std::vector<SBFullHash>());
}
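// Encodes a run of deltas from the front of |buffer_|. The first buffered
// prefix becomes a new |index_| entry; following prefixes are stored as
// 16-bit deltas until either |kMaxRun| deltas have been emitted or a gap
// larger than 65535 forces the run to end.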
void PrefixSetBuilder::EmitRun() {
DCHECK(prefix_set_.get());
SBPrefix prev_prefix = buffer_[0];
uint16_t run[PrefixSet::kMaxRun];
size_t run_pos = 0;
size_t i;
for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) {
// Calculate the delta. |unsigned| is mandatory, because the buffered
// prefixes could be more than INT_MAX apart.
DCHECK_GT(buffer_[i], prev_prefix);
const unsigned delta = buffer_[i] - prev_prefix;
const uint16_t delta16 = static_cast<uint16_t>(delta);
// Break the run if the delta doesn't fit.
if (delta != static_cast<unsigned>(delta16))
break;
// Continue the run of deltas.
run[run_pos++] = delta16;
DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta);
prev_prefix = buffer_[i];
}
prefix_set_->AddRun(buffer_[0], run, run + run_pos);
buffer_.erase(buffer_.begin(), buffer_.begin() + i);
}
void PrefixSetBuilder::AddPrefix(SBPrefix prefix) {
DCHECK(prefix_set_.get());
if (buffer_.empty()) {
DCHECK(prefix_set_->index_.empty());
DCHECK(prefix_set_->deltas_.empty());
} else {
// Drop duplicates.
if (buffer_.back() == prefix)
return;
DCHECK_LT(buffer_.back(), prefix);
}
buffer_.push_back(prefix);
// Flush buffer when a run can be constructed. +1 for the index item, and +1
// to leave at least one item in the buffer for dropping duplicates.
if (buffer_.size() > PrefixSet::kMaxRun + 2)
EmitRun();
}
} // namespace safe_browsing