// blob: 3b55b358b84aea47753bb04313be36621680982c [file] [log] [blame]
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
#define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
#include <set>
#include <vector>
#include "chrome/browser/safe_browsing/safe_browsing_store.h"
#include "base/callback.h"
#include "base/file_util.h"
// Implement SafeBrowsingStore in terms of a flat file. The file
// format is pretty literal:
//
// int32 magic; // magic number "validating" file
// int32 version; // format version
//
// // Counts for the various data which follows the header.
// uint32 add_chunk_count; // Chunks seen, including empties.
// uint32 sub_chunk_count; // Ditto.
// uint32 add_prefix_count;
// uint32 sub_prefix_count;
// uint32 add_hash_count;
// uint32 sub_hash_count;
//
// array[add_chunk_count] {
// int32 chunk_id;
// }
// array[sub_chunk_count] {
// int32 chunk_id;
// }
// array[add_prefix_count] {
// int32 chunk_id;
// int32 prefix;
// }
// array[sub_prefix_count] {
// int32 chunk_id;
// int32 add_chunk_id;
// int32 add_prefix;
// }
// array[add_hash_count] {
// int32 chunk_id;
// int32 received_time; // From base::Time::ToTimeT().
// char[32] full_hash;
// }
// array[sub_hash_count] {
// int32 chunk_id;
// int32 add_chunk_id;
// char[32] add_full_hash;
// }
// MD5Digest checksum; // Checksum over preceding data.
//
// During the course of an update, uncommitted data is stored in a
// temporary file (which is later re-used to commit). This is an
// array of chunks, with the count kept in memory until the end of the
// transaction. The format of this file is like the main file, with
// the list of chunks seen omitted, as that data is tracked in-memory:
//
// array[] {
// uint32 add_prefix_count;
// uint32 sub_prefix_count;
// uint32 add_hash_count;
// uint32 sub_hash_count;
// array[add_prefix_count] {
// int32 chunk_id;
// int32 prefix;
// }
// array[sub_prefix_count] {
// int32 chunk_id;
// int32 add_chunk_id;
// int32 add_prefix;
// }
// array[add_hash_count] {
// int32 chunk_id;
// int32 received_time; // From base::Time::ToTimeT().
// char[32] full_hash;
// }
// array[sub_hash_count] {
// int32 chunk_id;
// int32 add_chunk_id;
// char[32] add_full_hash;
// }
// }
//
// The overall transaction works like this:
// - Open the original file to get the chunks-seen data.
// - Open a temp file for storing new chunk info.
// - Write new chunks to the temp file.
// - When the transaction is finished:
// - Read the rest of the original file's data into buffers.
// - Rewind the temp file and merge the new data into buffers.
// - Process buffers for deletions and apply subs.
// - Rewind and write the buffers out to temp file.
// - Delete original file.
// - Rename temp file to original filename.
// TODO(shess): By using a checksum, this code can avoid doing an
// fsync(), at the possible cost of more frequently retrieving the
// full dataset. Measure how often this occurs, and if it occurs too
// often, consider retaining the last known-good file for recovery
// purposes, rather than deleting it.
class SafeBrowsingStoreFile : public SafeBrowsingStore {
 public:
  SafeBrowsingStoreFile();
  virtual ~SafeBrowsingStoreFile();

  virtual void Init(const FilePath& filename,
                    const base::Closure& corruption_callback) OVERRIDE;

  // Removes the store's on-disk files; the permanent storage is
  // deleted as well.
  virtual bool Delete() OVERRIDE;

  // Fetch every add hash prefix, and every add full-length hash, held
  // by the store.
  virtual bool GetAddPrefixes(SBAddPrefixes* add_prefixes) OVERRIDE;
  virtual bool GetAddFullHashes(
      std::vector<SBAddFullHash>* add_full_hashes) OVERRIDE;

  virtual bool BeginChunk() OVERRIDE;

  virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix) OVERRIDE;
  virtual bool WriteAddHash(int32 chunk_id,
                            base::Time receive_time,
                            const SBFullHash& full_hash) OVERRIDE;
  virtual bool WriteSubPrefix(int32 chunk_id,
                              int32 add_chunk_id,
                              SBPrefix prefix) OVERRIDE;
  virtual bool WriteSubHash(int32 chunk_id,
                            int32 add_chunk_id,
                            const SBFullHash& full_hash) OVERRIDE;

  virtual bool FinishChunk() OVERRIDE;

  virtual bool BeginUpdate() OVERRIDE;

  // Commits the update, including the pending add full hashes, to the
  // file store.  The resulting data is handed back through
  // |add_prefixes_result| and |add_full_hashes_result|.
  virtual bool FinishUpdate(
      const std::vector<SBAddFullHash>& pending_adds,
      const std::set<SBPrefix>& prefix_misses,
      SBAddPrefixes* add_prefixes_result,
      std::vector<SBAddFullHash>* add_full_hashes_result) OVERRIDE;

  virtual bool CancelUpdate() OVERRIDE;

  virtual void SetAddChunk(int32 chunk_id) OVERRIDE;
  virtual bool CheckAddChunk(int32 chunk_id) OVERRIDE;
  virtual void GetAddChunks(std::vector<int32>* out) OVERRIDE;
  virtual void SetSubChunk(int32 chunk_id) OVERRIDE;
  virtual bool CheckSubChunk(int32 chunk_id) OVERRIDE;
  virtual void GetSubChunks(std::vector<int32>* out) OVERRIDE;

  virtual void DeleteAddChunk(int32 chunk_id) OVERRIDE;
  virtual void DeleteSubChunk(int32 chunk_id) OVERRIDE;

  // Checks the checksum of |file_|.  If it does not check out, the
  // corruption callback is called.  Empty input counts as valid.
  virtual bool CheckValidity() OVERRIDE;

  // Name of the temporary file which buffers data for |filename|:
  // the same path with "_new" appended.  Exported for unit tests.
  static const FilePath TemporaryFileForFilename(const FilePath& filename) {
    const FilePath temporary_path(
        filename.value() + FILE_PATH_LITERAL("_new"));
    return temporary_path;
  }

 private:
  // Writes the pending full hashes into the store file.
  virtual bool DoUpdate(const std::vector<SBAddFullHash>& pending_adds,
                        const std::set<SBPrefix>& prefix_misses,
                        SBAddPrefixes* add_prefixes_result,
                        std::vector<SBAddFullHash>* add_full_hashes_result);

  // Format-change events, enumerated for histogramming.  The values
  // are recorded, so DO NOT CHANGE THE ORDERING OF THESE VALUES.
  // TODO(shess): Remove this once the format change is complete.
  enum FormatEventType {
    // Corruption was detected; one value per file format.
    FORMAT_EVENT_FILE_CORRUPT,
    FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete

    // Which format the file turned out to have.  The expected case
    // (new file format) deliberately has no value.
    FORMAT_EVENT_FOUND_SQLITE,
    FORMAT_EVENT_FOUND_UNKNOWN,

    // Count of SQLite-format files deleted.  This should match
    // FORMAT_EVENT_FOUND_SQLITE, but can differ when the delete fails
    // or when a failure keeps the update from succeeding.
    FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
    FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete

    // The ancient "Safe Browsing" file was found and deleted (or the
    // delete failed).
    FORMAT_EVENT_DELETED_ORIGINAL,
    FORMAT_EVENT_DELETED_ORIGINAL_FAILED,

    // Checksum mismatch in CheckValidity() or in FinishUpdate().  The
    // most likely cause is a machine crash before the file was fully
    // sync'ed to disk.
    FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
    FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,

    // The max determines the memory space for histograms.  ALWAYS ADD
    // NEW VALUES BEFORE THIS ONE.
    FORMAT_EVENT_MAX
  };

  // Records an event related to the format conversion from SQLite to
  // file.
  static void RecordFormatEvent(FormatEventType event_type);

  // A few very lucky users still have an original-format file in
  // their profile.  Look for it and delete it, recording a histogram
  // for the outcome (nothing is recorded when it is not found).
  // Logically this belongs at the SafeBrowsingDatabase level, but in
  // practice that code doesn't touch files directly.
  static void CheckForOriginalAndDelete(const FilePath& filename);

  // Closes every file and clears every buffer.
  bool Close();

  // Runs |corruption_callback_| when it is non-NULL.  Always returns
  // false, as a convenience to the caller.
  bool OnCorruptDatabase();

  // Helper for creating a corruption callback for |old_store_|.
  // NOTE(review): no |old_store_| member is declared in this class;
  // this comment may be stale.
  // TODO(shess): Remove after migration.
  void HandleCorruptDatabase();

  // Empties the temporary buffers which accumulate chunk data.
  // Always returns true.
  bool ClearChunkBuffers() {
    // NOTE: .clear() doesn't release memory, so each buffer is swapped
    // with an empty container instead.
    // TODO(shess): Figure out if this is overkill.  Some amount of
    // pre-reserved space is probably reasonable between each chunk
    // collected.
    SBAddPrefixes old_add_prefixes;
    old_add_prefixes.swap(add_prefixes_);

    std::vector<SBSubPrefix> old_sub_prefixes;
    old_sub_prefixes.swap(sub_prefixes_);

    std::vector<SBAddFullHash> old_add_hashes;
    old_add_hashes.swap(add_hashes_);

    std::vector<SBSubFullHash> old_sub_hashes;
    old_sub_hashes.swap(sub_hashes_);

    return true;
  }

  // Empties every buffer used while an update is in progress.
  void ClearUpdateBuffers() {
    ClearChunkBuffers();
    chunks_written_ = 0;

    std::set<int32> old_add_chunks;
    old_add_chunks.swap(add_chunks_cache_);

    std::set<int32> old_sub_chunks;
    old_sub_chunks.swap(sub_chunks_cache_);

    base::hash_set<int32> old_add_dels;
    old_add_dels.swap(add_del_cache_);

    base::hash_set<int32> old_sub_dels;
    old_sub_dels.swap(sub_del_cache_);
  }

  // Data collected between BeginChunk() and FinishChunk().
  SBAddPrefixes add_prefixes_;
  std::vector<SBSubPrefix> sub_prefixes_;
  std::vector<SBAddFullHash> add_hashes_;
  std::vector<SBSubFullHash> sub_hashes_;

  // How many chunks have been collected in |new_file_|.
  int chunks_written_;

  // Name of the main database file.
  FilePath filename_;

  // Handles to the main file and the scratch file.  |empty_| is true
  // when the main file did not exist at the start of the update.
  file_util::ScopedFILE file_;
  file_util::ScopedFILE new_file_;
  bool empty_;

  // Chunks which have been seen.  Loaded from the database on
  // BeginUpdate() so that it can be queried during the transaction.
  std::set<int32> add_chunks_cache_;
  std::set<int32> sub_chunks_cache_;

  // Chunks deleted during a transaction; applied on FinishUpdate().
  // TODO(shess): If the set is small enough, hash_set<> might be
  // slower than plain set<>.
  base::hash_set<int32> add_del_cache_;
  base::hash_set<int32> sub_del_cache_;

  base::Closure corruption_callback_;

  // Whether corruption has already been seen in the current update,
  // so that only one instance is recorded in the stats.
  // TODO(shess): Remove with format-migration support.
  bool corruption_seen_;

  DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStoreFile);
};
#endif // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_