// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/safe_browsing/safe_browsing_store_file.h"
#include "base/md5.h"
#include "base/metrics/histogram.h"
namespace {
// NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
// that byte-order changes force corruption.
const int32 kFileMagic = 0x600D71FE;
const int32 kFileVersion = 7; // SQLite storage was 6...
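// On disk, the main database file (checked by FileHeaderSanityCheck()
// and written by DoUpdate()) is laid out as:
//   FileHeader
//   int32 * add_chunk_count        (add chunk numbers)
//   int32 * sub_chunk_count        (sub chunk numbers)
//   SBAddPrefix * add_prefix_count
//   SBSubPrefix * sub_prefix_count
//   SBAddFullHash * add_hash_count
//   SBSubFullHash * sub_hash_count
//   base::MD5Digest                (checksum of everything above)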
// Header at the front of the main database file.
struct FileHeader {
int32 magic, version;
uint32 add_chunk_count, sub_chunk_count;
uint32 add_prefix_count, sub_prefix_count;
uint32 add_hash_count, sub_hash_count;
};
// Header for each chunk in the chunk-accumulation file.
struct ChunkHeader {
uint32 add_prefix_count, sub_prefix_count;
uint32 add_hash_count, sub_hash_count;
};
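// The chunk-accumulation file (the temporary file, before DoUpdate()
// rewrites it in the final format) is a sequence of blocks appended by
// FinishChunk(), each laid out as:
//   ChunkHeader
//   SBAddPrefix * add_prefix_count
//   SBSubPrefix * sub_prefix_count
//   SBAddFullHash * add_hash_count
//   SBSubFullHash * sub_hash_count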
// Rewind the file. Using fseek(2) because rewind(3) errors are
// weird.
bool FileRewind(FILE* fp) {
int rv = fseek(fp, 0, SEEK_SET);
DCHECK_EQ(rv, 0);
return rv == 0;
}
// Move file read pointer forward by |bytes| relative to current position.
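// Used by GetAddPrefixes() and GetAddFullHashes() to seek directly to
// the data they need without reading the intervening sections.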
bool FileSkip(size_t bytes, FILE* fp) {
// Although fseek() accepts negative values, in this case we only want
// to skip forward.
DCHECK(static_cast<long>(bytes) >= 0);
if (static_cast<long>(bytes) < 0)
return false;
int rv = fseek(fp, static_cast<long>(bytes), SEEK_CUR);
DCHECK_EQ(rv, 0);
return rv == 0;
}
// Read from |fp| into |item|, and fold the input data into the
// checksum in |context|, if non-NULL. Return true on success.
template <class T>
bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
const size_t ret = fread(item, sizeof(T), 1, fp);
if (ret != 1)
return false;
if (context) {
base::MD5Update(context,
base::StringPiece(reinterpret_cast<char*>(item),
sizeof(T)));
}
return true;
}
// Write |item| to |fp|, and fold the output data into the checksum in
// |context|, if non-NULL. Return true on success.
template <class T>
bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
const size_t ret = fwrite(&item, sizeof(T), 1, fp);
if (ret != 1)
return false;
if (context) {
base::MD5Update(context,
base::StringPiece(reinterpret_cast<const char*>(&item),
sizeof(T)));
}
return true;
}
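// NOTE: During an update, the MD5 context is folded over the header and
// every container as they are read or written. The trailing digest
// itself is read and written with a NULL context so that it is not
// included in its own computation (see DoUpdate()).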
// Read |count| items into |values| from |fp|, and fold them into the
// checksum in |context|. Returns true on success.
template <typename CT>
bool ReadToContainer(CT* values, size_t count, FILE* fp,
base::MD5Context* context) {
if (!count)
return true;
for (size_t i = 0; i < count; ++i) {
typename CT::value_type value;
if (!ReadItem(&value, fp, context))
return false;
// push_back() would be more obvious, but written this way the code
// can also read into a std::set.
values->insert(values->end(), value);
}
return true;
}
// Write all of |values| to |fp|, and fold the data into the checksum
// in |context|, if non-NULL. Returns true on success.
template <typename CT>
bool WriteContainer(const CT& values, FILE* fp,
base::MD5Context* context) {
if (values.empty())
return true;
for (typename CT::const_iterator iter = values.begin();
iter != values.end(); ++iter) {
if (!WriteItem(*iter, fp, context))
return false;
}
return true;
}
// Delete the chunks in |deleted| from |chunks|.
void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
std::set<int32>* chunks) {
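// Advance the iterator before erasing: std::set::erase() only
// invalidates iterators to the erased element, so |iter| remains
// valid after |prev| is removed.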
for (std::set<int32>::iterator iter = chunks->begin();
iter != chunks->end();) {
std::set<int32>::iterator prev = iter++;
if (deleted.count(*prev) > 0)
chunks->erase(prev);
}
}
// Sanity-check the header against the file's size to make sure our
// vectors aren't gigantic. This doubles as a cheap way to detect
// corruption without having to checksum the entire file.
bool FileHeaderSanityCheck(const FilePath& filename,
const FileHeader& header) {
int64 size = 0;
if (!file_util::GetFileSize(filename, &size))
return false;
int64 expected_size = sizeof(FileHeader);
expected_size += header.add_chunk_count * sizeof(int32);
expected_size += header.sub_chunk_count * sizeof(int32);
expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
expected_size += header.add_hash_count * sizeof(SBAddFullHash);
expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
expected_size += sizeof(base::MD5Digest);
if (size != expected_size)
return false;
return true;
}
// Helper function that reads the header into |header|. Returns true
// if the magic number and version are correct and the sanity check
// passes.
bool ReadAndVerifyHeader(const FilePath& filename,
FILE* fp,
FileHeader* header,
base::MD5Context* context) {
if (!ReadItem(header, fp, context))
return false;
if (header->magic != kFileMagic || header->version != kFileVersion)
return false;
if (!FileHeaderSanityCheck(filename, *header))
return false;
return true;
}
} // namespace
// static
void SafeBrowsingStoreFile::RecordFormatEvent(FormatEventType event_type) {
UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
}
// static
void SafeBrowsingStoreFile::CheckForOriginalAndDelete(
const FilePath& current_filename) {
const FilePath original_filename(
current_filename.DirName().AppendASCII("Safe Browsing"));
if (file_util::PathExists(original_filename)) {
int64 size = 0;
if (file_util::GetFileSize(original_filename, &size)) {
UMA_HISTOGRAM_COUNTS("SB2.OldDatabaseKilobytes",
static_cast<int>(size / 1024));
}
if (file_util::Delete(original_filename, false)) {
RecordFormatEvent(FORMAT_EVENT_DELETED_ORIGINAL);
} else {
RecordFormatEvent(FORMAT_EVENT_DELETED_ORIGINAL_FAILED);
}
// Just best-effort on the journal file, don't want to get lost in
// the weeds.
const FilePath journal_filename(
current_filename.DirName().AppendASCII("Safe Browsing-journal"));
file_util::Delete(journal_filename, false);
}
}
SafeBrowsingStoreFile::SafeBrowsingStoreFile()
: chunks_written_(0),
file_(NULL),
empty_(false),
corruption_seen_(false) {
}
SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
Close();
}
bool SafeBrowsingStoreFile::Delete() {
// The database should not be open at this point. But, just in
// case, close everything before deleting.
if (!Close()) {
NOTREACHED();
return false;
}
if (!file_util::Delete(filename_, false) &&
file_util::PathExists(filename_)) {
NOTREACHED();
return false;
}
const FilePath new_filename = TemporaryFileForFilename(filename_);
if (!file_util::Delete(new_filename, false) &&
file_util::PathExists(new_filename)) {
NOTREACHED();
return false;
}
// With SQLite support gone, one way to get to this code is if the
// existing file is a SQLite file. Make sure the journal file is
// also removed.
const FilePath journal_filename(
filename_.value() + FILE_PATH_LITERAL("-journal"));
if (file_util::PathExists(journal_filename))
file_util::Delete(journal_filename, false);
return true;
}
void SafeBrowsingStoreFile::Init(const FilePath& filename,
const base::Closure& corruption_callback) {
filename_ = filename;
corruption_callback_ = corruption_callback;
}
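// A sketch of the expected calling sequence, with SafeBrowsingDatabase
// as the real driver:
//   SafeBrowsingStoreFile store;
//   store.Init(path, corruption_callback);
//   store.BeginUpdate();
//   // For each chunk received from the server:
//   store.SetAddChunk(chunk_id);
//   store.BeginChunk();
//   store.WriteAddPrefix(chunk_id, prefix);
//   store.FinishChunk();
//   // Merge the accumulated data and commit the new file:
//   store.FinishUpdate(pending_adds, prefix_misses,
//                      &add_prefixes, &add_full_hashes);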
bool SafeBrowsingStoreFile::BeginChunk() {
return ClearChunkBuffers();
}
bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
return true;
}
bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
add_prefixes->clear();
file_util::ScopedFILE file(file_util::OpenFile(filename_, "rb"));
if (file.get() == NULL) return false;
FileHeader header;
if (!ReadAndVerifyHeader(filename_, file.get(), &header, NULL))
return OnCorruptDatabase();
size_t add_prefix_offset = header.add_chunk_count * sizeof(int32) +
header.sub_chunk_count * sizeof(int32);
if (!FileSkip(add_prefix_offset, file.get()))
return false;
if (!ReadToContainer(add_prefixes, header.add_prefix_count, file.get(), NULL))
return false;
return true;
}
bool SafeBrowsingStoreFile::GetAddFullHashes(
std::vector<SBAddFullHash>* add_full_hashes) {
add_full_hashes->clear();
file_util::ScopedFILE file(file_util::OpenFile(filename_, "rb"));
if (file.get() == NULL) return false;
FileHeader header;
if (!ReadAndVerifyHeader(filename_, file.get(), &header, NULL))
return OnCorruptDatabase();
size_t offset =
header.add_chunk_count * sizeof(int32) +
header.sub_chunk_count * sizeof(int32) +
header.add_prefix_count * sizeof(SBAddPrefix) +
header.sub_prefix_count * sizeof(SBSubPrefix);
if (!FileSkip(offset, file.get()))
return false;
return ReadToContainer(add_full_hashes,
header.add_hash_count,
file.get(),
NULL);
}
bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
base::Time receive_time,
const SBFullHash& full_hash) {
add_hashes_.push_back(SBAddFullHash(chunk_id, receive_time, full_hash));
return true;
}
bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
int32 add_chunk_id,
SBPrefix prefix) {
sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
return true;
}
bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
const SBFullHash& full_hash) {
sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
return true;
}
bool SafeBrowsingStoreFile::OnCorruptDatabase() {
if (!corruption_seen_)
RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
corruption_seen_ = true;
corruption_callback_.Run();
// Return false as a convenience to callers.
return false;
}
bool SafeBrowsingStoreFile::Close() {
ClearUpdateBuffers();
// Make sure the files are closed.
file_.reset();
new_file_.reset();
return true;
}
bool SafeBrowsingStoreFile::BeginUpdate() {
DCHECK(!file_.get() && !new_file_.get());
// Structures should all be clear unless something bad happened.
DCHECK(add_chunks_cache_.empty());
DCHECK(sub_chunks_cache_.empty());
DCHECK(add_del_cache_.empty());
DCHECK(sub_del_cache_.empty());
DCHECK(add_prefixes_.empty());
DCHECK(sub_prefixes_.empty());
DCHECK(add_hashes_.empty());
DCHECK(sub_hashes_.empty());
DCHECK_EQ(chunks_written_, 0);
// Since the following code will already hit the profile looking for
// database files, this is a reasonable time to delete any old
// files.
CheckForOriginalAndDelete(filename_);
corruption_seen_ = false;
const FilePath new_filename = TemporaryFileForFilename(filename_);
file_util::ScopedFILE new_file(file_util::OpenFile(new_filename, "wb+"));
if (new_file.get() == NULL)
return false;
file_util::ScopedFILE file(file_util::OpenFile(filename_, "rb"));
empty_ = (file.get() == NULL);
if (empty_) {
// If the file exists but cannot be opened, signal corruption rather
// than deleting the file directly here; the bloom filter needs to be
// deleted, too.
if (file_util::PathExists(filename_))
return OnCorruptDatabase();
new_file_.swap(new_file);
return true;
}
FileHeader header;
if (!ReadItem(&header, file.get(), NULL))
return OnCorruptDatabase();
if (header.magic != kFileMagic || header.version != kFileVersion) {
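// A SQLite database starts with the bytes "SQLite format 3\0", so
// comparing the front of the header against that string detects a
// leftover version-6 store.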
if (!strcmp(reinterpret_cast<char*>(&header.magic), "SQLite format 3")) {
RecordFormatEvent(FORMAT_EVENT_FOUND_SQLITE);
} else {
RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
}
// Close the file so that it can be deleted.
file.reset();
return OnCorruptDatabase();
}
// TODO(shess): Under POSIX it is possible that this could size a
// file different from the file which was opened.
if (!FileHeaderSanityCheck(filename_, header))
return OnCorruptDatabase();
// Pull in the chunks-seen data for purposes of implementing
// |GetAddChunks()| and |GetSubChunks()|. This data is sent up to
// the server at the beginning of an update.
if (!ReadToContainer(&add_chunks_cache_, header.add_chunk_count,
file.get(), NULL) ||
!ReadToContainer(&sub_chunks_cache_, header.sub_chunk_count,
file.get(), NULL))
return OnCorruptDatabase();
file_.swap(file);
new_file_.swap(new_file);
return true;
}
bool SafeBrowsingStoreFile::FinishChunk() {
if (add_prefixes_.empty() && sub_prefixes_.empty() &&
add_hashes_.empty() && sub_hashes_.empty())
return true;
ChunkHeader header;
header.add_prefix_count = add_prefixes_.size();
header.sub_prefix_count = sub_prefixes_.size();
header.add_hash_count = add_hashes_.size();
header.sub_hash_count = sub_hashes_.size();
if (!WriteItem(header, new_file_.get(), NULL))
return false;
if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
!WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
!WriteContainer(add_hashes_, new_file_.get(), NULL) ||
!WriteContainer(sub_hashes_, new_file_.get(), NULL))
return false;
++chunks_written_;
// Clear everything to save memory.
return ClearChunkBuffers();
}
bool SafeBrowsingStoreFile::DoUpdate(
const std::vector<SBAddFullHash>& pending_adds,
const std::set<SBPrefix>& prefix_misses,
SBAddPrefixes* add_prefixes_result,
std::vector<SBAddFullHash>* add_full_hashes_result) {
DCHECK(file_.get() || empty_);
DCHECK(new_file_.get());
CHECK(add_prefixes_result);
CHECK(add_full_hashes_result);
SBAddPrefixes add_prefixes;
std::vector<SBSubPrefix> sub_prefixes;
std::vector<SBAddFullHash> add_full_hashes;
std::vector<SBSubFullHash> sub_full_hashes;
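// The update rebuilds the entire database in memory: the old file's
// contents and the accumulated chunk data are merged into these
// vectors, then written back out to |new_file_|.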
// Read original data into the vectors.
if (!empty_) {
DCHECK(file_.get());
if (!FileRewind(file_.get()))
return OnCorruptDatabase();
base::MD5Context context;
base::MD5Init(&context);
// Read the file header and make sure it looks right.
FileHeader header;
if (!ReadAndVerifyHeader(filename_, file_.get(), &header, &context))
return OnCorruptDatabase();
// Re-read the chunks-seen data to get to the later data in the
// file and calculate the checksum. No new elements should be
// added to the sets.
if (!ReadToContainer(&add_chunks_cache_, header.add_chunk_count,
file_.get(), &context) ||
!ReadToContainer(&sub_chunks_cache_, header.sub_chunk_count,
file_.get(), &context))
return OnCorruptDatabase();
if (!ReadToContainer(&add_prefixes, header.add_prefix_count,
file_.get(), &context) ||
!ReadToContainer(&sub_prefixes, header.sub_prefix_count,
file_.get(), &context) ||
!ReadToContainer(&add_full_hashes, header.add_hash_count,
file_.get(), &context) ||
!ReadToContainer(&sub_full_hashes, header.sub_hash_count,
file_.get(), &context))
return OnCorruptDatabase();
// Calculate the digest to this point.
base::MD5Digest calculated_digest;
base::MD5Final(&calculated_digest, &context);
// Read the stored checksum and verify it.
base::MD5Digest file_digest;
if (!ReadItem(&file_digest, file_.get(), NULL))
return OnCorruptDatabase();
if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
return OnCorruptDatabase();
// Close the file so we can later rename over it.
file_.reset();
}
DCHECK(!file_.get());
// Rewind the temporary storage.
if (!FileRewind(new_file_.get()))
return false;
// Get chunk file's size for validating counts.
int64 size = 0;
if (!file_util::GetFileSize(TemporaryFileForFilename(filename_), &size))
return OnCorruptDatabase();
// Track update size to answer questions at http://crbug.com/72216 .
// Log small updates as 1k so that the 0 (underflow) bucket can be
// used for "empty" in SafeBrowsingDatabase.
UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
std::max(static_cast<int>(size / 1024), 1));
// Append the accumulated chunks onto the vectors read from |file_|.
for (int i = 0; i < chunks_written_; ++i) {
ChunkHeader header;
int64 ofs = ftell(new_file_.get());
if (ofs == -1)
return false;
if (!ReadItem(&header, new_file_.get(), NULL))
return false;
// As a safety measure, make sure that the header describes a sane
// chunk, given the remaining file size.
int64 expected_size = ofs + sizeof(ChunkHeader);
expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
expected_size += header.add_hash_count * sizeof(SBAddFullHash);
expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
if (expected_size > size)
return false;
// TODO(shess): If the vectors were kept sorted, then this code
// could use std::inplace_merge() to merge everything together in
// sorted order. That might still be slower than just sorting at
// the end if there were a large number of chunks. In that case
// some sort of recursive binary merge might be in order (merge
// chunks pairwise, merge those chunks pairwise, and so on, then
// merge the result with the main list).
if (!ReadToContainer(&add_prefixes, header.add_prefix_count,
new_file_.get(), NULL) ||
!ReadToContainer(&sub_prefixes, header.sub_prefix_count,
new_file_.get(), NULL) ||
!ReadToContainer(&add_full_hashes, header.add_hash_count,
new_file_.get(), NULL) ||
!ReadToContainer(&sub_full_hashes, header.sub_hash_count,
new_file_.get(), NULL))
return false;
}
// Append items from |pending_adds|.
add_full_hashes.insert(add_full_hashes.end(),
pending_adds.begin(), pending_adds.end());
// Record how often the database was queried for a prefix it did not
// contain.
SBCheckPrefixMisses(add_prefixes, prefix_misses);
// Knock the subs from the adds and process deleted chunks.
SBProcessSubs(&add_prefixes, &sub_prefixes,
&add_full_hashes, &sub_full_hashes,
add_del_cache_, sub_del_cache_);
// We no longer need to track deleted chunks.
DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);
// Write the new data to new_file_.
if (!FileRewind(new_file_.get()))
return false;
base::MD5Context context;
base::MD5Init(&context);
// Write a file header.
FileHeader header;
header.magic = kFileMagic;
header.version = kFileVersion;
header.add_chunk_count = add_chunks_cache_.size();
header.sub_chunk_count = sub_chunks_cache_.size();
header.add_prefix_count = add_prefixes.size();
header.sub_prefix_count = sub_prefixes.size();
header.add_hash_count = add_full_hashes.size();
header.sub_hash_count = sub_full_hashes.size();
if (!WriteItem(header, new_file_.get(), &context))
return false;
// Write all the chunk data.
if (!WriteContainer(add_chunks_cache_, new_file_.get(), &context) ||
!WriteContainer(sub_chunks_cache_, new_file_.get(), &context) ||
!WriteContainer(add_prefixes, new_file_.get(), &context) ||
!WriteContainer(sub_prefixes, new_file_.get(), &context) ||
!WriteContainer(add_full_hashes, new_file_.get(), &context) ||
!WriteContainer(sub_full_hashes, new_file_.get(), &context))
return false;
// Write the checksum at the end.
base::MD5Digest digest;
base::MD5Final(&digest, &context);
if (!WriteItem(digest, new_file_.get(), NULL))
return false;
// Trim any excess left over from the temporary chunk data.
if (!file_util::TruncateFile(new_file_.get()))
return false;
// Close the file handle and swizzle the file into place.
new_file_.reset();
if (!file_util::Delete(filename_, false) &&
file_util::PathExists(filename_))
return false;
const FilePath new_filename = TemporaryFileForFilename(filename_);
if (!file_util::Move(new_filename, filename_))
return false;
// Record counts before swapping to caller.
UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefixes.size());
UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefixes.size());
// Pass the resulting data off to the caller.
add_prefixes_result->swap(add_prefixes);
add_full_hashes_result->swap(add_full_hashes);
return true;
}
bool SafeBrowsingStoreFile::FinishUpdate(
const std::vector<SBAddFullHash>& pending_adds,
const std::set<SBPrefix>& prefix_misses,
SBAddPrefixes* add_prefixes_result,
std::vector<SBAddFullHash>* add_full_hashes_result) {
DCHECK(add_prefixes_result);
DCHECK(add_full_hashes_result);
bool ret = DoUpdate(pending_adds, prefix_misses,
add_prefixes_result, add_full_hashes_result);
if (!ret) {
CancelUpdate();
return false;
}
DCHECK(!new_file_.get());
DCHECK(!file_.get());
return Close();
}
bool SafeBrowsingStoreFile::CancelUpdate() {
return Close();
}
void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
add_chunks_cache_.insert(chunk_id);
}
bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
return add_chunks_cache_.count(chunk_id) > 0;
}
void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
out->clear();
out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
}
void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
sub_chunks_cache_.insert(chunk_id);
}
bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
return sub_chunks_cache_.count(chunk_id) > 0;
}
void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
out->clear();
out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
}
void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
add_del_cache_.insert(chunk_id);
}
void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
sub_del_cache_.insert(chunk_id);
}