chrome/browser/safe_browsing/safe_browsing_store.cc - chromium/src - Git at Google

 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "chrome/browser/safe_browsing/safe_browsing_store.h"

 #include <algorithm>

 #include "base/metrics/histogram.h"

 namespace {

 // Find items matching between |subs| and |adds|, and remove them,
 // recording the item from |adds| in |adds_removed|.  To minimize
 // copies, the inputs are processing in parallel, so |subs| and |adds|
 // should be compatibly ordered (either by SBAddPrefixLess or
 // SBAddPrefixHashLess).
 //
 // |predAddSub| provides add < sub, |predSubAdd| provides sub < add,
 // for the tightest compare appropriate (see calls in SBProcessSubs).
 template <typename SubsT, typename AddsT,
           typename PredAddSubT, typename PredSubAddT>
 void KnockoutSubs(SubsT* subs, AddsT* adds,
                   PredAddSubT predAddSub, PredSubAddT predSubAdd,
                   AddsT* adds_removed) {
   // Keep a pair of output iterators for writing kept items.  Due to
   // deletions, these may lag the main iterators.  Using erase() on
   // individual items would result in O(N^2) copies.  Using std::list
   // would work around that, at double or triple the memory cost.
   typename AddsT::iterator add_out = adds->begin();
   typename SubsT::iterator sub_out = subs->begin();

   // Current location in containers.
   // TODO(shess): I want these to be const_iterator, but then
   // std::copy() gets confused.  Could snag a const_iterator add_end,
   // or write an inline std::copy(), but it seems like I'm doing
   // something wrong.
   typename AddsT::iterator add_iter = adds->begin();
   typename SubsT::iterator sub_iter = subs->begin();

   while (add_iter != adds->end() && sub_iter != subs->end()) {
     // If |*sub_iter| < |*add_iter|, retain the sub.
     if (predSubAdd(*sub_iter, *add_iter)) {
       *sub_out = *sub_iter;
       ++sub_out;
       ++sub_iter;

       // If |*add_iter| < |*sub_iter|, retain the add.
     } else if (predAddSub(*add_iter, *sub_iter)) {
       *add_out = *add_iter;
       ++add_out;
       ++add_iter;

       // Record equal items and drop them.
     } else {
       adds_removed->push_back(*add_iter);
       ++add_iter;
       ++sub_iter;
     }
   }

   // Erase any leftover gap.
   adds->erase(add_out, add_iter);
   subs->erase(sub_out, sub_iter);
 }

 // Remove items in |removes| from |full_hashes|.  |full_hashes| and
 // |removes| should be ordered by SBAddPrefix component.
 template <typename HashesT, typename AddsT>
 void RemoveMatchingPrefixes(const AddsT& removes, HashesT* full_hashes) {
   // This is basically an inline of std::set_difference().
   // Unfortunately, that algorithm requires that the two iterator
   // pairs use the same value types.

   // Where to store kept items.
   typename HashesT::iterator out = full_hashes->begin();

   typename HashesT::iterator hash_iter = full_hashes->begin();
   typename AddsT::const_iterator remove_iter = removes.begin();

   while (hash_iter != full_hashes->end() && remove_iter != removes.end()) {
     // Keep items less than |*remove_iter|.
     if (SBAddPrefixLess(*hash_iter, *remove_iter)) {
       *out = *hash_iter;
       ++out;
       ++hash_iter;

       // No hit for |*remove_iter|, bump it forward.
     } else if (SBAddPrefixLess(*remove_iter, *hash_iter)) {
       ++remove_iter;

       // Drop equal items, there may be multiple hits.
     } else {
       do {
         ++hash_iter;
       } while (hash_iter != full_hashes->end() &&
                !SBAddPrefixLess(*remove_iter, *hash_iter));
       ++remove_iter;
     }
   }

   // Erase any leftover gap.
   full_hashes->erase(out, hash_iter);
 }

 // Remove deleted items (|chunk_id| in |del_set|) from the container.
 template <typename ItemsT>
 void RemoveDeleted(ItemsT* items, const base::hash_set<int32>& del_set) {
   DCHECK(items);

   // Move items from |iter| to |end_iter|, skipping items in |del_set|.
   typename ItemsT::iterator end_iter = items->begin();
   for (typename ItemsT::iterator iter = end_iter;
        iter != items->end(); ++iter) {
     if (del_set.count(iter->chunk_id) == 0) {
       *end_iter = *iter;
       ++end_iter;
     }
   }
   items->erase(end_iter, items->end());
 }

 enum MissTypes {
   MISS_TYPE_ALL,
   MISS_TYPE_FALSE,

   // Always at the end.
   MISS_TYPE_MAX
 };

 }  // namespace

 void SBCheckPrefixMisses(const SBAddPrefixes& add_prefixes,
                          const std::set<SBPrefix>& prefix_misses) {
   if (prefix_misses.empty())
     return;

   // Record a hit for all prefixes which missed when sent to the
   // server.
   for (size_t i = 0; i < prefix_misses.size(); ++i) {
     UMA_HISTOGRAM_ENUMERATION("SB2.BloomFilterFalsePositives",
                               MISS_TYPE_ALL, MISS_TYPE_MAX);
   }

   // Collect the misses which are not present in |add_prefixes|.
   // Since |add_prefixes| can contain multiple copies of the same
   // prefix, it is not sufficient to count the number of elements
   // present in both collections.
   std::set<SBPrefix> false_misses(prefix_misses.begin(), prefix_misses.end());
   for (SBAddPrefixes::const_iterator iter = add_prefixes.begin();
        iter != add_prefixes.end(); ++iter) {
     // |erase()| on an absent element should cost like |find()|.
     false_misses.erase(iter->prefix);
   }

   // Record a hit for prefixes which we shouldn't have sent in the
   // first place.
   for (size_t i = 0; i < false_misses.size(); ++i) {
     UMA_HISTOGRAM_ENUMERATION("SB2.BloomFilterFalsePositives",
                               MISS_TYPE_FALSE, MISS_TYPE_MAX);
   }

   // Divide |MISS_TYPE_FALSE| by |MISS_TYPE_ALL| to get the
   // bloom-filter false-positive rate.
 }

 void SBProcessSubs(SBAddPrefixes* add_prefixes,
                    std::vector<SBSubPrefix>* sub_prefixes,
                    std::vector<SBAddFullHash>* add_full_hashes,
                    std::vector<SBSubFullHash>* sub_full_hashes,
                    const base::hash_set<int32>& add_chunks_deleted,
                    const base::hash_set<int32>& sub_chunks_deleted) {
   // It is possible to structure templates and template
   // specializations such that the following calls work without having
   // to qualify things.  It becomes very arbitrary, though, and less
   // clear how things are working.

   // Sort the inputs by the SBAddPrefix bits.
   std::sort(add_prefixes->begin(), add_prefixes->end(),
             SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
   std::sort(sub_prefixes->begin(), sub_prefixes->end(),
             SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
   std::sort(add_full_hashes->begin(), add_full_hashes->end(),
             SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
   std::sort(sub_full_hashes->begin(), sub_full_hashes->end(),
             SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);

   // Factor out the prefix subs.
   SBAddPrefixes removed_adds;
   KnockoutSubs(sub_prefixes, add_prefixes,
                SBAddPrefixLess<SBAddPrefix,SBSubPrefix>,
                SBAddPrefixLess<SBSubPrefix,SBAddPrefix>,
                &removed_adds);

   // Remove the full-hashes corrosponding to the adds which
   // KnockoutSubs() removed.  Processing these w/in KnockoutSubs()
   // would make the code more complicated, and they are very small
   // relative to the prefix lists so the gain would be modest.
   RemoveMatchingPrefixes(removed_adds, add_full_hashes);
   RemoveMatchingPrefixes(removed_adds, sub_full_hashes);

   // http://crbug.com/52385
   // TODO(shess): AFAICT this pass is not done on the trunk.  I
   // believe that's a bug, but it may not matter because full-hash
   // subs almost never happen (I think you'd need multiple collisions
   // where one of the sites stopped being flagged?).  Enable this once
   // everything is in.  [if(0) instead of #ifdef 0 to make sure it
   // compiles.]
   if (0) {
     // Factor out the full-hash subs.
     std::vector<SBAddFullHash> removed_full_adds;
     KnockoutSubs(sub_full_hashes, add_full_hashes,
                  SBAddPrefixHashLess<SBAddFullHash,SBSubFullHash>,
                  SBAddPrefixHashLess<SBSubFullHash,SBAddFullHash>,
                  &removed_full_adds);
   }

   // Remove items from the deleted chunks.  This is done after other
   // processing to allow subs to knock out adds (and be removed) even
   // if the add's chunk is deleted.
   RemoveDeleted(add_prefixes, add_chunks_deleted);
   RemoveDeleted(sub_prefixes, sub_chunks_deleted);
   RemoveDeleted(add_full_hashes, add_chunks_deleted);
   RemoveDeleted(sub_full_hashes, sub_chunks_deleted);
 }
	// Copyright (c) 2011 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "chrome/browser/safe_browsing/safe_browsing_store.h"

	#include <algorithm>

	#include "base/metrics/histogram.h"

	namespace {

	// Find items matching between \|subs\| and \|adds\|, and remove them,
	// recording the item from \|adds\| in \|adds_removed\|. To minimize
	// copies, the inputs are processing in parallel, so \|subs\| and \|adds\|
	// should be compatibly ordered (either by SBAddPrefixLess or
	// SBAddPrefixHashLess).
	//
	// \|predAddSub\| provides add < sub, \|predSubAdd\| provides sub < add,
	// for the tightest compare appropriate (see calls in SBProcessSubs).
	template <typename SubsT, typename AddsT,
	typename PredAddSubT, typename PredSubAddT>
	void KnockoutSubs(SubsT* subs, AddsT* adds,
	PredAddSubT predAddSub, PredSubAddT predSubAdd,
	AddsT* adds_removed) {
	// Keep a pair of output iterators for writing kept items. Due to
	// deletions, these may lag the main iterators. Using erase() on
	// individual items would result in O(N^2) copies. Using std::list
	// would work around that, at double or triple the memory cost.
	typename AddsT::iterator add_out = adds->begin();
	typename SubsT::iterator sub_out = subs->begin();

	// Current location in containers.
	// TODO(shess): I want these to be const_iterator, but then
	// std::copy() gets confused. Could snag a const_iterator add_end,
	// or write an inline std::copy(), but it seems like I'm doing
	// something wrong.
	typename AddsT::iterator add_iter = adds->begin();
	typename SubsT::iterator sub_iter = subs->begin();

	while (add_iter != adds->end() && sub_iter != subs->end()) {
	// If \|sub_iter\| < \|add_iter\|, retain the sub.
	if (predSubAdd(sub_iter, add_iter)) {
	sub_out = sub_iter;
	++sub_out;
	++sub_iter;

	// If \|add_iter\| < \|sub_iter\|, retain the add.
	} else if (predAddSub(add_iter, sub_iter)) {
	add_out = add_iter;
	++add_out;
	++add_iter;

	// Record equal items and drop them.
	} else {
	adds_removed->push_back(*add_iter);
	++add_iter;
	++sub_iter;
	}
	}

	// Erase any leftover gap.
	adds->erase(add_out, add_iter);
	subs->erase(sub_out, sub_iter);
	}

	// Remove items in \|removes\| from \|full_hashes\|. \|full_hashes\| and
	// \|removes\| should be ordered by SBAddPrefix component.
	template <typename HashesT, typename AddsT>
	void RemoveMatchingPrefixes(const AddsT& removes, HashesT* full_hashes) {
	// This is basically an inline of std::set_difference().
	// Unfortunately, that algorithm requires that the two iterator
	// pairs use the same value types.

	// Where to store kept items.
	typename HashesT::iterator out = full_hashes->begin();

	typename HashesT::iterator hash_iter = full_hashes->begin();
	typename AddsT::const_iterator remove_iter = removes.begin();

	while (hash_iter != full_hashes->end() && remove_iter != removes.end()) {
	// Keep items less than \|*remove_iter\|.
	if (SBAddPrefixLess(hash_iter, remove_iter)) {
	out = hash_iter;
	++out;
	++hash_iter;

	// No hit for \|*remove_iter\|, bump it forward.
	} else if (SBAddPrefixLess(remove_iter, hash_iter)) {
	++remove_iter;

	// Drop equal items, there may be multiple hits.
	} else {
	do {
	++hash_iter;
	} while (hash_iter != full_hashes->end() &&
	!SBAddPrefixLess(remove_iter, hash_iter));
	++remove_iter;
	}
	}

	// Erase any leftover gap.
	full_hashes->erase(out, hash_iter);
	}

	// Remove deleted items (\|chunk_id\| in \|del_set\|) from the container.
	template <typename ItemsT>
	void RemoveDeleted(ItemsT* items, const base::hash_set<int32>& del_set) {
	DCHECK(items);

	// Move items from \|iter\| to \|end_iter\|, skipping items in \|del_set\|.
	typename ItemsT::iterator end_iter = items->begin();
	for (typename ItemsT::iterator iter = end_iter;
	iter != items->end(); ++iter) {
	if (del_set.count(iter->chunk_id) == 0) {
	end_iter = iter;
	++end_iter;
	}
	}
	items->erase(end_iter, items->end());
	}

	enum MissTypes {
	MISS_TYPE_ALL,
	MISS_TYPE_FALSE,

	// Always at the end.
	MISS_TYPE_MAX
	};

	} // namespace

	void SBCheckPrefixMisses(const SBAddPrefixes& add_prefixes,
	const std::set<SBPrefix>& prefix_misses) {
	if (prefix_misses.empty())
	return;

	// Record a hit for all prefixes which missed when sent to the
	// server.
	for (size_t i = 0; i < prefix_misses.size(); ++i) {
	UMA_HISTOGRAM_ENUMERATION("SB2.BloomFilterFalsePositives",
	MISS_TYPE_ALL, MISS_TYPE_MAX);
	}

	// Collect the misses which are not present in \|add_prefixes\|.
	// Since \|add_prefixes\| can contain multiple copies of the same
	// prefix, it is not sufficient to count the number of elements
	// present in both collections.
	std::set<SBPrefix> false_misses(prefix_misses.begin(), prefix_misses.end());
	for (SBAddPrefixes::const_iterator iter = add_prefixes.begin();
	iter != add_prefixes.end(); ++iter) {
	// \|erase()\| on an absent element should cost like \|find()\|.
	false_misses.erase(iter->prefix);
	}

	// Record a hit for prefixes which we shouldn't have sent in the
	// first place.
	for (size_t i = 0; i < false_misses.size(); ++i) {
	UMA_HISTOGRAM_ENUMERATION("SB2.BloomFilterFalsePositives",
	MISS_TYPE_FALSE, MISS_TYPE_MAX);
	}

	// Divide \|MISS_TYPE_FALSE\| by \|MISS_TYPE_ALL\| to get the
	// bloom-filter false-positive rate.
	}

	void SBProcessSubs(SBAddPrefixes* add_prefixes,
	std::vector<SBSubPrefix>* sub_prefixes,
	std::vector<SBAddFullHash>* add_full_hashes,
	std::vector<SBSubFullHash>* sub_full_hashes,
	const base::hash_set<int32>& add_chunks_deleted,
	const base::hash_set<int32>& sub_chunks_deleted) {
	// It is possible to structure templates and template
	// specializations such that the following calls work without having
	// to qualify things. It becomes very arbitrary, though, and less
	// clear how things are working.

	// Sort the inputs by the SBAddPrefix bits.
	std::sort(add_prefixes->begin(), add_prefixes->end(),
	SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
	std::sort(sub_prefixes->begin(), sub_prefixes->end(),
	SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
	std::sort(add_full_hashes->begin(), add_full_hashes->end(),
	SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
	std::sort(sub_full_hashes->begin(), sub_full_hashes->end(),
	SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);

	// Factor out the prefix subs.
	SBAddPrefixes removed_adds;
	KnockoutSubs(sub_prefixes, add_prefixes,
	SBAddPrefixLess<SBAddPrefix,SBSubPrefix>,
	SBAddPrefixLess<SBSubPrefix,SBAddPrefix>,
	&removed_adds);

	// Remove the full-hashes corrosponding to the adds which
	// KnockoutSubs() removed. Processing these w/in KnockoutSubs()
	// would make the code more complicated, and they are very small
	// relative to the prefix lists so the gain would be modest.
	RemoveMatchingPrefixes(removed_adds, add_full_hashes);
	RemoveMatchingPrefixes(removed_adds, sub_full_hashes);

	// http://crbug.com/52385
	// TODO(shess): AFAICT this pass is not done on the trunk. I
	// believe that's a bug, but it may not matter because full-hash
	// subs almost never happen (I think you'd need multiple collisions
	// where one of the sites stopped being flagged?). Enable this once
	// everything is in. [if(0) instead of #ifdef 0 to make sure it
	// compiles.]
	if (0) {
	// Factor out the full-hash subs.
	std::vector<SBAddFullHash> removed_full_adds;
	KnockoutSubs(sub_full_hashes, add_full_hashes,
	SBAddPrefixHashLess<SBAddFullHash,SBSubFullHash>,
	SBAddPrefixHashLess<SBSubFullHash,SBAddFullHash>,
	&removed_full_adds);
	}

	// Remove items from the deleted chunks. This is done after other
	// processing to allow subs to knock out adds (and be removed) even
	// if the add's chunk is deleted.
	RemoveDeleted(add_prefixes, add_chunks_deleted);
	RemoveDeleted(sub_prefixes, sub_chunks_deleted);
	RemoveDeleted(add_full_hashes, add_chunks_deleted);
	RemoveDeleted(sub_full_hashes, sub_chunks_deleted);
	}