components/omnibox/browser/url_index_private_data.h - chromium/src - Git at Google

 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifndef COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_
 #define COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_

 #include <stddef.h>

 #include <map>
 #include <set>
 #include <string>

 #include "base/containers/stack.h"
 #include "base/files/file_path.h"
 #include "base/gtest_prod_util.h"
 #include "base/memory/ref_counted.h"
 #include "components/history/core/browser/history_service.h"
 #include "components/omnibox/browser/in_memory_url_index_cache.pb.h"
 #include "components/omnibox/browser/in_memory_url_index_types.h"
 #include "components/omnibox/browser/scored_history_match.h"

 class HistoryQuickProviderTest;
 class TemplateURLService;

 namespace bookmarks {
 class BookmarkModel;
 }

 namespace in_memory_url_index {
 class InMemoryURLIndexCacheItem;
 }

 namespace history {
 class HistoryDatabase;
 class InMemoryURLIndex;
 }

 // Current version of the cache file.
 static const int kCurrentCacheFileVersion = 5;

 // A structure private to InMemoryURLIndex describing its internal data and
 // providing for restoring, rebuilding and updating that internal data. As
 // this class is for exclusive use by the InMemoryURLIndex class there should
 // be no calls from any other class.
 //
 // All public member functions are called on the main thread unless otherwise
 // annotated.
 class URLIndexPrivateData
     : public base::RefCountedThreadSafe<URLIndexPrivateData> {
  public:
   // The maximum number of recent visits stored.  Public so that
   // ScoredHistoryMatch can enuse that this number is greater than the number
   // of visits it wants to use for scoring.
   static constexpr size_t kMaxVisitsToStoreInCache = 10;

   URLIndexPrivateData();

   // Given a base::string16 in |term_string|, scans the history index and
   // returns a vector with all scored, matching history items. The
   // |term_string| is broken down into individual terms (words), each of which
   // must occur in the candidate history item's URL or page title for the item
   // to qualify; however, the terms do not necessarily have to be adjacent. We
   // also allow breaking |term_string| at |cursor_position| (if
   // set). Once we have a set of candidates, they are filtered to ensure
   // that all |term_string| terms, as separated by whitespace and the
   // cursor (if set), occur within the candidate's URL or page title.
   // Scores are then calculated on no more than |kItemsToScoreLimit|
   // candidates, as the scoring of such a large number of candidates may
   // cause perceptible typing response delays in the omnibox. This is
   // likely to occur for short omnibox terms such as 'h' and 'w' which
   // will be found in nearly all history candidates. Results are sorted by
   // descending score. The full results set (i.e. beyond the
   // |kItemsToScoreLimit| limit) will be retained and used for subsequent calls
   // to this function. In total, |max_matches| of items will be returned in the
   // |ScoredHistoryMatches| vector.
   ScoredHistoryMatches HistoryItemsForTerms(
       base::string16 term_string,
       size_t cursor_position,
       size_t max_matches,
       bookmarks::BookmarkModel* bookmark_model,
       TemplateURLService* template_url_service);

   // Adds the history item in |row| to the index if it does not already already
   // exist and it meets the minimum 'quick' criteria. If the row already exists
   // in the index then the index will be updated if the row still meets the
   // criteria, otherwise the row will be removed from the index. Returns true
   // if the index was actually updated. |scheme_whitelist| is used to filter
   // non-qualifying schemes. |history_service| is used to schedule an update to
   // the recent visits component of this URL's entry in the index.
   bool UpdateURL(history::HistoryService* history_service,
                  const history::URLRow& row,
                  const std::set<std::string>& scheme_whitelist,
                  base::CancelableTaskTracker* tracker);

   // Updates the entry for |url_id| in the index, replacing its
   // recent visits information with |recent_visits|.  If |url_id|
   // is not in the index, does nothing.
   void UpdateRecentVisits(history::URLID url_id,
                           const history::VisitVector& recent_visits);

   // Using |history_service| schedules an update (using the historyDB
   // thread) for the recent visits information for |url_id|.  Unless
   // something unexpectedly goes wrong, UdpateRecentVisits() should
   // eventually be called from a callback.
   void ScheduleUpdateRecentVisits(history::HistoryService* history_service,
                                   history::URLID url_id,
                                   base::CancelableTaskTracker* tracker);

   // Deletes index data for the history item with the given |url|.
   // The item may not have actually been indexed, which is the case if it did
   // not previously meet minimum 'quick' criteria. Returns true if the index
   // was actually updated.
   bool DeleteURL(const GURL& url);

   // Constructs a new object by restoring its contents from the cache file
   // at |path|. Returns the new URLIndexPrivateData which on success will
   // contain the restored data but upon failure will be empty.
   // This function should be run on the the file thread.
   static scoped_refptr<URLIndexPrivateData> RestoreFromFile(
       const base::FilePath& path);

   // Constructs a new object by rebuilding its contents from the history
   // database in |history_db|. Returns the new URLIndexPrivateData which on
   // success will contain the rebuilt data but upon failure will be empty.
   static scoped_refptr<URLIndexPrivateData> RebuildFromHistory(
       history::HistoryDatabase* history_db,
       const std::set<std::string>& scheme_whitelist);

   // Writes |private_data| as a cache file to |file_path| and returns success.
   static bool WritePrivateDataToCacheFileTask(
       scoped_refptr<URLIndexPrivateData> private_data,
       const base::FilePath& file_path);

   // Creates a copy of ourself.
   scoped_refptr<URLIndexPrivateData> Duplicate() const;

   // Returns true if there is no data in the index.
   bool Empty() const;

   // Initializes all index data members in preparation for restoring the index
   // from the cache or a complete rebuild from the history database.
   void Clear();

   // Estimates dynamic memory usage.
   // See base/trace_event/memory_usage_estimator.h for more info.
   size_t EstimateMemoryUsage() const;

  private:
   friend class base::RefCountedThreadSafe<URLIndexPrivateData>;
   ~URLIndexPrivateData();

   friend class ::HistoryQuickProviderTest;
   friend class InMemoryURLIndexTest;
   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore);
   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CalculateWordStartsOffsets);
   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest,
                            CalculateWordStartsOffsetsUnderscore);
   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, HugeResultSet);
   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, ReadVisitsFromHistory);
   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, RebuildFromHistoryIfCacheOld);
   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring);
   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch);
   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TrimHistoryIds);
   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);
   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, WhitelistedURLs);
   FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization);

   // Support caching of term results so that we can optimize searches which
   // build upon a previous search. Each entry in this map represents one
   // search term from the most recent search. For example, if the user had
   // typed "google blog trans" and then typed an additional 'l' (at the end,
   // of course) then there would be four items in the cache: 'blog', 'google',
   // 'trans', and 'transl'. All would be marked as being in use except for the
   // 'trans' item; its cached data would have been used when optimizing the
   // construction of the search results candidates for 'transl' but then would
   // no longer needed.
   //
   // Items stored in the search term cache. If a search term exactly matches one
   // in the cache then we can quickly supply the proper |history_id_set_| (and
   // marking the cache item as being |used_|. If we find a prefix for a search
   // term in the cache (which is very likely to occur as the user types each
   // term into the omnibox) then we can short-circuit the index search for those
   // characters in the prefix by returning the |word_id_set|. In that case we do
   // not mark the item as being |used_|.
   struct SearchTermCacheItem {
     SearchTermCacheItem(const WordIDSet& word_id_set,
                         const HistoryIDSet& history_id_set);
     // Creates a cache item for a term which has no results.
     SearchTermCacheItem();
     SearchTermCacheItem(const SearchTermCacheItem& other);

     ~SearchTermCacheItem();

     // Estimates dynamic memory usage.
     // See base/trace_event/memory_usage_estimator.h for more info.
     size_t EstimateMemoryUsage() const;

     WordIDSet word_id_set_;
     HistoryIDSet history_id_set_;
     bool used_;  // True if this item has been used for the current term search.
   };
   typedef std::map<base::string16, SearchTermCacheItem> SearchTermCacheMap;

   // A helper predicate class used to filter excess history items when the
   // candidate results set is too large.
   class HistoryItemFactorGreater {
    public:
     explicit HistoryItemFactorGreater(const HistoryInfoMap& history_info_map);
     ~HistoryItemFactorGreater();

     bool operator()(const HistoryID h1, const HistoryID h2);

    private:
     const HistoryInfoMap& history_info_map_;
   };

   // URL History indexing support functions.

   // Composes a vector of history item IDs by intersecting the set for each word
   // in |unsorted_words|.
   HistoryIDVector HistoryIDsFromWords(const String16Vector& unsorted_words);

   // Trims the candidate pool in advance of doing proper substring searching, to
   // cap the cost of such searching. Discards the least-relevant items (based on
   // visit stats), which are least likely to score highly in the end.  To
   // minimize the risk of discarding a valuable URL, the candidate pool is still
   // left two orders of magnitude larger than the final number of results
   // returned from the HQP. Returns whether anything was trimmed.
   bool TrimHistoryIdsPool(HistoryIDVector* history_ids) const;

   // Helper function to HistoryIDSetFromWords which composes a set of history
   // ids for the given term given in |term|.
   HistoryIDSet HistoryIDsForTerm(const base::string16& term);

   // Given a set of Char16s, finds words containing those characters.
   WordIDSet WordIDSetForTermChars(const Char16Set& term_chars);

   // Helper function for HistoryItemsForTerms().  Fills in |scored_items| from
   // the matches listed in |history_ids|.
   void HistoryIdsToScoredMatches(HistoryIDVector history_ids,
                                  const base::string16& lower_raw_string,
                                  const TemplateURLService* template_url_service,
                                  bookmarks::BookmarkModel* bookmark_model,
                                  ScoredHistoryMatches* scored_items) const;

   // Fills in |terms_to_word_starts_offsets| according to where the word starts
   // in each term.  For example, in the term "-foo" the word starts at offset 1.
   static void CalculateWordStartsOffsets(
       const String16Vector& terms,
       WordStarts* terms_to_word_starts_offsets);

   static void CalculateWordStartsOffsets(
       const String16Vector& terms,
       bool force_break_on_underscore,
       WordStarts* terms_to_word_starts_offsets);

   // Indexes one URL history item as described by |row|. Returns true if the
   // row was actually indexed. |scheme_whitelist| is used to filter
   // non-qualifying schemes.  If |history_db| is not NULL then this function
   // uses the history database synchronously to get the URL's recent visits
   // information.  This mode should/ only be used on the historyDB thread.
   // If |history_db| is NULL, then this function uses |history_service| to
   // schedule a task on the historyDB thread to fetch and update the recent
   // visits information.
   bool IndexRow(history::HistoryDatabase* history_db,
                 history::HistoryService* history_service,
                 const history::URLRow& row,
                 const std::set<std::string>& scheme_whitelist,
                 base::CancelableTaskTracker* tracker);

   // Parses and indexes the words in the URL and page title of |row| and
   // calculate the word starts in each, saving the starts in |word_starts|.
   void AddRowWordsToIndex(const history::URLRow& row,
                           RowWordStarts* word_starts);

   // Given a single word in |uni_word|, adds a reference for the containing
   // history item identified by |history_id| to the index.
   void AddWordToIndex(const base::string16& uni_word, HistoryID history_id);

   // Adds a new entry to |word_list_|. Uses previously freed positions if
   // available.
   WordID AddNewWordToWordList(const base::string16& term);

   // Removes |row| and all associated words and characters from the index.
   void RemoveRowFromIndex(const history::URLRow& row);

   // Removes all words and characters associated with |row| from the index.
   void RemoveRowWordsFromIndex(const history::URLRow& row);

   // Clears |used_| for each item in the search term cache.
   void ResetSearchTermCache();

   // Caches the index private data and writes the cache file to the profile
   // directory.  Called by WritePrivateDataToCacheFileTask.
   bool SaveToFile(const base::FilePath& file_path);

   // Encode a data structure into the protobuf |cache|.
   void SavePrivateData(
       in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
   void SaveWordList(
       in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
   void SaveWordMap(in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
   void SaveCharWordMap(
       in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
   void SaveWordIDHistoryMap(
       in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
   void SaveHistoryInfoMap(
       in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
   void SaveWordStartsMap(
       in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;

   // Decode a data structure from the protobuf |cache|. Return false if there
   // is any kind of failure.
   bool RestorePrivateData(
       const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
   bool RestoreWordList(
       const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
   bool RestoreWordMap(
       const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
   bool RestoreCharWordMap(
       const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
   bool RestoreWordIDHistoryMap(
       const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
   bool RestoreHistoryInfoMap(
       const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
   bool RestoreWordStartsMap(
       const in_memory_url_index::InMemoryURLIndexCacheItem& cache);

   // Determines if |gurl| has a whitelisted scheme and returns true if so.
   static bool URLSchemeIsWhitelisted(const GURL& gurl,
                                      const std::set<std::string>& whitelist);

   // Returns true if the URL associated with |history_id| is missing, malformed,
   // or otherwise should not be displayed.  (Results from the default search
   // provider fall into this category.)
   bool ShouldFilter(const HistoryID history_id,
                     const TemplateURLService* template_url_service) const;

   // Cache of search terms.
   SearchTermCacheMap search_term_cache_;

   // Start of data members that are cached -------------------------------------

   // The version of the cache file most recently used to restore this instance
   // of the private data. If the private data was rebuilt from the history
   // database this will be 0.
   int restored_cache_version_;

   // The last time the data was rebuilt from the history database.
   base::Time last_time_rebuilt_from_history_;

   // A list of all of indexed words. The index of a word in this list is the
   // ID of the word in the word_map_. It reduces the memory overhead by
   // replacing a potentially long and repeated string with a simple index.
   String16Vector word_list_;

   // A list of available words slots in |word_list_|. An available word slot
   // is the index of a unused word in word_list_ vector, also referred to as
   // a WordID. As URL visits are added or modified new words may be added to
   // the index, in which case any available words are used, if any, and then
   // words are added to the end of the word_list_. When URL visits are
   // modified or deleted old words may be removed from the index, in which
   // case the slots for those words are added to available_words_ for reuse
   // by future URL updates.
   base::stack<WordID> available_words_;

   // A one-to-one mapping from the a word string to its slot number (i.e.
   // WordID) in the |word_list_|.
   WordMap word_map_;

   // A one-to-many mapping from a single character to all WordIDs of words
   // containing that character.
   CharWordIDMap char_word_map_;

   // A one-to-many mapping from a WordID to all HistoryIDs (the row_id as
   // used in the history database) of history items in which the word occurs.
   WordIDHistoryMap word_id_history_map_;

   // A one-to-many mapping from a HistoryID to all WordIDs of words that occur
   // in the URL and/or page title of the history item referenced by that
   // HistoryID.
   HistoryIDWordMap history_id_word_map_;

   // A one-to-one mapping from HistoryID to the history item data governing
   // index inclusion and relevance scoring.
   HistoryInfoMap history_info_map_;

   // A one-to-one mapping from HistoryID to the word starts detected in each
   // item's URL and page title.
   WordStartsMap word_starts_map_;

   // End of data members that are cached ---------------------------------------

   // For unit testing only. Specifies the version of the cache file to be saved.
   // Used only for testing upgrading of an older version of the cache upon
   // restore.
   int saved_cache_version_;
 };

 #endif  // COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_
	// Copyright (c) 2012 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#ifndef COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_
	#define COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_

	#include <stddef.h>

	#include <map>
	#include <set>
	#include <string>

	#include "base/containers/stack.h"
	#include "base/files/file_path.h"
	#include "base/gtest_prod_util.h"
	#include "base/memory/ref_counted.h"
	#include "components/history/core/browser/history_service.h"
	#include "components/omnibox/browser/in_memory_url_index_cache.pb.h"
	#include "components/omnibox/browser/in_memory_url_index_types.h"
	#include "components/omnibox/browser/scored_history_match.h"

	class HistoryQuickProviderTest;
	class TemplateURLService;

	namespace bookmarks {
	class BookmarkModel;
	}

	namespace in_memory_url_index {
	class InMemoryURLIndexCacheItem;
	}

	namespace history {
	class HistoryDatabase;
	class InMemoryURLIndex;
	}

	// Current version of the cache file.
	static const int kCurrentCacheFileVersion = 5;

	// A structure private to InMemoryURLIndex describing its internal data and
	// providing for restoring, rebuilding and updating that internal data. As
	// this class is for exclusive use by the InMemoryURLIndex class there should
	// be no calls from any other class.
	//
	// All public member functions are called on the main thread unless otherwise
	// annotated.
	class URLIndexPrivateData
	: public base::RefCountedThreadSafe<URLIndexPrivateData> {
	public:
	// The maximum number of recent visits stored. Public so that
	// ScoredHistoryMatch can enuse that this number is greater than the number
	// of visits it wants to use for scoring.
	static constexpr size_t kMaxVisitsToStoreInCache = 10;

	URLIndexPrivateData();

	// Given a base::string16 in \|term_string\|, scans the history index and
	// returns a vector with all scored, matching history items. The
	// \|term_string\| is broken down into individual terms (words), each of which
	// must occur in the candidate history item's URL or page title for the item
	// to qualify; however, the terms do not necessarily have to be adjacent. We
	// also allow breaking \|term_string\| at \|cursor_position\| (if
	// set). Once we have a set of candidates, they are filtered to ensure
	// that all \|term_string\| terms, as separated by whitespace and the
	// cursor (if set), occur within the candidate's URL or page title.
	// Scores are then calculated on no more than \|kItemsToScoreLimit\|
	// candidates, as the scoring of such a large number of candidates may
	// cause perceptible typing response delays in the omnibox. This is
	// likely to occur for short omnibox terms such as 'h' and 'w' which
	// will be found in nearly all history candidates. Results are sorted by
	// descending score. The full results set (i.e. beyond the
	// \|kItemsToScoreLimit\| limit) will be retained and used for subsequent calls
	// to this function. In total, \|max_matches\| of items will be returned in the
	// \|ScoredHistoryMatches\| vector.
	ScoredHistoryMatches HistoryItemsForTerms(
	base::string16 term_string,
	size_t cursor_position,
	size_t max_matches,
	bookmarks::BookmarkModel* bookmark_model,
	TemplateURLService* template_url_service);

	// Adds the history item in \|row\| to the index if it does not already already
	// exist and it meets the minimum 'quick' criteria. If the row already exists
	// in the index then the index will be updated if the row still meets the
	// criteria, otherwise the row will be removed from the index. Returns true
	// if the index was actually updated. \|scheme_whitelist\| is used to filter
	// non-qualifying schemes. \|history_service\| is used to schedule an update to
	// the recent visits component of this URL's entry in the index.
	bool UpdateURL(history::HistoryService* history_service,
	const history::URLRow& row,
	const std::set<std::string>& scheme_whitelist,
	base::CancelableTaskTracker* tracker);

	// Updates the entry for \|url_id\| in the index, replacing its
	// recent visits information with \|recent_visits\|. If \|url_id\|
	// is not in the index, does nothing.
	void UpdateRecentVisits(history::URLID url_id,
	const history::VisitVector& recent_visits);

	// Using \|history_service\| schedules an update (using the historyDB
	// thread) for the recent visits information for \|url_id\|. Unless
	// something unexpectedly goes wrong, UdpateRecentVisits() should
	// eventually be called from a callback.
	void ScheduleUpdateRecentVisits(history::HistoryService* history_service,
	history::URLID url_id,
	base::CancelableTaskTracker* tracker);

	// Deletes index data for the history item with the given \|url\|.
	// The item may not have actually been indexed, which is the case if it did
	// not previously meet minimum 'quick' criteria. Returns true if the index
	// was actually updated.
	bool DeleteURL(const GURL& url);

	// Constructs a new object by restoring its contents from the cache file
	// at \|path\|. Returns the new URLIndexPrivateData which on success will
	// contain the restored data but upon failure will be empty.
	// This function should be run on the the file thread.
	static scoped_refptr<URLIndexPrivateData> RestoreFromFile(
	const base::FilePath& path);

	// Constructs a new object by rebuilding its contents from the history
	// database in \|history_db\|. Returns the new URLIndexPrivateData which on
	// success will contain the rebuilt data but upon failure will be empty.
	static scoped_refptr<URLIndexPrivateData> RebuildFromHistory(
	history::HistoryDatabase* history_db,
	const std::set<std::string>& scheme_whitelist);

	// Writes \|private_data\| as a cache file to \|file_path\| and returns success.
	static bool WritePrivateDataToCacheFileTask(
	scoped_refptr<URLIndexPrivateData> private_data,
	const base::FilePath& file_path);

	// Creates a copy of ourself.
	scoped_refptr<URLIndexPrivateData> Duplicate() const;

	// Returns true if there is no data in the index.
	bool Empty() const;

	// Initializes all index data members in preparation for restoring the index
	// from the cache or a complete rebuild from the history database.
	void Clear();

	// Estimates dynamic memory usage.
	// See base/trace_event/memory_usage_estimator.h for more info.
	size_t EstimateMemoryUsage() const;

	private:
	friend class base::RefCountedThreadSafe<URLIndexPrivateData>;
	~URLIndexPrivateData();

	friend class ::HistoryQuickProviderTest;
	friend class InMemoryURLIndexTest;
	FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore);
	FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CalculateWordStartsOffsets);
	FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest,
	CalculateWordStartsOffsetsUnderscore);
	FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, HugeResultSet);
	FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, ReadVisitsFromHistory);
	FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, RebuildFromHistoryIfCacheOld);
	FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring);
	FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch);
	FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TrimHistoryIds);
	FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);
	FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, WhitelistedURLs);
	FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization);

	// Support caching of term results so that we can optimize searches which
	// build upon a previous search. Each entry in this map represents one
	// search term from the most recent search. For example, if the user had
	// typed "google blog trans" and then typed an additional 'l' (at the end,
	// of course) then there would be four items in the cache: 'blog', 'google',
	// 'trans', and 'transl'. All would be marked as being in use except for the
	// 'trans' item; its cached data would have been used when optimizing the
	// construction of the search results candidates for 'transl' but then would
	// no longer needed.
	//
	// Items stored in the search term cache. If a search term exactly matches one
	// in the cache then we can quickly supply the proper \|history_id_set_\| (and
	// marking the cache item as being \|used_\|. If we find a prefix for a search
	// term in the cache (which is very likely to occur as the user types each
	// term into the omnibox) then we can short-circuit the index search for those
	// characters in the prefix by returning the \|word_id_set\|. In that case we do
	// not mark the item as being \|used_\|.
	struct SearchTermCacheItem {
	SearchTermCacheItem(const WordIDSet& word_id_set,
	const HistoryIDSet& history_id_set);
	// Creates a cache item for a term which has no results.
	SearchTermCacheItem();
	SearchTermCacheItem(const SearchTermCacheItem& other);

	~SearchTermCacheItem();

	// Estimates dynamic memory usage.
	// See base/trace_event/memory_usage_estimator.h for more info.
	size_t EstimateMemoryUsage() const;

	WordIDSet word_id_set_;
	HistoryIDSet history_id_set_;
	bool used_; // True if this item has been used for the current term search.
	};
	typedef std::map<base::string16, SearchTermCacheItem> SearchTermCacheMap;

	// A helper predicate class used to filter excess history items when the
	// candidate results set is too large.
	class HistoryItemFactorGreater {
	public:
	explicit HistoryItemFactorGreater(const HistoryInfoMap& history_info_map);
	~HistoryItemFactorGreater();

	bool operator()(const HistoryID h1, const HistoryID h2);

	private:
	const HistoryInfoMap& history_info_map_;
	};

	// URL History indexing support functions.

	// Composes a vector of history item IDs by intersecting the set for each word
	// in \|unsorted_words\|.
	HistoryIDVector HistoryIDsFromWords(const String16Vector& unsorted_words);

	// Trims the candidate pool in advance of doing proper substring searching, to
	// cap the cost of such searching. Discards the least-relevant items (based on
	// visit stats), which are least likely to score highly in the end. To
	// minimize the risk of discarding a valuable URL, the candidate pool is still
	// left two orders of magnitude larger than the final number of results
	// returned from the HQP. Returns whether anything was trimmed.
	bool TrimHistoryIdsPool(HistoryIDVector* history_ids) const;

	// Helper function to HistoryIDSetFromWords which composes a set of history
	// ids for the given term given in \|term\|.
	HistoryIDSet HistoryIDsForTerm(const base::string16& term);

	// Given a set of Char16s, finds words containing those characters.
	WordIDSet WordIDSetForTermChars(const Char16Set& term_chars);

	// Helper function for HistoryItemsForTerms(). Fills in \|scored_items\| from
	// the matches listed in \|history_ids\|.
	void HistoryIdsToScoredMatches(HistoryIDVector history_ids,
	const base::string16& lower_raw_string,
	const TemplateURLService* template_url_service,
	bookmarks::BookmarkModel* bookmark_model,
	ScoredHistoryMatches* scored_items) const;

	// Fills in \|terms_to_word_starts_offsets\| according to where the word starts
	// in each term. For example, in the term "-foo" the word starts at offset 1.
	static void CalculateWordStartsOffsets(
	const String16Vector& terms,
	WordStarts* terms_to_word_starts_offsets);

	static void CalculateWordStartsOffsets(
	const String16Vector& terms,
	bool force_break_on_underscore,
	WordStarts* terms_to_word_starts_offsets);

	// Indexes one URL history item as described by \|row\|. Returns true if the
	// row was actually indexed. \|scheme_whitelist\| is used to filter
	// non-qualifying schemes. If \|history_db\| is not NULL then this function
	// uses the history database synchronously to get the URL's recent visits
	// information. This mode should/ only be used on the historyDB thread.
	// If \|history_db\| is NULL, then this function uses \|history_service\| to
	// schedule a task on the historyDB thread to fetch and update the recent
	// visits information.
	bool IndexRow(history::HistoryDatabase* history_db,
	history::HistoryService* history_service,
	const history::URLRow& row,
	const std::set<std::string>& scheme_whitelist,
	base::CancelableTaskTracker* tracker);

	// Parses and indexes the words in the URL and page title of \|row\| and
	// calculate the word starts in each, saving the starts in \|word_starts\|.
	void AddRowWordsToIndex(const history::URLRow& row,
	RowWordStarts* word_starts);

	// Given a single word in \|uni_word\|, adds a reference for the containing
	// history item identified by \|history_id\| to the index.
	void AddWordToIndex(const base::string16& uni_word, HistoryID history_id);

	// Adds a new entry to \|word_list_\|. Uses previously freed positions if
	// available.
	WordID AddNewWordToWordList(const base::string16& term);

	// Removes \|row\| and all associated words and characters from the index.
	void RemoveRowFromIndex(const history::URLRow& row);

	// Removes all words and characters associated with \|row\| from the index.
	void RemoveRowWordsFromIndex(const history::URLRow& row);

	// Clears \|used_\| for each item in the search term cache.
	void ResetSearchTermCache();

	// Caches the index private data and writes the cache file to the profile
	// directory. Called by WritePrivateDataToCacheFileTask.
	bool SaveToFile(const base::FilePath& file_path);

	// Encode a data structure into the protobuf \|cache\|.
	void SavePrivateData(
	in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
	void SaveWordList(
	in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
	void SaveWordMap(in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
	void SaveCharWordMap(
	in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
	void SaveWordIDHistoryMap(
	in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
	void SaveHistoryInfoMap(
	in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
	void SaveWordStartsMap(
	in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;

	// Decode a data structure from the protobuf \|cache\|. Return false if there
	// is any kind of failure.
	bool RestorePrivateData(
	const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
	bool RestoreWordList(
	const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
	bool RestoreWordMap(
	const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
	bool RestoreCharWordMap(
	const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
	bool RestoreWordIDHistoryMap(
	const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
	bool RestoreHistoryInfoMap(
	const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
	bool RestoreWordStartsMap(
	const in_memory_url_index::InMemoryURLIndexCacheItem& cache);

	// Determines if \|gurl\| has a whitelisted scheme and returns true if so.
	static bool URLSchemeIsWhitelisted(const GURL& gurl,
	const std::set<std::string>& whitelist);

	// Returns true if the URL associated with \|history_id\| is missing, malformed,
	// or otherwise should not be displayed. (Results from the default search
	// provider fall into this category.)
	bool ShouldFilter(const HistoryID history_id,
	const TemplateURLService* template_url_service) const;

	// Cache of search terms.
	SearchTermCacheMap search_term_cache_;

	// Start of data members that are cached -------------------------------------

	// The version of the cache file most recently used to restore this instance
	// of the private data. If the private data was rebuilt from the history
	// database this will be 0.
	int restored_cache_version_;

	// The last time the data was rebuilt from the history database.
	base::Time last_time_rebuilt_from_history_;

	// A list of all of indexed words. The index of a word in this list is the
	// ID of the word in the word_map_. It reduces the memory overhead by
	// replacing a potentially long and repeated string with a simple index.
	String16Vector word_list_;

	// A list of available words slots in \|word_list_\|. An available word slot
	// is the index of a unused word in word_list_ vector, also referred to as
	// a WordID. As URL visits are added or modified new words may be added to
	// the index, in which case any available words are used, if any, and then
	// words are added to the end of the word_list_. When URL visits are
	// modified or deleted old words may be removed from the index, in which
	// case the slots for those words are added to available_words_ for reuse
	// by future URL updates.
	base::stack<WordID> available_words_;

	// A one-to-one mapping from the a word string to its slot number (i.e.
	// WordID) in the \|word_list_\|.
	WordMap word_map_;

	// A one-to-many mapping from a single character to all WordIDs of words
	// containing that character.
	CharWordIDMap char_word_map_;

	// A one-to-many mapping from a WordID to all HistoryIDs (the row_id as
	// used in the history database) of history items in which the word occurs.
	WordIDHistoryMap word_id_history_map_;

	// A one-to-many mapping from a HistoryID to all WordIDs of words that occur
	// in the URL and/or page title of the history item referenced by that
	// HistoryID.
	HistoryIDWordMap history_id_word_map_;

	// A one-to-one mapping from HistoryID to the history item data governing
	// index inclusion and relevance scoring.
	HistoryInfoMap history_info_map_;

	// A one-to-one mapping from HistoryID to the word starts detected in each
	// item's URL and page title.
	WordStartsMap word_starts_map_;

	// End of data members that are cached ---------------------------------------

	// For unit testing only. Specifies the version of the cache file to be saved.
	// Used only for testing upgrading of an older version of the cache upon
	// restore.
	int saved_cache_version_;
	};

	#endif // COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_