chromeos/components/local_search_service/inverted_index.cc - chromium/src - Git at Google

 // Copyright 2020 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "chromeos/components/local_search_service/inverted_index.h"

 #include <numeric>
 #include <string>
 #include <tuple>
 #include <vector>

 #include "base/bind.h"
 #include "base/callback.h"
 #include "base/strings/string16.h"
 #include "base/strings/utf_string_conversions.h"
 #include "base/task/task_traits.h"
 #include "base/task/thread_pool.h"
 #include "chromeos/components/local_search_service/search_utils.h"

 namespace chromeos {
 namespace local_search_service {

 namespace {

 // (document-score, posting-of-all-matching-terms).
 using ScoreWithPosting = std::pair<double, Posting>;

 // Calculates TF-IDF scores for a term
 std::vector<TfidfResult> CalculateTfidf(const base::string16& term,
                                         const DocLength& doc_length,
                                         const Dictionary& dictionary) {
   std::vector<TfidfResult> results;
   // We don't apply weights to idf because the effect is likely small.
   const float idf =
       1.0 + log((1.0 + doc_length.size()) / (1.0 + dictionary.at(term).size()));

   for (const auto& item : dictionary.at(term)) {
     // If a term has a very low content weight in a doc, its effective number of
     // occurrences in the doc should be lower. Strictly speaking, the effective
     // length of the doc should be smaller too. However, for performance
     // reasons, we only apply the weight to the term occurrences but not doc
     // length.
     // TODO(jiameng): this is an expensive operation, we will need to monitor
     // its performance and optimize it.
     const double effective_term_occ = std::accumulate(
         item.second.begin(), item.second.end(), 0.0,
         [](double sum, const WeightedPosition& weighted_position) {
           return sum + weighted_position.weight;
         });
     const float tf = effective_term_occ / doc_length.at(item.first);
     results.push_back({item.first, item.second, tf * idf});
   }
   return results;
 }

 // Builds TF-IDF cache given the data. Since this function is expensive, it
 // should run on a non-blocking thread that is different than the main thread.
 TfidfCache BuildTfidf(uint32_t num_docs_from_last_update,
                       const DocLength& doc_length,
                       const Dictionary& dictionary,
                       const TermSet& terms_to_be_updated,
                       const TfidfCache& tfidf_cache) {
   // TODO(crbug/1137560): consider moving the helper functions inside the class
   // so that we can use SequenceChecker.
   TfidfCache new_cache(tfidf_cache);
   // If number of documents doesn't change from the last time index was built,
   // we only need to update terms in |terms_to_be_updated|. Otherwise we need
   // to rebuild the index.
   if (num_docs_from_last_update == doc_length.size()) {
     for (const auto& term : terms_to_be_updated) {
       if (dictionary.find(term) != dictionary.end()) {
         new_cache[term] = CalculateTfidf(term, doc_length, dictionary);
       } else {
         new_cache.erase(term);
       }
     }
   } else {
     new_cache.clear();
     for (const auto& item : dictionary) {
       new_cache[item.first] =
           CalculateTfidf(item.first, doc_length, dictionary);
     }
   }
   return new_cache;
 }

 // Removes a document from document state variables given it's ID. Don't do
 // anything if the ID doesn't exist. Return true if the document is removed.
 bool RemoveDocumentIfExist(const std::string& document_id,
                            DocLength* doc_length,
                            Dictionary* dictionary,
                            TermSet* terms_to_be_updated) {
   CHECK(doc_length);
   CHECK(dictionary);
   CHECK(terms_to_be_updated);
   bool document_removed = false;
   if (doc_length->find(document_id) == doc_length->end())
     return document_removed;
   doc_length->erase(document_id);
   for (auto it = dictionary->begin(); it != dictionary->end();) {
     if (it->second.find(document_id) != it->second.end()) {
       terms_to_be_updated->insert(it->first);
       it->second.erase(document_id);
       document_removed = true;
     }

     // Removes term from the dictionary if its posting list is empty.
     if (it->second.empty()) {
       it = dictionary->erase(it);
     } else {
       it++;
     }
   }
   return document_removed;
 }

 // Given list of documents to update and document state variables, returns new
 // document state variables and number of deleted documents.
 std::pair<DocumentStateVariables, uint32_t> UpdateDocumentStateVariables(
     DocumentToUpdate&& documents_to_update,
     const DocLength& doc_length,
     Dictionary&& dictionary,
     TermSet&& terms_to_be_updated) {
   DocLength new_doc_length(doc_length);
   uint32_t num_deleted = 0u;
   for (const auto& document : documents_to_update) {
     const std::string document_id(document.first);
     bool is_deleted = RemoveDocumentIfExist(document_id, &new_doc_length,
                                             &dictionary, &terms_to_be_updated);

     // Update the document if necessary.
     if (!document.second.empty()) {
       // If document content is not empty, it is being updated but not
       // deleted.
       is_deleted = false;
       for (const auto& token : document.second) {
         dictionary[token.content][document_id] = token.positions;
         new_doc_length[document_id] += token.positions.size();
         terms_to_be_updated.insert(token.content);
       }
     }
     num_deleted += (is_deleted) ? 1 : 0;
   }

   return std::make_pair(
       std::make_tuple(std::move(new_doc_length), std::move(dictionary),
                       std::move(terms_to_be_updated)),
       num_deleted);
 }

 // Given the index variables, clear all the data.
 std::pair<DocumentStateVariables, TfidfCache> ClearData(
     DocumentToUpdate&& documents_to_update,
     const DocLength& doc_length,
     Dictionary&& dictionary,
     TermSet&& terms_to_be_updated,
     TfidfCache&& tfidf_cache) {
   DocLength new_doc_length;
   documents_to_update.clear();
   dictionary.clear();
   terms_to_be_updated.clear();
   tfidf_cache.clear();
   return std::make_pair(
       std::make_tuple(std::move(new_doc_length), std::move(dictionary),
                       std::move(terms_to_be_updated)),
       std::move(tfidf_cache));
 }
 }  // namespace

 InvertedIndex::InvertedIndex() {
   task_runner_ = base::ThreadPool::CreateSequencedTaskRunner(
       {base::TaskPriority::BEST_EFFORT, base::MayBlock(),
        base::TaskShutdownBehavior::CONTINUE_ON_SHUTDOWN});
 }
 InvertedIndex::~InvertedIndex() = default;

 PostingList InvertedIndex::FindTerm(const base::string16& term) const {
   if (dictionary_.find(term) != dictionary_.end())
     return dictionary_.at(term);

   return {};
 }

 std::vector<Result> InvertedIndex::FindMatchingDocumentsApproximately(
     const std::unordered_set<base::string16>& terms,
     double prefix_threshold,
     double block_threshold) const {
   // For each document, its score is the sum of TF-IDF scores of its terms
   // that match one of more query term.
   // The map is keyed by the document id.
   std::unordered_map<std::string, ScoreWithPosting> matching_docs;
   for (const auto& kv : tfidf_cache_) {
     const base::string16& index_term = kv.first;
     const std::vector<TfidfResult>& tfidf_results = kv.second;
     for (const auto& term : terms) {
       if (IsRelevantApproximately(term, index_term, prefix_threshold,
                                   block_threshold)) {
         // If the |index_term| is relevant, all of the enclosing documents will
         // have their ranking scores updated.
         for (const auto& docid_tfidf : tfidf_results) {
           const std::string& docid = std::get<0>(docid_tfidf);
           const Posting& posting = std::get<1>(docid_tfidf);
           const float tfidf = std::get<2>(docid_tfidf);
           auto it = matching_docs.find(docid);
           if (it == matching_docs.end()) {
             it = matching_docs.emplace(docid, ScoreWithPosting(0.0, {})).first;
           }

           auto& score_posting = it->second;
           // TODO(jiameng): add position penalty.
           score_posting.first += tfidf;
           // Also update matching positions.
           auto& existing_posting = score_posting.second;
           existing_posting.insert(existing_posting.end(), posting.begin(),
                                   posting.end());
         }
         // Break out from inner loop, i.e. no need to check other query terms.
         break;
       }
     }
   }

   std::vector<Result> sorted_matching_docs;
   for (const auto& kv : matching_docs) {
     // We don't need to include weights in the search results.
     std::vector<Position> positions;
     for (const auto& weighted_position : kv.second.second) {
       positions.emplace_back(weighted_position.position);
     }
     sorted_matching_docs.emplace_back(
         Result(kv.first, kv.second.first, positions));
   }
   std::sort(sorted_matching_docs.begin(), sorted_matching_docs.end(),
             CompareResults);
   return sorted_matching_docs;
 }

 void InvertedIndex::AddDocuments(const DocumentToUpdate& documents) {
   if (documents.empty())
     return;

   is_index_built_ = false;
   documents_to_update_.insert(documents_to_update_.end(), documents.begin(),
                               documents.end());
   InvertedIndexController();
 }

 void InvertedIndex::AddDocuments(const DocumentToUpdate& documents,
                                  base::OnceCallback<void()> callback) {
   if (documents.empty())
     return;

   task_runner_->PostTaskAndReplyWithResult(
       FROM_HERE,
       base::BindOnce(&UpdateDocumentStateVariables, documents,
                      std::move(doc_length_), std::move(dictionary_),
                      std::move(terms_to_be_updated_)),
       base::BindOnce(&InvertedIndex::OnAddDocumentsComplete,
                      weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
 }

 uint32_t InvertedIndex::RemoveDocuments(
     const std::vector<std::string>& document_ids) {
   if (document_ids.empty())
     return 0;

   for (const auto& id : document_ids) {
     // |doc_length_| could be in the process of being updated when this function
     // is called, hence we cannot use it to verify the existence of the |id|.
     documents_to_update_.push_back({id, std::vector<Token>()});
   }

   is_index_built_ = false;
   InvertedIndexController();
   return document_ids.size();
 }

 void InvertedIndex::RemoveDocuments(
     const std::vector<std::string>& document_ids,
     base::OnceCallback<void(uint32_t)> callback) {
   DocumentToUpdate documents;
   for (const auto& id : document_ids) {
     documents.push_back({id, std::vector<Token>()});
   }

   task_runner_->PostTaskAndReplyWithResult(
       FROM_HERE,
       base::BindOnce(&UpdateDocumentStateVariables, documents,
                      std::move(doc_length_), std::move(dictionary_),
                      std::move(terms_to_be_updated_)),
       base::BindOnce(&InvertedIndex::OnUpdateDocumentsComplete,
                      weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
 }

 void InvertedIndex::UpdateDocuments(
     const DocumentToUpdate& documents,
     base::OnceCallback<void(uint32_t)> callback) {
   task_runner_->PostTaskAndReplyWithResult(
       FROM_HERE,
       base::BindOnce(&UpdateDocumentStateVariables, documents,
                      std::move(doc_length_), std::move(dictionary_),
                      std::move(terms_to_be_updated_)),
       base::BindOnce(&InvertedIndex::OnUpdateDocumentsComplete,
                      weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
 }

 std::vector<TfidfResult> InvertedIndex::GetTfidf(
     const base::string16& term) const {
   if (tfidf_cache_.find(term) != tfidf_cache_.end()) {
     return tfidf_cache_.at(term);
   }

   return {};
 }

 void InvertedIndex::BuildInvertedIndex() {
   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
   request_to_build_index_ = true;
   InvertedIndexController();
 }

 void InvertedIndex::BuildInvertedIndex(base::OnceCallback<void()> callback) {
   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
   task_runner_->PostTaskAndReplyWithResult(
       FROM_HERE,
       base::BindOnce(&BuildTfidf, num_docs_from_last_update_, doc_length_,
                      dictionary_, std::move(terms_to_be_updated_),
                      tfidf_cache_),
       base::BindOnce(&InvertedIndex::OnBuildTfidfComplete,
                      weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
 }

 void InvertedIndex::ClearInvertedIndex() {
   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
   request_to_clear_index_ = true;
   InvertedIndexController();
 }

 void InvertedIndex::ClearInvertedIndex(base::OnceCallback<void()> callback) {
   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
   task_runner_->PostTaskAndReplyWithResult(
       FROM_HERE,
       base::BindOnce(&ClearData, std::move(documents_to_update_), doc_length_,
                      std::move(dictionary_), std::move(terms_to_be_updated_),
                      std::move(tfidf_cache_)),
       base::BindOnce(&InvertedIndex::OnDataCleared,
                      weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
 }

 void InvertedIndex::InvertedIndexController() {
   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
   // TODO(thanhdng): A clear-index call should ideally cancel all other update
   // operations. Need to update the code to reflect this.
   if (update_in_progress_)
     return;

   if (request_to_clear_index_) {
     update_in_progress_ = true;
     request_to_clear_index_ = false;
     task_runner_->PostTaskAndReplyWithResult(
         FROM_HERE,
         base::BindOnce(&ClearData, std::move(documents_to_update_), doc_length_,
                        std::move(dictionary_), std::move(terms_to_be_updated_),
                        std::move(tfidf_cache_)),
         base::BindOnce(&InvertedIndex::OnDataClearedSync,
                        weak_ptr_factory_.GetWeakPtr()));
     return;
   }

   if (documents_to_update_.empty()) {
     if (request_to_build_index_) {
       update_in_progress_ = true;
       index_building_in_progress_ = true;
       request_to_build_index_ = false;
       task_runner_->PostTaskAndReplyWithResult(
           FROM_HERE,
           base::BindOnce(&BuildTfidf, num_docs_from_last_update_, doc_length_,
                          dictionary_, std::move(terms_to_be_updated_),
                          tfidf_cache_),
           base::BindOnce(&InvertedIndex::OnBuildTfidfCompleteSync,
                          weak_ptr_factory_.GetWeakPtr()));
     } else if (terms_to_be_updated_.empty()) {
       // If there's no more work to do and all changed terms have been used to
       // update the index, then mark index is built.
       is_index_built_ = true;
     }
   } else {
     update_in_progress_ = true;
     task_runner_->PostTaskAndReplyWithResult(
         FROM_HERE,
         // TODO(jiameng): |doc_length_| can be moved since it's not used for
         // document existence checking any more.
         base::BindOnce(&UpdateDocumentStateVariables,
                        std::move(documents_to_update_), doc_length_,
                        std::move(dictionary_), std::move(terms_to_be_updated_)),
         base::BindOnce(&InvertedIndex::OnUpdateDocumentsCompleteSync,
                        weak_ptr_factory_.GetWeakPtr()));
   }
 }

 void InvertedIndex::OnBuildTfidfCompleteSync(TfidfCache&& new_cache) {
   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
   num_docs_from_last_update_ = doc_length_.size();
   tfidf_cache_ = std::move(new_cache);

   update_in_progress_ = false;
   index_building_in_progress_ = false;
   InvertedIndexController();
 }

 void InvertedIndex::OnBuildTfidfComplete(base::OnceCallback<void()> callback,
                                          TfidfCache&& new_cache) {
   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
   num_docs_from_last_update_ = doc_length_.size();
   tfidf_cache_ = std::move(new_cache);

   std::move(callback).Run();
 }

 void InvertedIndex::OnUpdateDocumentsCompleteSync(
     std::pair<DocumentStateVariables, uint32_t>&&
         document_state_variables_and_num_deleted) {
   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
   doc_length_ =
       std::move(std::get<0>(document_state_variables_and_num_deleted.first));
   dictionary_ =
       std::move(std::get<1>(document_state_variables_and_num_deleted.first));
   terms_to_be_updated_ =
       std::move(std::get<2>(document_state_variables_and_num_deleted.first));

   update_in_progress_ = false;
   InvertedIndexController();
 }

 void InvertedIndex::OnUpdateDocumentsComplete(
     base::OnceCallback<void(uint32_t)> callback,
     std::pair<DocumentStateVariables, uint32_t>&&
         document_state_variables_and_num_deleted) {
   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
   doc_length_ =
       std::move(std::get<0>(document_state_variables_and_num_deleted.first));
   dictionary_ =
       std::move(std::get<1>(document_state_variables_and_num_deleted.first));
   terms_to_be_updated_ =
       std::move(std::get<2>(document_state_variables_and_num_deleted.first));

   BuildInvertedIndex(base::BindOnce(
       [](base::OnceCallback<void(uint32_t)> callback, uint32_t num_deleted) {
         std::move(callback).Run(num_deleted);
       },
       std::move(callback), document_state_variables_and_num_deleted.second));
 }

 void InvertedIndex::OnAddDocumentsComplete(
     base::OnceCallback<void()> callback,
     std::pair<DocumentStateVariables, uint32_t>&&
         document_state_variables_and_num_deleted) {
   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
   DCHECK_EQ(document_state_variables_and_num_deleted.second, 0u);
   doc_length_ =
       std::move(std::get<0>(document_state_variables_and_num_deleted.first));
   dictionary_ =
       std::move(std::get<1>(document_state_variables_and_num_deleted.first));
   terms_to_be_updated_ =
       std::move(std::get<2>(document_state_variables_and_num_deleted.first));

   BuildInvertedIndex(std::move(callback));
 }

 void InvertedIndex::OnDataClearedSync(
     std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data) {
   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
   doc_length_ = std::move(std::get<0>(inverted_index_data.first));
   dictionary_ = std::move(std::get<1>(inverted_index_data.first));
   terms_to_be_updated_ = std::move(std::get<2>(inverted_index_data.first));
   tfidf_cache_ = std::move(inverted_index_data.second);

   num_docs_from_last_update_ = 0;
   update_in_progress_ = false;
   InvertedIndexController();
 }

 void InvertedIndex::OnDataCleared(
     base::OnceCallback<void()> callback,
     std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data) {
   DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
   doc_length_ = std::move(std::get<0>(inverted_index_data.first));
   dictionary_ = std::move(std::get<1>(inverted_index_data.first));
   terms_to_be_updated_ = std::move(std::get<2>(inverted_index_data.first));
   tfidf_cache_ = std::move(inverted_index_data.second);
   num_docs_from_last_update_ = 0;

   std::move(callback).Run();
 }

 }  // namespace local_search_service
 }  // namespace chromeos
	// Copyright 2020 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "chromeos/components/local_search_service/inverted_index.h"

	#include <numeric>
	#include <string>
	#include <tuple>
	#include <vector>

	#include "base/bind.h"
	#include "base/callback.h"
	#include "base/strings/string16.h"
	#include "base/strings/utf_string_conversions.h"
	#include "base/task/task_traits.h"
	#include "base/task/thread_pool.h"
	#include "chromeos/components/local_search_service/search_utils.h"

	namespace chromeos {
	namespace local_search_service {

	namespace {

	// (document-score, posting-of-all-matching-terms).
	using ScoreWithPosting = std::pair<double, Posting>;

	// Calculates TF-IDF scores for a term
	std::vector<TfidfResult> CalculateTfidf(const base::string16& term,
	const DocLength& doc_length,
	const Dictionary& dictionary) {
	std::vector<TfidfResult> results;
	// We don't apply weights to idf because the effect is likely small.
	const float idf =
	1.0 + log((1.0 + doc_length.size()) / (1.0 + dictionary.at(term).size()));

	for (const auto& item : dictionary.at(term)) {
	// If a term has a very low content weight in a doc, its effective number of
	// occurrences in the doc should be lower. Strictly speaking, the effective
	// length of the doc should be smaller too. However, for performance
	// reasons, we only apply the weight to the term occurrences but not doc
	// length.
	// TODO(jiameng): this is an expensive operation, we will need to monitor
	// its performance and optimize it.
	const double effective_term_occ = std::accumulate(
	item.second.begin(), item.second.end(), 0.0,
	[](double sum, const WeightedPosition& weighted_position) {
	return sum + weighted_position.weight;
	});
	const float tf = effective_term_occ / doc_length.at(item.first);
	results.push_back({item.first, item.second, tf * idf});
	}
	return results;
	}

	// Builds TF-IDF cache given the data. Since this function is expensive, it
	// should run on a non-blocking thread that is different than the main thread.
	TfidfCache BuildTfidf(uint32_t num_docs_from_last_update,
	const DocLength& doc_length,
	const Dictionary& dictionary,
	const TermSet& terms_to_be_updated,
	const TfidfCache& tfidf_cache) {
	// TODO(crbug/1137560): consider moving the helper functions inside the class
	// so that we can use SequenceChecker.
	TfidfCache new_cache(tfidf_cache);
	// If number of documents doesn't change from the last time index was built,
	// we only need to update terms in \|terms_to_be_updated\|. Otherwise we need
	// to rebuild the index.
	if (num_docs_from_last_update == doc_length.size()) {
	for (const auto& term : terms_to_be_updated) {
	if (dictionary.find(term) != dictionary.end()) {
	new_cache[term] = CalculateTfidf(term, doc_length, dictionary);
	} else {
	new_cache.erase(term);
	}
	}
	} else {
	new_cache.clear();
	for (const auto& item : dictionary) {
	new_cache[item.first] =
	CalculateTfidf(item.first, doc_length, dictionary);
	}
	}
	return new_cache;
	}

	// Removes a document from document state variables given it's ID. Don't do
	// anything if the ID doesn't exist. Return true if the document is removed.
	bool RemoveDocumentIfExist(const std::string& document_id,
	DocLength* doc_length,
	Dictionary* dictionary,
	TermSet* terms_to_be_updated) {
	CHECK(doc_length);
	CHECK(dictionary);
	CHECK(terms_to_be_updated);
	bool document_removed = false;
	if (doc_length->find(document_id) == doc_length->end())
	return document_removed;
	doc_length->erase(document_id);
	for (auto it = dictionary->begin(); it != dictionary->end();) {
	if (it->second.find(document_id) != it->second.end()) {
	terms_to_be_updated->insert(it->first);
	it->second.erase(document_id);
	document_removed = true;
	}

	// Removes term from the dictionary if its posting list is empty.
	if (it->second.empty()) {
	it = dictionary->erase(it);
	} else {
	it++;
	}
	}
	return document_removed;
	}

	// Given list of documents to update and document state variables, returns new
	// document state variables and number of deleted documents.
	std::pair<DocumentStateVariables, uint32_t> UpdateDocumentStateVariables(
	DocumentToUpdate&& documents_to_update,
	const DocLength& doc_length,
	Dictionary&& dictionary,
	TermSet&& terms_to_be_updated) {
	DocLength new_doc_length(doc_length);
	uint32_t num_deleted = 0u;
	for (const auto& document : documents_to_update) {
	const std::string document_id(document.first);
	bool is_deleted = RemoveDocumentIfExist(document_id, &new_doc_length,
	&dictionary, &terms_to_be_updated);

	// Update the document if necessary.
	if (!document.second.empty()) {
	// If document content is not empty, it is being updated but not
	// deleted.
	is_deleted = false;
	for (const auto& token : document.second) {
	dictionary[token.content][document_id] = token.positions;
	new_doc_length[document_id] += token.positions.size();
	terms_to_be_updated.insert(token.content);
	}
	}
	num_deleted += (is_deleted) ? 1 : 0;
	}

	return std::make_pair(
	std::make_tuple(std::move(new_doc_length), std::move(dictionary),
	std::move(terms_to_be_updated)),
	num_deleted);
	}

	// Given the index variables, clear all the data.
	std::pair<DocumentStateVariables, TfidfCache> ClearData(
	DocumentToUpdate&& documents_to_update,
	const DocLength& doc_length,
	Dictionary&& dictionary,
	TermSet&& terms_to_be_updated,
	TfidfCache&& tfidf_cache) {
	DocLength new_doc_length;
	documents_to_update.clear();
	dictionary.clear();
	terms_to_be_updated.clear();
	tfidf_cache.clear();
	return std::make_pair(
	std::make_tuple(std::move(new_doc_length), std::move(dictionary),
	std::move(terms_to_be_updated)),
	std::move(tfidf_cache));
	}
	} // namespace

	InvertedIndex::InvertedIndex() {
	task_runner_ = base::ThreadPool::CreateSequencedTaskRunner(
	{base::TaskPriority::BEST_EFFORT, base::MayBlock(),
	base::TaskShutdownBehavior::CONTINUE_ON_SHUTDOWN});
	}
	InvertedIndex::~InvertedIndex() = default;

	PostingList InvertedIndex::FindTerm(const base::string16& term) const {
	if (dictionary_.find(term) != dictionary_.end())
	return dictionary_.at(term);

	return {};
	}

	std::vector<Result> InvertedIndex::FindMatchingDocumentsApproximately(
	const std::unordered_set<base::string16>& terms,
	double prefix_threshold,
	double block_threshold) const {
	// For each document, its score is the sum of TF-IDF scores of its terms
	// that match one of more query term.
	// The map is keyed by the document id.
	std::unordered_map<std::string, ScoreWithPosting> matching_docs;
	for (const auto& kv : tfidf_cache_) {
	const base::string16& index_term = kv.first;
	const std::vector<TfidfResult>& tfidf_results = kv.second;
	for (const auto& term : terms) {
	if (IsRelevantApproximately(term, index_term, prefix_threshold,
	block_threshold)) {
	// If the \|index_term\| is relevant, all of the enclosing documents will
	// have their ranking scores updated.
	for (const auto& docid_tfidf : tfidf_results) {
	const std::string& docid = std::get<0>(docid_tfidf);
	const Posting& posting = std::get<1>(docid_tfidf);
	const float tfidf = std::get<2>(docid_tfidf);
	auto it = matching_docs.find(docid);
	if (it == matching_docs.end()) {
	it = matching_docs.emplace(docid, ScoreWithPosting(0.0, {})).first;
	}

	auto& score_posting = it->second;
	// TODO(jiameng): add position penalty.
	score_posting.first += tfidf;
	// Also update matching positions.
	auto& existing_posting = score_posting.second;
	existing_posting.insert(existing_posting.end(), posting.begin(),
	posting.end());
	}
	// Break out from inner loop, i.e. no need to check other query terms.
	break;
	}
	}
	}

	std::vector<Result> sorted_matching_docs;
	for (const auto& kv : matching_docs) {
	// We don't need to include weights in the search results.
	std::vector<Position> positions;
	for (const auto& weighted_position : kv.second.second) {
	positions.emplace_back(weighted_position.position);
	}
	sorted_matching_docs.emplace_back(
	Result(kv.first, kv.second.first, positions));
	}
	std::sort(sorted_matching_docs.begin(), sorted_matching_docs.end(),
	CompareResults);
	return sorted_matching_docs;
	}

	void InvertedIndex::AddDocuments(const DocumentToUpdate& documents) {
	if (documents.empty())
	return;

	is_index_built_ = false;
	documents_to_update_.insert(documents_to_update_.end(), documents.begin(),
	documents.end());
	InvertedIndexController();
	}

	void InvertedIndex::AddDocuments(const DocumentToUpdate& documents,
	base::OnceCallback<void()> callback) {
	if (documents.empty())
	return;

	task_runner_->PostTaskAndReplyWithResult(
	FROM_HERE,
	base::BindOnce(&UpdateDocumentStateVariables, documents,
	std::move(doc_length_), std::move(dictionary_),
	std::move(terms_to_be_updated_)),
	base::BindOnce(&InvertedIndex::OnAddDocumentsComplete,
	weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
	}

	uint32_t InvertedIndex::RemoveDocuments(
	const std::vector<std::string>& document_ids) {
	if (document_ids.empty())
	return 0;

	for (const auto& id : document_ids) {
	// \|doc_length_\| could be in the process of being updated when this function
	// is called, hence we cannot use it to verify the existence of the \|id\|.
	documents_to_update_.push_back({id, std::vector<Token>()});
	}

	is_index_built_ = false;
	InvertedIndexController();
	return document_ids.size();
	}

	void InvertedIndex::RemoveDocuments(
	const std::vector<std::string>& document_ids,
	base::OnceCallback<void(uint32_t)> callback) {
	DocumentToUpdate documents;
	for (const auto& id : document_ids) {
	documents.push_back({id, std::vector<Token>()});
	}

	task_runner_->PostTaskAndReplyWithResult(
	FROM_HERE,
	base::BindOnce(&UpdateDocumentStateVariables, documents,
	std::move(doc_length_), std::move(dictionary_),
	std::move(terms_to_be_updated_)),
	base::BindOnce(&InvertedIndex::OnUpdateDocumentsComplete,
	weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
	}

	void InvertedIndex::UpdateDocuments(
	const DocumentToUpdate& documents,
	base::OnceCallback<void(uint32_t)> callback) {
	task_runner_->PostTaskAndReplyWithResult(
	FROM_HERE,
	base::BindOnce(&UpdateDocumentStateVariables, documents,
	std::move(doc_length_), std::move(dictionary_),
	std::move(terms_to_be_updated_)),
	base::BindOnce(&InvertedIndex::OnUpdateDocumentsComplete,
	weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
	}

	std::vector<TfidfResult> InvertedIndex::GetTfidf(
	const base::string16& term) const {
	if (tfidf_cache_.find(term) != tfidf_cache_.end()) {
	return tfidf_cache_.at(term);
	}

	return {};
	}

	void InvertedIndex::BuildInvertedIndex() {
	DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
	request_to_build_index_ = true;
	InvertedIndexController();
	}

	void InvertedIndex::BuildInvertedIndex(base::OnceCallback<void()> callback) {
	DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
	task_runner_->PostTaskAndReplyWithResult(
	FROM_HERE,
	base::BindOnce(&BuildTfidf, num_docs_from_last_update_, doc_length_,
	dictionary_, std::move(terms_to_be_updated_),
	tfidf_cache_),
	base::BindOnce(&InvertedIndex::OnBuildTfidfComplete,
	weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
	}

	void InvertedIndex::ClearInvertedIndex() {
	DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
	request_to_clear_index_ = true;
	InvertedIndexController();
	}

	void InvertedIndex::ClearInvertedIndex(base::OnceCallback<void()> callback) {
	DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
	task_runner_->PostTaskAndReplyWithResult(
	FROM_HERE,
	base::BindOnce(&ClearData, std::move(documents_to_update_), doc_length_,
	std::move(dictionary_), std::move(terms_to_be_updated_),
	std::move(tfidf_cache_)),
	base::BindOnce(&InvertedIndex::OnDataCleared,
	weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
	}

	void InvertedIndex::InvertedIndexController() {
	DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
	// TODO(thanhdng): A clear-index call should ideally cancel all other update
	// operations. Need to update the code to reflect this.
	if (update_in_progress_)
	return;

	if (request_to_clear_index_) {
	update_in_progress_ = true;
	request_to_clear_index_ = false;
	task_runner_->PostTaskAndReplyWithResult(
	FROM_HERE,
	base::BindOnce(&ClearData, std::move(documents_to_update_), doc_length_,
	std::move(dictionary_), std::move(terms_to_be_updated_),
	std::move(tfidf_cache_)),
	base::BindOnce(&InvertedIndex::OnDataClearedSync,
	weak_ptr_factory_.GetWeakPtr()));
	return;
	}

	if (documents_to_update_.empty()) {
	if (request_to_build_index_) {
	update_in_progress_ = true;
	index_building_in_progress_ = true;
	request_to_build_index_ = false;
	task_runner_->PostTaskAndReplyWithResult(
	FROM_HERE,
	base::BindOnce(&BuildTfidf, num_docs_from_last_update_, doc_length_,
	dictionary_, std::move(terms_to_be_updated_),
	tfidf_cache_),
	base::BindOnce(&InvertedIndex::OnBuildTfidfCompleteSync,
	weak_ptr_factory_.GetWeakPtr()));
	} else if (terms_to_be_updated_.empty()) {
	// If there's no more work to do and all changed terms have been used to
	// update the index, then mark index is built.
	is_index_built_ = true;
	}
	} else {
	update_in_progress_ = true;
	task_runner_->PostTaskAndReplyWithResult(
	FROM_HERE,
	// TODO(jiameng): \|doc_length_\| can be moved since it's not used for
	// document existence checking any more.
	base::BindOnce(&UpdateDocumentStateVariables,
	std::move(documents_to_update_), doc_length_,
	std::move(dictionary_), std::move(terms_to_be_updated_)),
	base::BindOnce(&InvertedIndex::OnUpdateDocumentsCompleteSync,
	weak_ptr_factory_.GetWeakPtr()));
	}
	}

	void InvertedIndex::OnBuildTfidfCompleteSync(TfidfCache&& new_cache) {
	DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
	num_docs_from_last_update_ = doc_length_.size();
	tfidf_cache_ = std::move(new_cache);

	update_in_progress_ = false;
	index_building_in_progress_ = false;
	InvertedIndexController();
	}

	void InvertedIndex::OnBuildTfidfComplete(base::OnceCallback<void()> callback,
	TfidfCache&& new_cache) {
	DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
	num_docs_from_last_update_ = doc_length_.size();
	tfidf_cache_ = std::move(new_cache);

	std::move(callback).Run();
	}

	void InvertedIndex::OnUpdateDocumentsCompleteSync(
	std::pair<DocumentStateVariables, uint32_t>&&
	document_state_variables_and_num_deleted) {
	DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
	doc_length_ =
	std::move(std::get<0>(document_state_variables_and_num_deleted.first));
	dictionary_ =
	std::move(std::get<1>(document_state_variables_and_num_deleted.first));
	terms_to_be_updated_ =
	std::move(std::get<2>(document_state_variables_and_num_deleted.first));

	update_in_progress_ = false;
	InvertedIndexController();
	}

	void InvertedIndex::OnUpdateDocumentsComplete(
	base::OnceCallback<void(uint32_t)> callback,
	std::pair<DocumentStateVariables, uint32_t>&&
	document_state_variables_and_num_deleted) {
	DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
	doc_length_ =
	std::move(std::get<0>(document_state_variables_and_num_deleted.first));
	dictionary_ =
	std::move(std::get<1>(document_state_variables_and_num_deleted.first));
	terms_to_be_updated_ =
	std::move(std::get<2>(document_state_variables_and_num_deleted.first));

	BuildInvertedIndex(base::BindOnce(
	[](base::OnceCallback<void(uint32_t)> callback, uint32_t num_deleted) {
	std::move(callback).Run(num_deleted);
	},
	std::move(callback), document_state_variables_and_num_deleted.second));
	}

	void InvertedIndex::OnAddDocumentsComplete(
	base::OnceCallback<void()> callback,
	std::pair<DocumentStateVariables, uint32_t>&&
	document_state_variables_and_num_deleted) {
	DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
	DCHECK_EQ(document_state_variables_and_num_deleted.second, 0u);
	doc_length_ =
	std::move(std::get<0>(document_state_variables_and_num_deleted.first));
	dictionary_ =
	std::move(std::get<1>(document_state_variables_and_num_deleted.first));
	terms_to_be_updated_ =
	std::move(std::get<2>(document_state_variables_and_num_deleted.first));

	BuildInvertedIndex(std::move(callback));
	}

	void InvertedIndex::OnDataClearedSync(
	std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data) {
	DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
	doc_length_ = std::move(std::get<0>(inverted_index_data.first));
	dictionary_ = std::move(std::get<1>(inverted_index_data.first));
	terms_to_be_updated_ = std::move(std::get<2>(inverted_index_data.first));
	tfidf_cache_ = std::move(inverted_index_data.second);

	num_docs_from_last_update_ = 0;
	update_in_progress_ = false;
	InvertedIndexController();
	}

	void InvertedIndex::OnDataCleared(
	base::OnceCallback<void()> callback,
	std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data) {
	DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
	doc_length_ = std::move(std::get<0>(inverted_index_data.first));
	dictionary_ = std::move(std::get<1>(inverted_index_data.first));
	terms_to_be_updated_ = std::move(std::get<2>(inverted_index_data.first));
	tfidf_cache_ = std::move(inverted_index_data.second);
	num_docs_from_last_update_ = 0;

	std::move(callback).Run();
	}

	} // namespace local_search_service
	} // namespace chromeos