blob: 0a1f193c08f0684cb47913f90afd0da5067d8e07 [file] [log] [blame]
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chromeos/components/local_search_service/inverted_index.h"
#include <numeric>
#include <string>
#include <tuple>
#include <vector>
#include "base/bind.h"
#include "base/callback.h"
#include "base/strings/string16.h"
#include "base/strings/utf_string_conversions.h"
#include "base/task/task_traits.h"
#include "base/task/thread_pool.h"
#include "chromeos/components/local_search_service/search_utils.h"
namespace chromeos {
namespace local_search_service {
namespace {
// (document-score, posting-of-all-matching-terms).
using ScoreWithPosting = std::pair<double, Posting>;
// Calculates TF-IDF scores for a term
std::vector<TfidfResult> CalculateTfidf(const base::string16& term,
const DocLength& doc_length,
const Dictionary& dictionary) {
std::vector<TfidfResult> results;
// We don't apply weights to idf because the effect is likely small.
const float idf =
1.0 + log((1.0 + doc_length.size()) / (1.0 + dictionary.at(term).size()));
for (const auto& item : dictionary.at(term)) {
// If a term has a very low content weight in a doc, its effective number of
// occurrences in the doc should be lower. Strictly speaking, the effective
// length of the doc should be smaller too. However, for performance
// reasons, we only apply the weight to the term occurrences but not doc
// length.
// TODO(jiameng): this is an expensive operation, we will need to monitor
// its performance and optimize it.
const double effective_term_occ = std::accumulate(
item.second.begin(), item.second.end(), 0.0,
[](double sum, const WeightedPosition& weighted_position) {
return sum + weighted_position.weight;
});
const float tf = effective_term_occ / doc_length.at(item.first);
results.push_back({item.first, item.second, tf * idf});
}
return results;
}
// Builds TF-IDF cache given the data. Since this function is expensive, it
// should run on a non-blocking thread that is different than the main thread.
TfidfCache BuildTfidf(uint32_t num_docs_from_last_update,
const DocLength& doc_length,
const Dictionary& dictionary,
const TermSet& terms_to_be_updated,
const TfidfCache& tfidf_cache) {
// TODO(crbug/1137560): consider moving the helper functions inside the class
// so that we can use SequenceChecker.
TfidfCache new_cache(tfidf_cache);
// If number of documents doesn't change from the last time index was built,
// we only need to update terms in |terms_to_be_updated|. Otherwise we need
// to rebuild the index.
if (num_docs_from_last_update == doc_length.size()) {
for (const auto& term : terms_to_be_updated) {
if (dictionary.find(term) != dictionary.end()) {
new_cache[term] = CalculateTfidf(term, doc_length, dictionary);
} else {
new_cache.erase(term);
}
}
} else {
new_cache.clear();
for (const auto& item : dictionary) {
new_cache[item.first] =
CalculateTfidf(item.first, doc_length, dictionary);
}
}
return new_cache;
}
// Removes a document from document state variables given it's ID. Don't do
// anything if the ID doesn't exist. Return true if the document is removed.
bool RemoveDocumentIfExist(const std::string& document_id,
DocLength* doc_length,
Dictionary* dictionary,
TermSet* terms_to_be_updated) {
CHECK(doc_length);
CHECK(dictionary);
CHECK(terms_to_be_updated);
bool document_removed = false;
if (doc_length->find(document_id) == doc_length->end())
return document_removed;
doc_length->erase(document_id);
for (auto it = dictionary->begin(); it != dictionary->end();) {
if (it->second.find(document_id) != it->second.end()) {
terms_to_be_updated->insert(it->first);
it->second.erase(document_id);
document_removed = true;
}
// Removes term from the dictionary if its posting list is empty.
if (it->second.empty()) {
it = dictionary->erase(it);
} else {
it++;
}
}
return document_removed;
}
// Given list of documents to update and document state variables, returns new
// document state variables and number of deleted documents.
std::pair<DocumentStateVariables, uint32_t> UpdateDocumentStateVariables(
DocumentToUpdate&& documents_to_update,
const DocLength& doc_length,
Dictionary&& dictionary,
TermSet&& terms_to_be_updated) {
DocLength new_doc_length(doc_length);
uint32_t num_deleted = 0u;
for (const auto& document : documents_to_update) {
const std::string document_id(document.first);
bool is_deleted = RemoveDocumentIfExist(document_id, &new_doc_length,
&dictionary, &terms_to_be_updated);
// Update the document if necessary.
if (!document.second.empty()) {
// If document content is not empty, it is being updated but not
// deleted.
is_deleted = false;
for (const auto& token : document.second) {
dictionary[token.content][document_id] = token.positions;
new_doc_length[document_id] += token.positions.size();
terms_to_be_updated.insert(token.content);
}
}
num_deleted += (is_deleted) ? 1 : 0;
}
return std::make_pair(
std::make_tuple(std::move(new_doc_length), std::move(dictionary),
std::move(terms_to_be_updated)),
num_deleted);
}
// Given the index variables, clear all the data.
std::pair<DocumentStateVariables, TfidfCache> ClearData(
DocumentToUpdate&& documents_to_update,
const DocLength& doc_length,
Dictionary&& dictionary,
TermSet&& terms_to_be_updated,
TfidfCache&& tfidf_cache) {
DocLength new_doc_length;
documents_to_update.clear();
dictionary.clear();
terms_to_be_updated.clear();
tfidf_cache.clear();
return std::make_pair(
std::make_tuple(std::move(new_doc_length), std::move(dictionary),
std::move(terms_to_be_updated)),
std::move(tfidf_cache));
}
} // namespace
InvertedIndex::InvertedIndex() {
task_runner_ = base::ThreadPool::CreateSequencedTaskRunner(
{base::TaskPriority::BEST_EFFORT, base::MayBlock(),
base::TaskShutdownBehavior::CONTINUE_ON_SHUTDOWN});
}
InvertedIndex::~InvertedIndex() = default;
PostingList InvertedIndex::FindTerm(const base::string16& term) const {
if (dictionary_.find(term) != dictionary_.end())
return dictionary_.at(term);
return {};
}
std::vector<Result> InvertedIndex::FindMatchingDocumentsApproximately(
const std::unordered_set<base::string16>& terms,
double prefix_threshold,
double block_threshold) const {
// For each document, its score is the sum of TF-IDF scores of its terms
// that match one of more query term.
// The map is keyed by the document id.
std::unordered_map<std::string, ScoreWithPosting> matching_docs;
for (const auto& kv : tfidf_cache_) {
const base::string16& index_term = kv.first;
const std::vector<TfidfResult>& tfidf_results = kv.second;
for (const auto& term : terms) {
if (IsRelevantApproximately(term, index_term, prefix_threshold,
block_threshold)) {
// If the |index_term| is relevant, all of the enclosing documents will
// have their ranking scores updated.
for (const auto& docid_tfidf : tfidf_results) {
const std::string& docid = std::get<0>(docid_tfidf);
const Posting& posting = std::get<1>(docid_tfidf);
const float tfidf = std::get<2>(docid_tfidf);
auto it = matching_docs.find(docid);
if (it == matching_docs.end()) {
it = matching_docs.emplace(docid, ScoreWithPosting(0.0, {})).first;
}
auto& score_posting = it->second;
// TODO(jiameng): add position penalty.
score_posting.first += tfidf;
// Also update matching positions.
auto& existing_posting = score_posting.second;
existing_posting.insert(existing_posting.end(), posting.begin(),
posting.end());
}
// Break out from inner loop, i.e. no need to check other query terms.
break;
}
}
}
std::vector<Result> sorted_matching_docs;
for (const auto& kv : matching_docs) {
// We don't need to include weights in the search results.
std::vector<Position> positions;
for (const auto& weighted_position : kv.second.second) {
positions.emplace_back(weighted_position.position);
}
sorted_matching_docs.emplace_back(
Result(kv.first, kv.second.first, positions));
}
std::sort(sorted_matching_docs.begin(), sorted_matching_docs.end(),
CompareResults);
return sorted_matching_docs;
}
void InvertedIndex::AddDocuments(const DocumentToUpdate& documents) {
if (documents.empty())
return;
is_index_built_ = false;
documents_to_update_.insert(documents_to_update_.end(), documents.begin(),
documents.end());
InvertedIndexController();
}
void InvertedIndex::AddDocuments(const DocumentToUpdate& documents,
base::OnceCallback<void()> callback) {
if (documents.empty())
return;
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(&UpdateDocumentStateVariables, documents,
std::move(doc_length_), std::move(dictionary_),
std::move(terms_to_be_updated_)),
base::BindOnce(&InvertedIndex::OnAddDocumentsComplete,
weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
}
uint32_t InvertedIndex::RemoveDocuments(
const std::vector<std::string>& document_ids) {
if (document_ids.empty())
return 0;
for (const auto& id : document_ids) {
// |doc_length_| could be in the process of being updated when this function
// is called, hence we cannot use it to verify the existence of the |id|.
documents_to_update_.push_back({id, std::vector<Token>()});
}
is_index_built_ = false;
InvertedIndexController();
return document_ids.size();
}
void InvertedIndex::RemoveDocuments(
const std::vector<std::string>& document_ids,
base::OnceCallback<void(uint32_t)> callback) {
DocumentToUpdate documents;
for (const auto& id : document_ids) {
documents.push_back({id, std::vector<Token>()});
}
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(&UpdateDocumentStateVariables, documents,
std::move(doc_length_), std::move(dictionary_),
std::move(terms_to_be_updated_)),
base::BindOnce(&InvertedIndex::OnUpdateDocumentsComplete,
weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
}
void InvertedIndex::UpdateDocuments(
const DocumentToUpdate& documents,
base::OnceCallback<void(uint32_t)> callback) {
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(&UpdateDocumentStateVariables, documents,
std::move(doc_length_), std::move(dictionary_),
std::move(terms_to_be_updated_)),
base::BindOnce(&InvertedIndex::OnUpdateDocumentsComplete,
weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
}
std::vector<TfidfResult> InvertedIndex::GetTfidf(
const base::string16& term) const {
if (tfidf_cache_.find(term) != tfidf_cache_.end()) {
return tfidf_cache_.at(term);
}
return {};
}
void InvertedIndex::BuildInvertedIndex() {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
request_to_build_index_ = true;
InvertedIndexController();
}
void InvertedIndex::BuildInvertedIndex(base::OnceCallback<void()> callback) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(&BuildTfidf, num_docs_from_last_update_, doc_length_,
dictionary_, std::move(terms_to_be_updated_),
tfidf_cache_),
base::BindOnce(&InvertedIndex::OnBuildTfidfComplete,
weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
}
void InvertedIndex::ClearInvertedIndex() {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
request_to_clear_index_ = true;
InvertedIndexController();
}
void InvertedIndex::ClearInvertedIndex(base::OnceCallback<void()> callback) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(&ClearData, std::move(documents_to_update_), doc_length_,
std::move(dictionary_), std::move(terms_to_be_updated_),
std::move(tfidf_cache_)),
base::BindOnce(&InvertedIndex::OnDataCleared,
weak_ptr_factory_.GetWeakPtr(), std::move(callback)));
}
void InvertedIndex::InvertedIndexController() {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
// TODO(thanhdng): A clear-index call should ideally cancel all other update
// operations. Need to update the code to reflect this.
if (update_in_progress_)
return;
if (request_to_clear_index_) {
update_in_progress_ = true;
request_to_clear_index_ = false;
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(&ClearData, std::move(documents_to_update_), doc_length_,
std::move(dictionary_), std::move(terms_to_be_updated_),
std::move(tfidf_cache_)),
base::BindOnce(&InvertedIndex::OnDataClearedSync,
weak_ptr_factory_.GetWeakPtr()));
return;
}
if (documents_to_update_.empty()) {
if (request_to_build_index_) {
update_in_progress_ = true;
index_building_in_progress_ = true;
request_to_build_index_ = false;
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
base::BindOnce(&BuildTfidf, num_docs_from_last_update_, doc_length_,
dictionary_, std::move(terms_to_be_updated_),
tfidf_cache_),
base::BindOnce(&InvertedIndex::OnBuildTfidfCompleteSync,
weak_ptr_factory_.GetWeakPtr()));
} else if (terms_to_be_updated_.empty()) {
// If there's no more work to do and all changed terms have been used to
// update the index, then mark index is built.
is_index_built_ = true;
}
} else {
update_in_progress_ = true;
task_runner_->PostTaskAndReplyWithResult(
FROM_HERE,
// TODO(jiameng): |doc_length_| can be moved since it's not used for
// document existence checking any more.
base::BindOnce(&UpdateDocumentStateVariables,
std::move(documents_to_update_), doc_length_,
std::move(dictionary_), std::move(terms_to_be_updated_)),
base::BindOnce(&InvertedIndex::OnUpdateDocumentsCompleteSync,
weak_ptr_factory_.GetWeakPtr()));
}
}
void InvertedIndex::OnBuildTfidfCompleteSync(TfidfCache&& new_cache) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
num_docs_from_last_update_ = doc_length_.size();
tfidf_cache_ = std::move(new_cache);
update_in_progress_ = false;
index_building_in_progress_ = false;
InvertedIndexController();
}
void InvertedIndex::OnBuildTfidfComplete(base::OnceCallback<void()> callback,
TfidfCache&& new_cache) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
num_docs_from_last_update_ = doc_length_.size();
tfidf_cache_ = std::move(new_cache);
std::move(callback).Run();
}
void InvertedIndex::OnUpdateDocumentsCompleteSync(
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
doc_length_ =
std::move(std::get<0>(document_state_variables_and_num_deleted.first));
dictionary_ =
std::move(std::get<1>(document_state_variables_and_num_deleted.first));
terms_to_be_updated_ =
std::move(std::get<2>(document_state_variables_and_num_deleted.first));
update_in_progress_ = false;
InvertedIndexController();
}
void InvertedIndex::OnUpdateDocumentsComplete(
base::OnceCallback<void(uint32_t)> callback,
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
doc_length_ =
std::move(std::get<0>(document_state_variables_and_num_deleted.first));
dictionary_ =
std::move(std::get<1>(document_state_variables_and_num_deleted.first));
terms_to_be_updated_ =
std::move(std::get<2>(document_state_variables_and_num_deleted.first));
BuildInvertedIndex(base::BindOnce(
[](base::OnceCallback<void(uint32_t)> callback, uint32_t num_deleted) {
std::move(callback).Run(num_deleted);
},
std::move(callback), document_state_variables_and_num_deleted.second));
}
void InvertedIndex::OnAddDocumentsComplete(
base::OnceCallback<void()> callback,
std::pair<DocumentStateVariables, uint32_t>&&
document_state_variables_and_num_deleted) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
DCHECK_EQ(document_state_variables_and_num_deleted.second, 0u);
doc_length_ =
std::move(std::get<0>(document_state_variables_and_num_deleted.first));
dictionary_ =
std::move(std::get<1>(document_state_variables_and_num_deleted.first));
terms_to_be_updated_ =
std::move(std::get<2>(document_state_variables_and_num_deleted.first));
BuildInvertedIndex(std::move(callback));
}
void InvertedIndex::OnDataClearedSync(
std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
doc_length_ = std::move(std::get<0>(inverted_index_data.first));
dictionary_ = std::move(std::get<1>(inverted_index_data.first));
terms_to_be_updated_ = std::move(std::get<2>(inverted_index_data.first));
tfidf_cache_ = std::move(inverted_index_data.second);
num_docs_from_last_update_ = 0;
update_in_progress_ = false;
InvertedIndexController();
}
void InvertedIndex::OnDataCleared(
base::OnceCallback<void()> callback,
std::pair<DocumentStateVariables, TfidfCache>&& inverted_index_data) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
doc_length_ = std::move(std::get<0>(inverted_index_data.first));
dictionary_ = std::move(std::get<1>(inverted_index_data.first));
terms_to_be_updated_ = std::move(std::get<2>(inverted_index_data.first));
tfidf_cache_ = std::move(inverted_index_data.second);
num_docs_from_last_update_ = 0;
std::move(callback).Run();
}
} // namespace local_search_service
} // namespace chromeos