| // Copyright 2020 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "chromeos/components/local_search_service/inverted_index_search.h" |
| |
| #include <utility> |
| #include <vector> |
| |
| #include "base/bind.h" |
| #include "base/i18n/rtl.h" |
| #include "base/optional.h" |
| #include "base/strings/string_split.h" |
| #include "base/strings/string_util.h" |
| #include "base/task/task_traits.h" |
| #include "base/task/thread_pool.h" |
| #include "base/task_runner_util.h" |
| #include "base/time/time.h" |
| #include "chromeos/components/local_search_service/content_extraction_utils.h" |
| #include "chromeos/components/local_search_service/inverted_index.h" |
| #include "chromeos/components/string_matching/tokenized_string.h" |
| |
| namespace chromeos { |
| namespace local_search_service { |
| |
| namespace { |
| |
| using chromeos::string_matching::TokenizedString; |
| using ExtractedContent = |
| std::vector<std::pair<std::string, std::vector<Token>>>; |
| |
| std::vector<Token> ExtractDocumentTokens(const Data& data) { |
| // Use input locale unless it's empty. In this case we will use system |
| // default locale. |
| const std::string locale = |
| data.locale.empty() ? base::i18n::GetConfiguredLocale() : data.locale; |
| std::vector<Token> document_tokens; |
| for (const Content& content : data.contents) { |
| DCHECK_GE(content.weight, 0); |
| DCHECK_LE(content.weight, 1); |
| const std::vector<Token> content_tokens = |
| ExtractContent(content.id, content.content, content.weight, locale); |
| document_tokens.insert(document_tokens.end(), content_tokens.begin(), |
| content_tokens.end()); |
| } |
| return ConsolidateToken(document_tokens); |
| } |
| |
| ExtractedContent ExtractDocumentsContent(const std::vector<Data>& data) { |
| ExtractedContent documents; |
| for (const Data& d : data) { |
| const std::vector<Token> document_tokens = ExtractDocumentTokens(d); |
| documents.push_back({d.id, document_tokens}); |
| } |
| |
| return documents; |
| } |
| |
| std::unordered_set<base::string16> GetTokenizedQuery( |
| const base::string16& query) { |
| // TODO(jiameng): actual input query may not be the same as default locale. |
| // Need another way to determine actual language of the query. |
| const TokenizedString::Mode mode = |
| IsNonLatinLocale(base::i18n::GetConfiguredLocale()) |
| ? TokenizedString::Mode::kCamelCase |
| : TokenizedString::Mode::kWords; |
| |
| const TokenizedString tokenized_query(query, mode); |
| std::unordered_set<base::string16> tokens; |
| for (const auto& token : tokenized_query.tokens()) { |
| // TODO(jiameng): we are not removing stopword because they shouldn't exist |
| // in the index. However, for performance reason, it may be worth to be |
| // removed. |
| tokens.insert(token); |
| } |
| return tokens; |
| } |
| |
| } // namespace |
| |
| InvertedIndexSearch::InvertedIndexSearch(IndexId index_id) |
| : Index(index_id, Backend::kInvertedIndex), |
| inverted_index_(std::make_unique<InvertedIndex>()), |
| blocking_task_runner_(base::ThreadPool::CreateSequencedTaskRunner( |
| {base::TaskPriority::BEST_EFFORT, base::MayBlock(), |
| base::TaskShutdownBehavior::CONTINUE_ON_SHUTDOWN})) {} |
| |
| InvertedIndexSearch::~InvertedIndexSearch() { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| } |
| |
| void InvertedIndexSearch::GetSize(GetSizeCallback callback) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| std::move(callback).Run(inverted_index_->NumberDocuments()); |
| } |
| |
| void InvertedIndexSearch::AddOrUpdate(const std::vector<Data>& data, |
| AddOrUpdateCallback callback) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| DCHECK(!data.empty()); |
| base::PostTaskAndReplyWithResult( |
| blocking_task_runner_.get(), FROM_HERE, |
| base::BindOnce(&ExtractDocumentsContent, data), |
| base::BindOnce(&InvertedIndexSearch::FinalizeAddOrUpdate, |
| weak_ptr_factory_.GetWeakPtr(), std::move(callback))); |
| } |
| |
| void InvertedIndexSearch::Delete(const std::vector<std::string>& ids, |
| DeleteCallback callback) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| DCHECK(!ids.empty()); |
| blocking_task_runner_->PostTaskAndReply( |
| FROM_HERE, base::DoNothing(), |
| base::BindOnce(&InvertedIndexSearch::FinalizeDelete, |
| weak_ptr_factory_.GetWeakPtr(), std::move(callback), ids)); |
| } |
| |
| void InvertedIndexSearch::UpdateDocuments(const std::vector<Data>& data, |
| UpdateDocumentsCallback callback) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| DCHECK(!data.empty()); |
| base::PostTaskAndReplyWithResult( |
| blocking_task_runner_.get(), FROM_HERE, |
| base::BindOnce(&ExtractDocumentsContent, data), |
| base::BindOnce(&InvertedIndexSearch::FinalizeUpdateDocuments, |
| weak_ptr_factory_.GetWeakPtr(), std::move(callback))); |
| } |
| |
| void InvertedIndexSearch::Find(const base::string16& query, |
| uint32_t max_results, |
| FindCallback callback) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| const base::TimeTicks start = base::TimeTicks::Now(); |
| if (query.empty()) { |
| const ResponseStatus status = ResponseStatus::kEmptyQuery; |
| MaybeLogSearchResultsStats(status, 0u, base::TimeDelta()); |
| std::move(callback).Run(status, base::nullopt); |
| return; |
| } |
| if (inverted_index_->NumberDocuments() == 0u) { |
| const ResponseStatus status = ResponseStatus::kEmptyIndex; |
| MaybeLogSearchResultsStats(status, 0u, base::TimeDelta()); |
| std::move(callback).Run(status, base::nullopt); |
| return; |
| } |
| |
| std::vector<Result> results = |
| inverted_index_->FindMatchingDocumentsApproximately( |
| GetTokenizedQuery(query), search_params_.prefix_threshold, |
| search_params_.fuzzy_threshold); |
| |
| if (results.size() > max_results && max_results > 0u) |
| results.resize(max_results); |
| |
| const ResponseStatus status = ResponseStatus::kSuccess; |
| const base::TimeTicks end = base::TimeTicks::Now(); |
| MaybeLogSearchResultsStats(status, results.size(), end - start); |
| std::move(callback).Run(status, results); |
| } |
| |
| void InvertedIndexSearch::ClearIndex(ClearIndexCallback callback) { |
| inverted_index_->ClearInvertedIndex(); |
| std::move(callback).Run(); |
| } |
| |
| std::vector<std::pair<std::string, uint32_t>> |
| InvertedIndexSearch::FindTermForTesting(const base::string16& term) const { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| const PostingList posting_list = inverted_index_->FindTerm(term); |
| std::vector<std::pair<std::string, uint32_t>> doc_with_freq; |
| for (const auto& kv : posting_list) { |
| doc_with_freq.push_back({kv.first, kv.second.size()}); |
| } |
| |
| return doc_with_freq; |
| } |
| |
| void InvertedIndexSearch::FinalizeAddOrUpdate( |
| AddOrUpdateCallback callback, |
| const ExtractedContent& documents) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| inverted_index_->AddDocuments(documents, std::move(callback)); |
| } |
| |
| void InvertedIndexSearch::FinalizeDelete(DeleteCallback callback, |
| const std::vector<std::string>& ids) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| inverted_index_->RemoveDocuments(ids, std::move(callback)); |
| } |
| |
| void InvertedIndexSearch::FinalizeUpdateDocuments( |
| UpdateDocumentsCallback callback, |
| const ExtractedContent& documents) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| inverted_index_->UpdateDocuments(documents, std::move(callback)); |
| } |
| |
| void InvertedIndexSearch::MaybeBuildInvertedIndex() { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_); |
| if (num_queued_index_updates_ == 0) { |
| inverted_index_->BuildInvertedIndex(); |
| } |
| } |
| |
| } // namespace local_search_service |
| } // namespace chromeos |