blob: 54cb2daf32e4070ed95452c6cc56749980556f47 [file] [log] [blame]
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chromeos/components/local_search_service/inverted_index_search.h"
#include <utility>
#include "base/i18n/rtl.h"
#include "base/optional.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "chromeos/components/local_search_service/content_extraction_utils.h"
#include "chromeos/components/local_search_service/inverted_index.h"
#include "chromeos/components/string_matching/tokenized_string.h"
namespace chromeos {
namespace local_search_service {
namespace {
using chromeos::string_matching::TokenizedString;
std::vector<Token> ExtractDocumentTokens(const Data& data) {
// Use input locale unless it's empty. In this case we will use system
// default locale.
const std::string locale =
data.locale.empty() ? base::i18n::GetConfiguredLocale() : data.locale;
std::vector<Token> document_tokens;
for (const Content& content : data.contents) {
DCHECK_GE(content.weight, 0);
DCHECK_LE(content.weight, 1);
const std::vector<Token> content_tokens =
ExtractContent(content.id, content.content, content.weight, locale);
document_tokens.insert(document_tokens.end(), content_tokens.begin(),
content_tokens.end());
}
return ConsolidateToken(document_tokens);
}
} // namespace
InvertedIndexSearch::InvertedIndexSearch(IndexId index_id,
PrefService* local_state)
: Index(index_id, Backend::kInvertedIndex, local_state),
inverted_index_(std::make_unique<InvertedIndex>()) {}
InvertedIndexSearch::~InvertedIndexSearch() = default;
uint64_t InvertedIndexSearch::GetSize() {
return inverted_index_->NumberDocuments();
}
void InvertedIndexSearch::AddOrUpdate(
const std::vector<local_search_service::Data>& data) {
for (const Data& d : data) {
const std::vector<Token> document_tokens = ExtractDocumentTokens(d);
DCHECK(!document_tokens.empty());
inverted_index_->AddDocument(d.id, document_tokens);
}
inverted_index_->BuildInvertedIndex();
}
uint32_t InvertedIndexSearch::Delete(const std::vector<std::string>& ids) {
uint32_t num_deleted = 0u;
for (const auto& id : ids) {
DCHECK(!id.empty());
num_deleted += inverted_index_->RemoveDocument(id);
}
inverted_index_->BuildInvertedIndex();
return num_deleted;
}
ResponseStatus InvertedIndexSearch::Find(const base::string16& query,
uint32_t max_results,
std::vector<Result>* results) {
DCHECK(results);
results->clear();
if (query.empty()) {
return ResponseStatus::kEmptyQuery;
}
if (GetSize() == 0u)
return ResponseStatus::kEmptyIndex;
// TODO(jiameng): actual input query may not be the same as default locale.
// Need another way to determine actual language of the query.
const TokenizedString::Mode mode =
IsNonLatinLocale(base::i18n::GetConfiguredLocale())
? TokenizedString::Mode::kCamelCase
: TokenizedString::Mode::kWords;
const TokenizedString tokenized_query(query, mode);
std::unordered_set<base::string16> tokens;
for (const auto& token : tokenized_query.tokens()) {
// TODO(jiameng): we are not removing stopword because they shouldn't exist
// in the index. However, for performance reason, it may be worth to be
// removed.
tokens.insert(token);
}
*results = inverted_index_->FindMatchingDocumentsApproximately(
tokens, search_params_.prefix_threshold, search_params_.fuzzy_threshold);
if (results->size() > max_results && max_results > 0u)
results->resize(max_results);
return ResponseStatus::kSuccess;
}
std::vector<std::pair<std::string, uint32_t>>
InvertedIndexSearch::FindTermForTesting(const base::string16& term) const {
const PostingList posting_list = inverted_index_->FindTerm(term);
std::vector<std::pair<std::string, uint32_t>> doc_with_freq;
for (const auto& kv : posting_list) {
doc_with_freq.push_back({kv.first, kv.second.size()});
}
return doc_with_freq;
}
} // namespace local_search_service
} // namespace chromeos