blob: 906e2638210f523d6b7e018f16ae27c5b3fbec43 [file] [log] [blame]
// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/bookmarks/browser/titled_url_index.h"
#include "base/strings/utf_string_conversions.h"
#include <stdint.h>
#include <algorithm>
#include <iterator>
#include <unordered_set>
#include <utility>
#include "base/i18n/case_conversion.h"
#include "base/i18n/unicodestring.h"
#include "base/logging.h"
#include "base/ranges/algorithm.h"
#include "base/stl_util.h"
#include "base/strings/utf_offset_string_conversions.h"
#include "base/trace_event/memory_usage_estimator.h"
#include "build/build_config.h"
#include "components/bookmarks/browser/bookmark_utils.h"
#include "components/bookmarks/browser/titled_url_match.h"
#include "components/bookmarks/browser/titled_url_node.h"
#include "components/query_parser/snippet.h"
#include "third_party/icu/source/common/unicode/normalizer2.h"
#include "third_party/icu/source/common/unicode/utypes.h"
namespace bookmarks {
namespace {
// Return true if `prefix` is a prefix of `string`.
bool IsPrefix(const std::u16string& prefix, const std::u16string& string) {
return prefix.size() <= string.size() &&
prefix.compare(0, prefix.size(), string, 0, prefix.size()) == 0;
}
} // namespace
TitledUrlIndex::TitledUrlIndex(std::unique_ptr<TitledUrlNodeSorter> sorter)
: sorter_(std::move(sorter)) {}
TitledUrlIndex::~TitledUrlIndex() = default;
void TitledUrlIndex::SetNodeSorter(
std::unique_ptr<TitledUrlNodeSorter> sorter) {
sorter_ = std::move(sorter);
}
void TitledUrlIndex::Add(const TitledUrlNode* node) {
for (const std::u16string& term : ExtractIndexTerms(node))
RegisterNode(term, node);
}
void TitledUrlIndex::Remove(const TitledUrlNode* node) {
for (const std::u16string& term : ExtractIndexTerms(node))
UnregisterNode(term, node);
}
void TitledUrlIndex::AddPath(const TitledUrlNode* node) {
for (const std::u16string& term :
ExtractQueryWords(Normalize(node->GetTitledUrlNodeTitle()))) {
path_index_[term]++;
}
}
void TitledUrlIndex::RemovePath(const TitledUrlNode* node) {
for (const std::u16string& term :
ExtractQueryWords(Normalize(node->GetTitledUrlNodeTitle()))) {
// `path_index_.count(term)` should be > 0, since nodes can't be
// removed/renamed if they didn't exist to begin with. But some tests don't
// fully load bookmarks so it's not `DCHECK`ed.
if (path_index_.count(term) && !--path_index_[term])
path_index_.erase(term);
}
}
std::vector<TitledUrlMatch> TitledUrlIndex::GetResultsMatching(
const std::u16string& input_query,
size_t max_count,
query_parser::MatchingAlgorithm matching_algorithm) {
const std::u16string query = Normalize(input_query);
std::vector<std::u16string> terms = ExtractQueryWords(query);
if (terms.empty())
return {};
// `matches` shouldn't exclude nodes that don't match every query term, as the
// query terms may match in the ancestors. `MatchTitledUrlNodeWithQuery()`
// below will filter out nodes that neither match nor ancestor-match every
// query term.
static const size_t kMaxNodes = 1000;
TitledUrlNodeSet matches =
RetrieveNodesMatchingAnyTerms(terms, matching_algorithm, kMaxNodes);
if (matches.empty())
return {};
TitledUrlNodes sorted_nodes;
SortMatches(matches, &sorted_nodes);
// We use a QueryParser to fill in match positions for us. It's not the most
// efficient way to go about this, but by the time we get here we know what
// matches and so this shouldn't be performance critical.
query_parser::QueryNodeVector query_nodes;
query_parser::QueryParser::ParseQueryNodes(query, matching_algorithm,
&query_nodes);
return MatchTitledUrlNodesWithQuery(sorted_nodes, query_nodes, terms,
max_count);
}
// static
std::u16string TitledUrlIndex::Normalize(std::u16string_view text) {
UErrorCode status = U_ZERO_ERROR;
const icu::Normalizer2* normalizer2 =
icu::Normalizer2::getInstance(nullptr, "nfkc", UNORM2_COMPOSE, status);
if (U_FAILURE(status)) {
// Log and crash right away to capture the error code in the crash report.
LOG(FATAL) << "failed to create a normalizer: " << u_errorName(status);
}
icu::UnicodeString unicode_text(text.data(),
static_cast<int32_t>(text.length()));
icu::UnicodeString unicode_normalized_text;
normalizer2->normalize(unicode_text, unicode_normalized_text, status);
if (U_FAILURE(status)) {
// This should not happen. Log the error and fall back.
LOG(ERROR) << "normalization failed: " << u_errorName(status);
return std::u16string(text);
}
return base::i18n::UnicodeStringToString16(unicode_normalized_text);
}
void TitledUrlIndex::SortMatches(const TitledUrlNodeSet& matches,
TitledUrlNodes* sorted_nodes) const {
if (sorter_) {
sorter_->SortMatches(matches, sorted_nodes);
} else {
sorted_nodes->insert(sorted_nodes->end(), matches.begin(), matches.end());
}
}
std::vector<TitledUrlMatch> TitledUrlIndex::MatchTitledUrlNodesWithQuery(
const TitledUrlNodes& nodes,
const query_parser::QueryNodeVector& query_nodes,
const std::vector<std::u16string>& query_terms,
size_t max_count) {
// The highest typed counts should be at the beginning of the `matches` vector
// so that the best matches will always be included in the results. The loop
// that calculates match relevance in
// `HistoryContentsProvider::ConvertResults()` will run backwards to assure
// higher relevance will be attributed to the best matches.
std::vector<TitledUrlMatch> matches;
for (TitledUrlNodes::const_iterator i = nodes.begin();
i != nodes.end() && matches.size() < max_count; ++i) {
std::optional<TitledUrlMatch> match =
MatchTitledUrlNodeWithQuery(*i, query_nodes, query_terms);
if (match)
matches.emplace_back(std::move(match).value());
}
return matches;
}
std::optional<TitledUrlMatch> TitledUrlIndex::MatchTitledUrlNodeWithQuery(
const TitledUrlNode* node,
const query_parser::QueryNodeVector& query_nodes,
const std::vector<std::u16string>& query_terms) {
if (!node) {
return std::nullopt;
}
// Check that the result matches the query. The previous search
// was a simple per-word search, while the more complex matching
// of QueryParser may filter it out. For example, the query
// ["thi"] will match the title [Thinking], but since
// ["thi"] is quoted we don't want to do a prefix match.
// Clean up the title, URL, and ancestor titles in preparation for string
// comparisons.
const std::u16string lower_title =
base::i18n::ToLower(Normalize(node->GetTitledUrlNodeTitle()));
base::OffsetAdjuster::Adjustments adjustments;
const std::u16string clean_url =
CleanUpUrlForMatching(node->GetTitledUrlNodeUrl(), &adjustments);
std::vector<std::u16string> lower_ancestor_titles;
base::ranges::transform(
node->GetTitledUrlNodeAncestorTitles(),
std::back_inserter(lower_ancestor_titles),
[](const auto& ancestor_title) {
return base::i18n::ToLower(Normalize(std::u16string(ancestor_title)));
});
// Check if the input approximately matches the node. This is less strict than
// the following check; it will return false positives. But it's also much
// faster, so if it returns false, early exit and avoid the expensive
// `ExtractQueryWords()` calls.
bool approximate_match =
base::ranges::all_of(query_terms, [&](const auto& word) {
if (lower_title.find(word) != std::u16string::npos)
return true;
if (clean_url.find(word) != std::u16string::npos)
return true;
for (const auto& ancestor_title : lower_ancestor_titles) {
if (ancestor_title.find(word) != std::u16string::npos)
return true;
}
return false;
});
if (!approximate_match)
return std::nullopt;
// If `node` passed the approximate check above, to the more accurate check.
query_parser::QueryWordVector title_words, url_words, ancestor_words;
query_parser::QueryParser::ExtractQueryWords(clean_url, &url_words);
query_parser::QueryParser::ExtractQueryWords(lower_title, &title_words);
for (const auto& ancestor_title : lower_ancestor_titles) {
query_parser::QueryParser::ExtractQueryWords(ancestor_title,
&ancestor_words);
}
query_parser::Snippet::MatchPositions title_matches, url_matches;
bool query_has_ancestor_matches = false;
for (const auto& query_node : query_nodes) {
const bool has_title_matches =
query_node->HasMatchIn(title_words, &title_matches);
const bool has_url_matches =
query_node->HasMatchIn(url_words, &url_matches);
const bool has_ancestor_matches =
query_node->HasMatchIn(ancestor_words, false);
query_has_ancestor_matches =
query_has_ancestor_matches || has_ancestor_matches;
if (!has_title_matches && !has_url_matches && !has_ancestor_matches)
return std::nullopt;
query_parser::QueryParser::SortAndCoalesceMatchPositions(&title_matches);
query_parser::QueryParser::SortAndCoalesceMatchPositions(&url_matches);
}
TitledUrlMatch match;
if (lower_title.length() == node->GetTitledUrlNodeTitle().length()) {
// Only use title matches if the lowercase string is the same length
// as the original string, otherwise the matches are meaningless.
// TODO(mpearson): revise match positions appropriately.
match.title_match_positions.swap(title_matches);
}
// Now that we're done processing this entry, correct the offsets of the
// matches in |url_matches| so they point to offsets in the original URL
// spec, not the cleaned-up URL string that we used for matching.
std::vector<size_t> offsets =
TitledUrlMatch::OffsetsFromMatchPositions(url_matches);
base::OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
url_matches =
TitledUrlMatch::ReplaceOffsetsInMatchPositions(url_matches, offsets);
match.url_match_positions.swap(url_matches);
match.has_ancestor_match = query_has_ancestor_matches;
match.node = node;
return match;
}
TitledUrlIndex::TitledUrlNodeSet TitledUrlIndex::RetrieveNodesMatchingAllTerms(
const std::vector<std::u16string>& terms,
query_parser::MatchingAlgorithm matching_algorithm) const {
DCHECK(!terms.empty());
TitledUrlNodeSet matches =
RetrieveNodesMatchingTerm(terms[0], matching_algorithm);
for (size_t i = 1; i < terms.size() && !matches.empty(); ++i) {
TitledUrlNodeSet term_matches =
RetrieveNodesMatchingTerm(terms[i], matching_algorithm);
// Compute intersection between the two sets.
base::EraseIf(matches, base::IsNotIn<TitledUrlNodeSet>(term_matches));
}
return matches;
}
TitledUrlIndex::TitledUrlNodeSet TitledUrlIndex::RetrieveNodesMatchingAnyTerms(
const std::vector<std::u16string>& terms,
query_parser::MatchingAlgorithm matching_algorithm,
size_t max_nodes) const {
DCHECK(!terms.empty());
if (terms.size() == 1)
return RetrieveNodesMatchingAllTerms(terms, matching_algorithm);
// If any term does not match a path, short circuit the expensive union
// and simply resort to the `RetrieveNodesMatchingAllTerms()` intersection
// of the terms not matching any path. The results are guaranteed to be the
// same, since all terms must either title, URL, or path match; but there'll
// be much fewer nodes returned.
std::vector<std::u16string> terms_not_path;
base::ranges::copy_if(terms, std::back_inserter(terms_not_path),
[&](const std::u16string& term) {
return !DoesTermMatchPath(term, matching_algorithm);
});
if (!terms_not_path.empty())
return RetrieveNodesMatchingAllTerms(terms_not_path, matching_algorithm);
std::vector<TitledUrlNodes> matches_per_term;
bool some_term_had_empty_matches = false;
for (const std::u16string& term : terms) {
// Use `matching_algorithm`, as opposed to exact matching, to allow inputs
// like 'myFolder goog' to match a 'google.com' bookmark in a 'myFolder'
// folder.
TitledUrlNodes term_matches =
RetrieveNodesMatchingTerm(term, matching_algorithm);
if (term_matches.empty())
some_term_had_empty_matches = true;
else
matches_per_term.push_back(std::move(term_matches));
}
// Sort `matches_per_term` least frequent first. This prevents terms like
// 'https', which match a lot of nodes, from wasting `max_nodes` capacity.
base::ranges::sort(
matches_per_term,
[](size_t first, size_t second) { return first < second; },
[](const auto& matches) { return matches.size(); });
// Use an `unordered_set` to avoid potentially 1000's of linear time
// insertions into the ordered `TitledUrlNodeSet` (i.e. `flat_set`).
std::unordered_set<const TitledUrlNode*> matches;
for (const auto& term_matches : matches_per_term) {
for (const TitledUrlNode* node : term_matches) {
matches.insert(node);
if (matches.size() == max_nodes)
break;
}
if (matches.size() == max_nodes)
break;
}
// Append all nodes that match every input term. This is necessary because
// with the `max_nodes` threshold above, it's possible that `matches` is
// missing some nodes that match every input term. Since `matches_per_term[i]`
// is a superset of the intersection of `matches_per_term`s, if
// `matches_per_term[0].size() <= max_nodes`, all of `matches_per_term[0]`,
// and therefore the intersection matches`, are already in `matches`.
TitledUrlNodeSet all_term_matches;
if (!some_term_had_empty_matches && matches_per_term[0].size() > max_nodes) {
all_term_matches = matches_per_term[0];
for (size_t i = 1; i < matches_per_term.size() && !all_term_matches.empty();
++i) {
// Compute intersection between the two sets.
base::EraseIf(all_term_matches,
base::IsNotIn<TitledUrlNodeSet>(matches_per_term[i]));
}
// `all_term_matches` is the intersection of each term's node matches; the
// same as `RetrieveNodesMatchingAllTerms()`. We don't call the latter as a
// performance optimization.
DCHECK(all_term_matches ==
RetrieveNodesMatchingAllTerms(terms, matching_algorithm));
}
matches.insert(all_term_matches.begin(), all_term_matches.end());
return TitledUrlNodeSet(matches.begin(), matches.end());
}
TitledUrlIndex::TitledUrlNodes TitledUrlIndex::RetrieveNodesMatchingTerm(
const std::u16string& term,
query_parser::MatchingAlgorithm matching_algorithm) const {
Index::const_iterator i = index_.lower_bound(term);
if (i == index_.end())
return {};
if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch(
term, matching_algorithm)) {
// Term is too short for prefix match, compare using exact match.
if (i->first != term)
return {}; // No title/URL pairs with this term.
return TitledUrlNodes(i->second.begin(), i->second.end());
}
// Loop through index adding all entries that start with term to
// |prefix_matches|.
TitledUrlNodes prefix_matches;
while (i != index_.end() && IsPrefix(term, i->first)) {
prefix_matches.insert(prefix_matches.end(), i->second.begin(),
i->second.end());
++i;
}
return prefix_matches;
}
bool TitledUrlIndex::DoesTermMatchPath(
const std::u16string& term,
query_parser::MatchingAlgorithm matching_algorithm) const {
// Term is too short for prefix match, compare using exact match.
if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch(
term, matching_algorithm)) {
return path_index_.count(term) > 0;
}
// Otherwise, see if any path is prefixed by `term`.
const auto i = path_index_.lower_bound(term);
return i != path_index_.end() && IsPrefix(term, i->first);
}
// static
std::vector<std::u16string> TitledUrlIndex::ExtractQueryWords(
const std::u16string& query) {
std::vector<std::u16string> terms;
if (query.empty())
return std::vector<std::u16string>();
query_parser::QueryParser::ParseQueryWords(
base::i18n::ToLower(query), query_parser::MatchingAlgorithm::DEFAULT,
&terms);
return terms;
}
// static
std::vector<std::u16string> TitledUrlIndex::ExtractIndexTerms(
const TitledUrlNode* node) {
std::vector<std::u16string> terms;
for (const std::u16string& term :
ExtractQueryWords(Normalize(node->GetTitledUrlNodeTitle()))) {
terms.push_back(term);
}
for (const std::u16string& term : ExtractQueryWords(CleanUpUrlForMatching(
node->GetTitledUrlNodeUrl(), /*adjustments=*/nullptr))) {
terms.push_back(term);
}
return terms;
}
void TitledUrlIndex::RegisterNode(const std::u16string& term,
const TitledUrlNode* node) {
index_[term].insert(node);
}
void TitledUrlIndex::UnregisterNode(const std::u16string& term,
const TitledUrlNode* node) {
auto i = index_.find(term);
if (i == index_.end()) {
// We can get here if the node has the same term more than once. For
// example, a node with the title 'foo foo' would end up here.
return;
}
i->second.erase(node);
if (i->second.empty())
index_.erase(i);
}
} // namespace bookmarks