blob: c75b41951c03aa67baef53c1429f33d07b318f71 [file] [log] [blame]
// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/omnibox/browser/history_fuzzy_provider.h"
#include <functional>
#include <memory>
#include <ostream>
#include <queue>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "base/check.h"
#include "base/containers/contains.h"
#include "base/memory/raw_ptr.h"
#include "base/memory/scoped_refptr.h"
#include "base/metrics/histogram_functions.h"
#include "base/metrics/histogram_macros.h"
#include "base/not_fatal_until.h"
#include "base/strings/utf_string_conversions.h"
#include "base/system/sys_info.h"
#include "base/time/time.h"
#include "base/trace_event/memory_usage_estimator.h"
#include "base/trace_event/trace_event.h"
#include "build/build_config.h"
#include "components/history/core/browser/history_database.h"
#include "components/history/core/browser/history_db_task.h"
#include "components/history/core/browser/history_service.h"
#include "components/history/core/browser/url_database.h"
#include "components/omnibox/browser/autocomplete_match_classification.h"
#include "components/omnibox/browser/autocomplete_match_type.h"
#include "components/omnibox/browser/autocomplete_provider_client.h"
#include "components/omnibox/browser/bookmark_provider.h"
#include "components/omnibox/browser/history_quick_provider.h"
#include "components/omnibox/browser/omnibox_field_trial.h"
#include "components/omnibox/browser/omnibox_triggered_feature_service.h"
#include "components/url_formatter/elide_url.h"
#include "third_party/metrics_proto/omnibox_event.pb.h"
#include "third_party/metrics_proto/omnibox_focus_type.pb.h"
#include "url/gurl.h"
namespace {
// Histogram names for measuring sub-provider match conversion efficacy.
// Reminder in case other sub-providers or metrics are added: update
// the `Omnibox.HistoryFuzzy.MatchConversion` entry in histograms.xml.
const char kMetricMatchConversionHistoryQuick[] =
"Omnibox.HistoryFuzzy.MatchConversion.HistoryQuick";
const char kMetricMatchConversionBookmark[] =
"Omnibox.HistoryFuzzy.MatchConversion.Bookmark";
// Histogram name for time spent on the fuzzy search portion of provider time.
const char kMetricSearchDuration[] = "Omnibox.HistoryFuzzy.SearchDuration";
// Histogram name for whether a presented fuzzy match was the one taken by the
// user at the moment a match was opened.
const char kMetricPrecision[] = "Omnibox.HistoryFuzzy.Precision";
// This cap ensures the search trie will not grow without bound. Up to half
// the total capacity may be filled at startup from loaded significant URLs.
// The enforced limit may be further constrained by
// `MaxNumHQPUrlsIndexedAtStartup`.
constexpr int kMaxTerminalCount = 256;
// This utility function reduces a URL to the most meaningful and likely part
// of the hostname to be matched against, i.e. the domain, the URL's TLD+1.
// May return an empty string if the given URL is not a good candidate for
// meaningful domain name matching.
std::u16string UrlDomainReduction(const GURL& url) {
std::u16string url_host;
std::u16string url_domain;
url_formatter::SplitHost(url, &url_host, &url_domain, nullptr);
return url_domain;
}
// This utility function prepares input text for fuzzy matching, or returns
// an empty string in cases unlikely to be worth a fuzzy matching search.
// Note, this is intended to be a fast way to improve matching and eliminate
// likely-unfruitful searches. It could make use of `SplitHost` as above, or
// `url_formatter::FormatUrlForDisplayOmitSchemePathAndTrivialSubdomains`,
// which uses `FormatUrlWithAdjustments` under the hood, but all that URL
// processing for input text that may not even be a URL seems like overkill,
// so this simple direct method is used instead.
std::u16string ReduceInputTextForMatching(const std::u16string& input) {
constexpr size_t kMaximumFuzzyMatchInputLength = 24;
constexpr size_t kPathCharacterCountToStopSearch = 6;
constexpr size_t kPostDotCharacterCountHintingSubdomain = 4;
// Long inputs are not fuzzy matched; doing so could be costly, and the
// length of input itself is a signal that it may not have been typed but
// simply pasted or edited in place.
if (input.length() > kMaximumFuzzyMatchInputLength) {
return std::u16string();
}
// Spaces hint that the input may be a search, not a URL.
if (input.find(u' ') != std::u16string::npos) {
return std::u16string();
}
// Inputs containing anything that looks like a scheme are a hint that this
// is an existing URL or an edit that's likely to be handled deliberately,
// not a messy human input that may need fuzzy matching.
if (input.find(u"://") != std::u16string::npos) {
return std::u16string();
}
std::u16string remaining;
// While typing a URL, the user may typo the domain but then continue on to
// the path; keeping input up to the path separator keeps the window open
// for fuzzy matching the domain as they continue to type, but we don't want
// to keep it open forever (doing so could result in potentially sticky false
// positives).
size_t index = input.find(u'/');
if (index != std::u16string::npos) {
if (index + kPathCharacterCountToStopSearch < input.length()) {
// User has moved well beyond typing domain and hasn't taken any fuzzy
// suggestions provided so far, and they won't get better, so we can
// save compute and suggestion results space by stopping the search.
return std::u16string();
}
remaining = input.substr(0, index);
} else {
remaining = input;
}
index = remaining.find(u'.');
if (index != std::u16string::npos &&
index + kPostDotCharacterCountHintingSubdomain < remaining.length()) {
// Keep input with dot if near the end (within range of .com, .org, .edu).
// With a dot earlier in the string, the user might be typing a subdomain
// and we only have the TLD+1 stored in the trie, so skip the dot and match
// against the remaining text. This may be helpful in common cases like
// typing an unnecessary "www." before the domain name.
remaining = remaining.substr(index + 1);
}
return remaining;
}
} // namespace
namespace fuzzy {
Edit::Edit(Kind kind, size_t at, char16_t new_char)
: kind(kind), new_char(new_char), at(at) {}
void Edit::ApplyTo(std::u16string& text) const {
switch (kind) {
case Kind::DELETE: {
text.erase(at, 1);
break;
}
case Kind::INSERT: {
text.insert(at, 1, new_char);
break;
}
case Kind::REPLACE: {
text[at] = new_char;
break;
}
case Kind::TRANSPOSE: {
text[at] = text[at + 1];
text[at + 1] = new_char;
break;
}
case Kind::KEEP:
default: {
NOTREACHED();
}
}
}
Correction Correction::WithEdit(Edit edit) const {
DCHECK(edit_count < Correction::kMaxEdits);
Correction correction = *this;
correction.edits[edit_count] = edit;
correction.edit_count++;
return correction;
}
void Correction::ApplyTo(std::u16string& text) const {
size_t i = edit_count;
while (i > 0) {
i--;
edits[i].ApplyTo(text);
}
}
Node::Node() = default;
Node::Node(Node&&) = default;
Node::~Node() = default;
void Node::Insert(const std::u16string& text, size_t text_index) {
if (text_index >= text.length()) {
relevance_total += 1 - relevance;
relevance = 1;
return;
}
std::unique_ptr<Node>& node = next[text[text_index]];
if (!node) {
node = std::make_unique<Node>();
}
relevance_total -= node->relevance_total;
node->Insert(text, text_index + 1);
relevance_total += node->relevance_total;
}
void Node::Delete(const std::u16string& text, size_t text_index) {
if (text_index < text.length()) {
auto it = next.find(text[text_index]);
if (it != next.end()) {
Node* const node = it->second.get();
relevance_total -= node->relevance_total;
node->Delete(text, text_index + 1);
if (node->relevance_total == 0) {
next.erase(it);
} else {
relevance_total += node->relevance_total;
}
}
} else {
relevance_total -= relevance;
relevance = 0;
}
}
void Node::Clear() {
next.clear();
}
bool Node::FindCorrections(const std::u16string& text,
ToleranceSchedule tolerance_schedule,
std::vector<Correction>& corrections) const {
DCHECK(corrections.empty());
DCHECK(tolerance_schedule.limit <= Correction::kMaxEdits);
if (text.length() == 0) {
return true;
}
// A utility class to track search progression.
struct Step {
// Walks through trie.
raw_ptr<const Node> node;
// Edit distance.
int distance;
// Advances through input text. This effectively tells how much of the
// input has been consumed so far, regardless of output text length.
size_t index;
// Length of corrected text. This tells how long the output string will
// be, regardless of input text length. It is independent of `index`
// because corrections are not only 1:1 replacements but may involve
// insertions or deletions as well.
int length;
// Backtracking data to enable text correction (from end of string back
// to beginning, i.e. correction chains are applied in reverse).
Correction correction;
// std::priority_queue keeps the greatest element on top, so we want this
// operator implementation to make bad steps less than good steps.
// Prioritize minimum distance, with index and length to break ties.
// The first found solutions are best, and fastest in common cases
// near input on trie.
bool operator<(const Step& rhs) const {
if (distance > rhs.distance) {
return true;
} else if (distance == rhs.distance) {
if (index < rhs.index) {
return true;
} else if (index == rhs.index) {
return length < rhs.length;
}
}
return false;
}
};
std::priority_queue<Step> pq;
pq.push({this, 0, 0, 0, Correction()});
Step best{nullptr, INT_MAX, SIZE_MAX, INT_MAX, Correction()};
// Find and return all equally-distant results as soon as distance increases
// beyond that of first found results. Length is also considered to
// avoid producing shorter substring texts.
while (!pq.empty() && pq.top().distance <= best.distance) {
Step step = pq.top();
pq.pop();
// Strictly greater should not be possible for this comparison.
if (step.index >= text.length()) {
if (step.distance == 0) {
// Ideal common case, full input on trie with no correction required.
// Because search is directed by priority_queue, we get here before
// generating any corrections (straight line to goal is shortest path).
DCHECK(corrections.empty());
return true;
}
// Check `length` to keep longer results. Without this, we could end up
// with shorter substring corrections (e.g. both "was" and "wash").
// It may not be necessary to do this if priority_queue keeps results
// optimal or returns a first best result immediately.
DCHECK(best.distance == INT_MAX || step.distance == best.distance);
if (step.distance < best.distance || step.length > best.length) {
best = std::move(step);
corrections.clear();
// Dereference is safe because nonzero distance implies presence of
// nontrivial correction.
corrections.emplace_back(best.correction);
} else {
// Equal distance.
// Strictly greater should not be possible for this comparison.
if (step.length >= best.length) {
// Dereference is safe because this is another equally
// distant correction, necessarily discovered after the first.
corrections.emplace_back(step.correction);
}
#if DCHECK_ALWAYS_ON
std::u16string corrected = text;
step.correction.ApplyTo(corrected);
DCHECK_EQ(corrected.length(), static_cast<size_t>(step.length))
<< corrected;
#endif
}
continue;
}
int tolerance = tolerance_schedule.ToleranceAt(step.index);
if (step.distance < tolerance) {
// Delete
pq.push(
{step.node, step.distance + 1, step.index + 1, step.length,
step.correction.WithEdit({Edit::Kind::DELETE, step.index, '_'})});
}
for (const auto& entry : step.node->next) {
const char16_t step_text_char = text[step.index];
if (entry.first == step_text_char) {
// Keep
pq.push({entry.second.get(), step.distance, step.index + 1,
step.length + 1, step.correction});
} else if (step.distance < tolerance) {
// Insert
pq.push({entry.second.get(), step.distance + 1, step.index,
step.length + 1,
step.correction.WithEdit(
{Edit::Kind::INSERT, step.index, entry.first})});
// Replace. Note, we do not replace at the same position as a previous
// insertion because doing so could produce unnecessary duplicates.
const Edit& step_edit =
step.correction.edit_count > 0
? step.correction.edits[step.correction.edit_count - 1]
: Edit(Edit::Kind::KEEP, 0, '_');
if (step_edit.kind != Edit::Kind::INSERT ||
step_edit.at != step.index) {
pq.push({entry.second.get(), step.distance + 1, step.index + 1,
step.length + 1,
step.correction.WithEdit(
{Edit::Kind::REPLACE, step.index, entry.first})});
}
// Transpose. Look ahead cost can be balanced by faster
// advancement through input text resulting in shorter search.
if (text.size() > step.index + 1 &&
text[step.index + 1] == entry.first) {
const auto it = entry.second->next.find(step_text_char);
if (it != entry.second->next.end()) {
pq.push({it->second.get(), step.distance + 1, step.index + 2,
step.length + 2,
step.correction.WithEdit(
{Edit::Kind::TRANSPOSE, step.index, step_text_char})});
}
}
}
}
}
return false;
}
size_t Node::EstimateMemoryUsage() const {
size_t res = 0;
res += base::trace_event::EstimateMemoryUsage(next);
return res;
}
int Node::TerminalCount() const {
// This works as long as `relevance` values mark terminals with 1 and
// non-terminals with 0; see `Insert()`.
return relevance_total;
}
// This task class loads URLs considered significant according to
// `HistoryDatabase::InitURLEnumeratorForSignificant` but there's nothing
// special about that implementation; we may do something different for
// fuzzy matching. The goal in general is to load and keep a reasonably sized
// set of likely relevant host names for fast fuzzy correction.
class LoadSignificantUrls : public history::HistoryDBTask {
public:
using Callback = base::OnceCallback<void(Node)>;
explicit LoadSignificantUrls(Callback callback)
: callback_(std::move(callback)) {}
~LoadSignificantUrls() override = default;
bool RunOnDBThread(history::HistoryBackend* backend,
history::HistoryDatabase* db) override {
history::URLDatabase::URLEnumerator enumerator;
if (db && db->InitURLEnumeratorForSignificant(&enumerator)) {
history::URLRow row;
// The `MaxNumHQPUrlsIndexedAtStartup` dependency here is to ensure
// that we keep a lower cap for mobile; it's much higher on desktop.
// Note the divide, which ensures at least half the capacity will be kept
// for later visited domains. `GetNextUrl` takes the most significant
// URLs from the database (enumerator order) and duplicates won't count.
const int max_terminal_count =
std::min(OmniboxFieldTrial::MaxNumHQPUrlsIndexedAtStartup(),
kMaxTerminalCount) /
2;
while (enumerator.GetNextURL(&row) &&
node_.TerminalCount() < max_terminal_count) {
node_.Insert(UrlDomainReduction(row.url()), 0);
}
}
return true;
}
void DoneRunOnMainThread() override {
std::move(callback_).Run(std::move(node_));
}
private:
Node node_;
Callback callback_;
};
} // namespace fuzzy
// static
void HistoryFuzzyProvider::RecordOpenMatchMetrics(
const AutocompleteResult& result,
const AutocompleteMatch& match_opened) {
if (base::ranges::any_of(result, [](const AutocompleteMatch& match) {
return match.provider && match.provider->type() ==
AutocompleteProvider::TYPE_HISTORY_FUZZY;
})) {
const bool opened_fuzzy_match = match_opened.provider->type() ==
AutocompleteProvider::TYPE_HISTORY_FUZZY;
UMA_HISTOGRAM_BOOLEAN(kMetricPrecision, opened_fuzzy_match);
}
}
HistoryFuzzyProvider::HistoryFuzzyProvider(AutocompleteProviderClient* client)
: HistoryProvider(AutocompleteProvider::TYPE_HISTORY_FUZZY, client) {
// Set up tunable parameters. These can be used to affect fuzzy matching
// behavior and performance. Note, we use different `min_input_length_` values
// depending on desktop versus mobile platforms, determined by experiment.
#if BUILDFLAG(IS_ANDROID) || BUILDFLAG(IS_IOS)
min_input_length_ = 5;
#else
min_input_length_ = 3;
#endif
// These initial penalty values produce good results for most inputs:
// Using 10% reasonably took a 1334 relevance match down to 1200,
// but was harmful to HQP suggestions: as soon as a '.' was
// appended, a bunch of ~800 navsuggest results overtook a better
// HQP result that was bumped down to ~770. Using 5% lets this
// result compete in the navsuggest range.
penalty_low_ = 5;
penalty_high_ = 5;
// The default value of zero means "no taper", and only the lowest penalty
// will be applied.
penalty_taper_length_ = 0;
// In tests, history service is null and doesn't need to be observed.
if (client->GetHistoryService()) {
history_service_observation_.Observe(client->GetHistoryService());
client->GetHistoryService()->ScheduleDBTask(
FROM_HERE,
std::make_unique<fuzzy::LoadSignificantUrls>(
base::BindOnce(&HistoryFuzzyProvider::OnUrlsLoaded,
weak_ptr_factory_.GetWeakPtr())),
&task_tracker_);
}
}
void HistoryFuzzyProvider::Start(const AutocompleteInput& input,
bool minimal_changes) {
TRACE_EVENT0("omnibox", "HistoryFuzzyProvider::Start");
matches_.clear();
if (input.IsZeroSuggest() ||
input.type() == metrics::OmniboxInputType::EMPTY) {
return;
}
// Note this will always return early when bypassing for low-end devices;
// see comment in constructor.
if (!urls_loaded_event_.IsSignaled()) {
return;
}
autocomplete_input_ = input;
// Fuzzy matching intends to correct quick typos, and because it may involve
// a compute intensive search, some conditions are checked to bypass this
// provider early. When the cursor is moved from the end of input string,
// user may have slowed down to edit manually.
if (autocomplete_input_.cursor_position() ==
autocomplete_input_.text().length()) {
DoAutocomplete();
}
}
size_t HistoryFuzzyProvider::EstimateMemoryUsage() const {
size_t res = HistoryProvider::EstimateMemoryUsage();
res += base::trace_event::EstimateMemoryUsage(autocomplete_input_);
res += base::trace_event::EstimateMemoryUsage(root_);
return res;
}
HistoryFuzzyProvider::~HistoryFuzzyProvider() = default;
void HistoryFuzzyProvider::DoAutocomplete() {
constexpr fuzzy::ToleranceSchedule kToleranceSchedule = {
.start_index = 2,
.step_length = 4,
.limit = 3,
};
const std::u16string& text =
ReduceInputTextForMatching(autocomplete_input_.text());
const size_t input_length = text.length();
// Note: We can always return if `input_length` is zero, but
// `min_input_length_` is an experimental parameter for more control.
// So the second condition can be cleaned up if not needed, but
// the first condition should be kept regardless.
if (input_length == 0) {
return;
}
if (input_length < min_input_length_) {
return;
}
std::vector<fuzzy::Correction> corrections;
const base::TimeTicks time_start = base::TimeTicks::Now();
root_.FindCorrections(text, kToleranceSchedule, corrections);
const base::TimeTicks time_end = base::TimeTicks::Now();
UMA_HISTOGRAM_TIMES(kMetricSearchDuration, time_end - time_start);
if (!corrections.empty()) {
// Relevance ranges are nuanced enough that this should be kept reasonably
// simple, but the experience of the feature is sensitive to the penalty so
// we support a range from highest penalty on short inputs to lowest penalty
// on longer inputs, with a linear taper in between.
int penalty = penalty_low_;
// Compute additional penalty for very short inputs.
if (penalty_taper_length_ > 0) {
DCHECK_GE(input_length, min_input_length_);
const size_t extra_length = input_length - min_input_length_;
if (extra_length <= penalty_taper_length_) {
DCHECK_GE(penalty_high_, penalty_low_);
penalty += ((penalty_taper_length_ - extra_length) *
(penalty_high_ - penalty_low_)) /
penalty_taper_length_;
}
}
// Use of `scoped_refptr` is required here because destructor is private.
scoped_refptr<HistoryQuickProvider> history_quick_provider =
new HistoryQuickProvider(client());
scoped_refptr<BookmarkProvider> bookmark_provider =
new BookmarkProvider(client());
int count_history_quick = 0;
int count_bookmark = 0;
for (const auto& correction : corrections) {
std::u16string fixed = text;
correction.ApplyTo(fixed);
// Note the `cursor_position` could be changed by insert or delete
// corrections, but this is easy to adapt since we only fuzzy
// match when cursor is at end of input; just move to new end.
DCHECK_EQ(autocomplete_input_.cursor_position(),
autocomplete_input_.text().length());
AutocompleteInput corrected_input(
fixed, fixed.length(),
autocomplete_input_.current_page_classification(),
client()->GetSchemeClassifier());
history_quick_provider->Start(corrected_input, false);
DCHECK(history_quick_provider->done());
bookmark_provider->Start(corrected_input, false);
DCHECK(bookmark_provider->done());
count_history_quick +=
AddConvertedMatches(history_quick_provider->matches(), penalty);
count_bookmark +=
AddConvertedMatches(bookmark_provider->matches(), penalty);
}
if (matches_.size() > provider_max_matches_) {
// When too many matches are generated, take only the most relevant
// matches and correct the counts for accurate metrics.
std::partial_sort(matches_.begin(),
matches_.begin() + provider_max_matches_,
matches_.end(), AutocompleteMatch::MoreRelevant);
for (size_t i = provider_max_matches_; i < matches_.size(); i++) {
DCHECK(matches_[i].provider.get() == history_quick_provider ||
matches_[i].provider.get() == bookmark_provider)
<< matches_[i].provider->GetName();
if (matches_[i].provider.get() == history_quick_provider) {
count_history_quick--;
} else {
count_bookmark--;
}
}
matches_.resize(provider_max_matches_);
}
for (AutocompleteMatch& match : matches_) {
match.provider = this;
}
RecordMatchConversion(kMetricMatchConversionHistoryQuick,
count_history_quick);
RecordMatchConversion(kMetricMatchConversionBookmark, count_bookmark);
}
}
int HistoryFuzzyProvider::AddConvertedMatches(const ACMatches& matches,
int penalty) {
if (matches.empty()) {
return 0;
}
// Take only the most relevant match, to give the best chance of keeping
// the penalized fuzzy match while reducing risk of possible noise.
// Note that min_element is used instead of max_element because
// `AutocompleteMatch::MoreRelevant` reverses standard sort order such that
// matches with greater relevance are considered less than matches with lesser
// relevance. For performance reasons, `CompareWithDemoteByType` is not used,
// so ranking of the final result set will be more nuanced than ranking here.
ACMatches::const_iterator it = std::min_element(
matches.begin(), matches.end(), AutocompleteMatch::MoreRelevant);
CHECK(it != matches.end(), base::NotFatalUntil::M130);
matches_.push_back(*it);
// Update match in place. Note, `match.provider` will be reassigned after
// `DoAutocomplete` because source sub-provider must be kept for metrics.
AutocompleteMatch& match = matches_.back();
// It's important that fuzzy matches do not try to become default and inline
// autocomplete because the input/match-data mismatch can cause problems
// with user interaction and omnibox text editing; see crbug/1347440.
match.allowed_to_be_default_match = false;
match.inline_autocompletion.clear();
// Apply relevance penalty; all corrections are equal and we only apply this
// to the most relevant result, so edit distance isn't needed.
DCHECK_GE(penalty, 0);
DCHECK_LE(penalty, 100);
match.relevance -= match.relevance * penalty / 100;
// Scoring signals are calculated in the history and bookmark providers using
// the corrected input. These scoring signals are inaccurate for the true
// input, so clear them to prevent the ml model assigning an
// artificially high confidence to this suggestion.
match.scoring_signals.reset();
return 1;
}
void HistoryFuzzyProvider::OnUrlsLoaded(fuzzy::Node node) {
root_ = std::move(node);
urls_loaded_event_.Signal();
}
void HistoryFuzzyProvider::OnURLVisited(
history::HistoryService* history_service,
const history::URLRow& url_row,
const history::VisitRow& new_visit) {
if (root_.TerminalCount() <
std::min(OmniboxFieldTrial::MaxNumHQPUrlsIndexedAtStartup(),
kMaxTerminalCount)) {
root_.Insert(UrlDomainReduction(url_row.url()), 0);
}
}
void HistoryFuzzyProvider::OnHistoryDeletions(
history::HistoryService* history_service,
const history::DeletionInfo& deletion_info) {
// Note, this implementation is conservative in terms of user privacy; it
// deletes hosts from the trie if any URL with the given host is deleted.
if (deletion_info.IsAllHistory()) {
root_.Clear();
} else {
for (const history::URLRow& row : deletion_info.deleted_rows()) {
root_.Delete(UrlDomainReduction(row.url()), 0);
}
}
}
void HistoryFuzzyProvider::RecordMatchConversion(const char* name, int count) {
base::UmaHistogramExactLinear(
name, count, AutocompleteResult::kMaxAutocompletePositionValue);
}