blob: f1de9d126fb91011bb51b6665f729d457b33e37f [file] [log] [blame]
// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_OMNIBOX_BROWSER_HISTORY_FUZZY_PROVIDER_H_
#define COMPONENTS_OMNIBOX_BROWSER_HISTORY_FUZZY_PROVIDER_H_
#include <array>
#include <memory>
#include <unordered_map>
#include <vector>
#include "base/memory/weak_ptr.h"
#include "base/scoped_observation.h"
#include "base/synchronization/waitable_event.h"
#include "base/task/cancelable_task_tracker.h"
#include "components/history/core/browser/history_service.h"
#include "components/history/core/browser/history_service_observer.h"
#include "components/history/core/browser/history_types.h"
#include "components/omnibox/browser/autocomplete_input.h"
#include "components/omnibox/browser/autocomplete_match.h"
#include "components/omnibox/browser/autocomplete_result.h"
#include "components/omnibox/browser/history_provider.h"
// This namespace encapsulates the implementation details of fuzzy matching and
// correction. It is used by the public (non-namespaced) HistoryFuzzyProvider
// below.
namespace fuzzy {
// An `Edit` represents a single character change to a string.
struct Edit {
// The kind of change to apply to text. Note, KEEP essentially means
// no edit, and will never be applied or kept as part of a `Correction`.
enum class Kind {
KEEP,
DELETE,
INSERT,
REPLACE,
TRANSPOSE,
};
Edit(Kind kind, size_t at, char16_t new_char);
// Applies this edit to given text, mutating it in place.
void ApplyTo(std::u16string& text) const;
// The edit operation, the kind of change to make to text.
Kind kind;
// Character data; relevant for REPLACE and INSERT, and also for
// TRANSPOSE (as a minor optimization, first char is stored here).
char16_t new_char;
// Text index at which to apply change.
size_t at;
};
// A `Correction` is a short list of `Edit`s required to change
// an input string that escapes the trie into a string that is contained by it.
struct Correction {
// Tolerance schedules must use a `limit` of no more than `kMaxEdits`.
constexpr static int kMaxEdits = 3;
// Creates a new correction including this one plus a given `Edit`.
Correction WithEdit(Edit edit) const;
// Applies this edit to given text, mutating it in place.
void ApplyTo(std::u16string& text) const;
// Number of edits to apply for the full correction.
size_t edit_count = 0;
// The actual edits to apply; only the first `edit_count` elements are valid.
// This is a fixed-size in-struct array instead of a vector because
// finding and building corrections is performance critical and keeping this
// struct simple on the stack is much faster than pounding on the allocator.
std::array<Edit, kMaxEdits> edits = {Edit{Edit::Kind::KEEP, 0, '_'},
Edit{Edit::Kind::KEEP, 0, '_'},
Edit{Edit::Kind::KEEP, 0, '_'}};
};
// This utility struct defines how tolerance changes across the length
// of input text being processed.
// Example: this schedule `{ .start_index = 1, .step_length = 4, .limit = 3 }`
// means the first character must match, then starting from the second
// character, one correction is tolerated per four characters, up to a maximum
// of three total corrections.
struct ToleranceSchedule {
int ToleranceAt(int index) {
if (index < start_index) {
return 0;
}
if (step_length <= 0) {
return limit;
}
return std::min(limit, 1 + (index - start_index) / step_length);
}
// Index at which tolerance is allowed to exceed zero.
int start_index = 0;
// Number of index steps between successive tolerance increases.
// When nonpositive, the `limit` value is used directly instead of stepping.
int step_length = 0;
// Regardless of index, tolerance will not exceed this limit.
// Note, `limit` must not exceed `Correction::kMaxEdits`.
int limit = 0;
};
// Nodes form a trie structure used to find potential input corrections.
struct Node {
Node();
Node(Node&& node);
Node& operator=(Node&&) = default;
~Node();
// Walk the trie, injecting nodes as necessary to build the given `text`
// starting at `text_index`. The `text_index` parameter advances as an index
// into `text` and ensures recursion is bounded.
void Insert(const std::u16string& text, size_t text_index);
// Delete nodes as necessary to remove given `text` from the trie.
void Delete(const std::u16string& text, size_t text_index);
// Delete all nodes to clear the trie.
void Clear();
// Produce corrections necessary to get `text` back on trie. Each correction
// will be of size bounded by `tolerance_schedule`, and none will have smaller
// edit distance than any other (i.e. all corrections are equally optimal).
// Returns whether input `text` starting at `from` is present in this trie.
// - true without corrections -> `text` on trie.
// - false without corrections -> cannot complete on trie within schedule.
// - true with corrections -> never happens because `text` is on trie.
// - false with corrections -> `text` off trie but corrections are on trie.
// Note: For efficiency, not all possible corrections are returned; any found
// valid corrections will preclude further more elaborate subcorrections.
bool FindCorrections(const std::u16string& text,
ToleranceSchedule tolerance_schedule,
std::vector<Correction>& corrections) const;
// Estimates dynamic memory usage.
// See base/trace_event/memory_usage_estimator.h for more info.
size_t EstimateMemoryUsage() const;
// Returns number of terminals contained within this trie (may include self).
int TerminalCount() const;
// This is used to distinguish terminal nodes in the trie (nonzero values).
int relevance = 0;
// This maintains the sum of `relevance` plus all `relevance_total` values
// contained within `next`. As long as `relevance` values are 0 or 1, this can
// be used as a count of contained terminals. When it drops to zero, the
// node may be deleted from the trie.
int relevance_total = 0;
// Note: Some C++ implementations of unordered_map support using the
// containing struct (Node) as the element type, but some do not. To avoid
// potential build issues in downstream projects that use Chromium code,
// ensure the element type is of known size (a fully declared type).
std::unordered_map<char16_t, std::unique_ptr<Node>> next;
};
} // namespace fuzzy
// This class is an autocomplete provider which provides URL results from
// history for inputs that may match inexactly.
// The mechanism that makes it "fuzzy" is a trie search that tolerates errors
// and produces corrected inputs which can then be autocompleted as normal.
// Relevance penalties are applied for corrections as errors reduce confidence.
// The trie is built from history URL text and any text that can be formed
// by tracing a path from the root is said to be "on trie" while any text
// that escapes the trie with a character not present is said to be "off trie".
// If the autocomplete input is fully on trie, no suggestions are provided
// because exact matching should already provide the best results. If the
// autocomplete input is off trie, corrections of bounded size are produced
// to get the input back on trie, and these should then be eligible for
// autocompletion.
// The data structure could contain anything helpful, including ways to mark
// terminal nodes (signaling a path as a complete string). A relevance score
// could serve this purpose and make full independent autocompletion possible,
// but efficiency is a top concern so node size is minimized, just enough to
// get inputs back on track, not to replicate the full completion and scoring
// of other autocomplete providers.
class HistoryFuzzyProvider : public HistoryProvider,
public history::HistoryServiceObserver {
public:
// Records fuzzy matching related metrics when user opens a match.
static void RecordOpenMatchMetrics(const AutocompleteResult& result,
const AutocompleteMatch& match_opened);
explicit HistoryFuzzyProvider(AutocompleteProviderClient* client);
HistoryFuzzyProvider(const HistoryFuzzyProvider&) = delete;
HistoryFuzzyProvider& operator=(const HistoryFuzzyProvider&) = delete;
// HistoryProvider:
// AutocompleteProvider. `minimal_changes` is ignored since there is no async
// completion performed.
void Start(const AutocompleteInput& input, bool minimal_changes) override;
// Estimates dynamic memory usage.
// See base/trace_event/memory_usage_estimator.h for more info.
size_t EstimateMemoryUsage() const override;
private:
~HistoryFuzzyProvider() override;
// Performs the autocomplete matching and scoring.
void DoAutocomplete();
// Add the best matches, converting them to fuzzy suggestions in the process.
// The given `penalty` is a percentage relevance penalty that will
// deduct from the relevance of each match.
// Returns the number of matches actually added.
int AddConvertedMatches(const ACMatches& matches, int penalty);
// Main thread callback to receive trie of URLs loaded from database.
void OnUrlsLoaded(fuzzy::Node node);
// history::HistoryServiceObserver:
// Adds visited URL host to trie.
void OnURLVisited(history::HistoryService* history_service,
const history::URLRow& url_row,
const history::VisitRow& new_visit) override;
// Removes deleted (or all) URLs from trie.
void OnHistoryDeletions(history::HistoryService* history_service,
const history::DeletionInfo& deletion_info) override;
// Record UMA histogram data for measuring usefulness of sub-providers.
void RecordMatchConversion(const char* name, int count);
AutocompleteInput autocomplete_input_;
// This is the trie facilitating search for input alternatives.
fuzzy::Node root_;
// This provides a thread-safe way to check that loading has completed.
// Keeping the event may not be necessary since signal check is done from
// same main thread, but having it should provide some robustness in case
// we autocomplete from another thread or while db task is running.
base::WaitableEvent urls_loaded_event_;
// Task tracker for URL data loading.
base::CancelableTaskTracker task_tracker_;
// Tracks the observed history service, for cleanup.
base::ScopedObservation<history::HistoryService,
history::HistoryServiceObserver>
history_service_observation_{this};
// This threshold determines the input length at which fuzzy suggestions
// will start being searched and generated. Shorter inputs won't be checked.
size_t min_input_length_;
// These are tunable parameters that affect the fuzzy suggestion
// relevance penalty.
int penalty_low_;
int penalty_high_;
size_t penalty_taper_length_;
// Weak pointer factory for callback binding safety.
base::WeakPtrFactory<HistoryFuzzyProvider> weak_ptr_factory_{this};
};
#endif // COMPONENTS_OMNIBOX_BROWSER_HISTORY_FUZZY_PROVIDER_H_