| // Copyright (c) 2022 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "components/omnibox/browser/history_fuzzy_provider.h" |
| |
| #include <algorithm> |
| #include <climits> |
| #include <cstdint> |
| #include <functional> |
| #include <memory> |
| #include <ostream> |
| #include <queue> |
| #include <string> |
| #include <unordered_map> |
| #include <utility> |
| #include <vector> |
| |
| #include "base/check.h" |
| #include "base/memory/raw_ptr.h" |
| #include "base/memory/scoped_refptr.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "base/trace_event/memory_usage_estimator.h" |
| #include "base/trace_event/trace_event.h" |
| #include "components/history/core/browser/history_database.h" |
| #include "components/history/core/browser/history_db_task.h" |
| #include "components/history/core/browser/history_service.h" |
| #include "components/history/core/browser/url_database.h" |
| #include "components/omnibox/browser/autocomplete_match_classification.h" |
| #include "components/omnibox/browser/autocomplete_match_type.h" |
| #include "components/omnibox/browser/autocomplete_provider_client.h" |
| #include "components/omnibox/browser/autocomplete_result.h" |
| #include "components/omnibox/browser/bookmark_provider.h" |
| #include "components/omnibox/browser/history_quick_provider.h" |
| #include "components/search_engines/omnibox_focus_type.h" |
| #include "components/url_formatter/elide_url.h" |
| #include "url/gurl.h" |
| |
| namespace { |
| |
| // This utility function reduces a URL to the part of its hostname most |
| // likely to be meaningful for matching: the domain, i.e. the URL's TLD+1. |
| // May return an empty string if the given URL is not a good candidate for |
| // meaningful domain name matching. |
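| // For example, a URL like https://www.example.co.uk/some/path would reduce |
| // to roughly u"example.co.uk" (illustrative; the exact result comes from |
| // url_formatter::SplitHost). |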
| std::u16string UrlDomainReduction(const GURL& url) { |
| std::u16string url_host; |
| std::u16string url_domain; |
| url_formatter::SplitHost(url, &url_host, &url_domain, nullptr); |
| return url_domain; |
| } |
| |
| // This utility function prepares input text for fuzzy matching, or returns |
| // an empty string in cases unlikely to be worth a fuzzy matching search. |
| // Note, this is intended to be a fast way to improve matching and eliminate |
| // likely-unfruitful searches. It could make use of `SplitHost` as above, or |
| // `url_formatter::FormatUrlForDisplayOmitSchemePathAndTrivialSubdomains`, |
| // which uses `FormatUrlWithAdjustments` under the hood, but all that URL |
| // processing for input text that may not even be a URL seems like overkill, |
| // so this simple direct method is used instead. |
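| // Illustrative examples of the reduction below (with the constants given): |
| //   u"exampel.com/lo"  -> u"exampel.com"  (short path stripped) |
| //   u"www.exampel"     -> u"exampel"      (early dot treated as subdomain) |
| //   u"exampel.co"      -> u"exampel.co"   (dot near the end kept; likely TLD) |
| //   u"what is a url"   -> u""             (spaces suggest a search; skip) |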
| std::u16string ReduceInputTextForMatching(const std::u16string& input) { |
| constexpr size_t kMaximumFuzzyMatchInputLength = 32; |
| constexpr size_t kPathCharacterCountToStopSearch = 6; |
| constexpr size_t kPostDotCharacterCountHintingSubdomain = 4; |
| |
| // Long inputs are not fuzzy matched; doing so could be costly, and the |
| // length of input itself is a signal that it may not have been typed but |
| // simply pasted or edited in place. |
| // TODO(orinj): Consider tracking trie depth for use as maximum here. |
| if (input.length() > kMaximumFuzzyMatchInputLength) { |
| return std::u16string(); |
| } |
| |
| // Spaces hint that the input may be a search, not a URL. |
| if (input.find(u' ') != std::u16string::npos) { |
| return std::u16string(); |
| } |
| |
| // Inputs containing anything that looks like a scheme are a hint that this |
| // is an existing URL or an edit that's likely to be handled deliberately, |
| // not a messy human input that may need fuzzy matching. |
| if (input.find(u"://") != std::u16string::npos) { |
| return std::u16string(); |
| } |
| |
| std::u16string remaining; |
| // While typing a URL, the user may typo the domain but then continue on to |
| // the path; keeping input up to the path separator keeps the window open |
| // for fuzzy matching the domain as they continue to type, but we don't want |
| // to keep it open forever (doing so could result in potentially sticky false |
| // positives). |
| size_t index = input.find(u'/'); |
| if (index != std::u16string::npos) { |
| if (index + kPathCharacterCountToStopSearch < input.length()) { |
| // The user has moved well beyond typing the domain and hasn't taken any |
| // fuzzy suggestions provided so far; the suggestions won't get better, so |
| // we can save compute and suggestion result space by stopping the search. |
| return std::u16string(); |
| } |
| remaining = input.substr(0, index); |
| } else { |
| remaining = input; |
| } |
| |
| index = remaining.find(u'.'); |
| if (index != std::u16string::npos && |
| index + kPostDotCharacterCountHintingSubdomain < remaining.length()) { |
| // A dot near the end of the input is likely the start of a TLD (".com", |
| // ".org", ".edu", etc.), so keep the input as is. A dot earlier in the |
| // string suggests the user is typing a subdomain, and only the TLD+1 is |
| // stored in the trie, so skip past the dot and match against the remaining |
| // text. This helps in common cases like typing an unnecessary "www." before |
| // the domain name. |
| remaining = remaining.substr(index + 1); |
| } |
| |
| return remaining; |
| } |
| |
| } // namespace |
| |
| namespace fuzzy { |
| |
| Correction::Correction(const Correction& other) |
| : kind(other.kind), at(other.at), new_char(other.new_char) { |
| if (other.next) { |
| next = std::make_unique<Correction>(*other.next); |
| } |
| } |
| |
| Correction::Correction(Correction&&) = default; |
| |
| Correction::Correction(Kind kind, size_t at, char16_t new_char) |
| : kind(kind), at(at), new_char(new_char) {} |
| |
| Correction::Correction(Kind kind, |
| size_t at, |
| char16_t new_char, |
| std::unique_ptr<Correction> next) |
| : kind(kind), at(at), new_char(new_char), next(std::move(next)) {} |
| |
| Correction::~Correction() = default; |
| |
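| // Illustrative example: applying a correction {INSERT, 2, 'a'} to u"exmple" |
| // yields u"example"; any chained corrections in `next` are applied after |
| // this one. |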
| void Correction::ApplyTo(std::u16string& text) const { |
| switch (kind) { |
| case Kind::DELETE: { |
| text.erase(at, 1); |
| break; |
| } |
| case Kind::INSERT: { |
| text.insert(at, 1, new_char); |
| break; |
| } |
| case Kind::REPLACE: { |
| text[at] = new_char; |
| break; |
| } |
| case Kind::KEEP: |
| default: { |
| NOTREACHED(); |
| break; |
| } |
| } |
| if (next) { |
| next->ApplyTo(text); |
| } |
| } |
| |
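| // Example: for a chain KEEP -> REPLACE, the applicable correction is a copy |
| // of the REPLACE link (and anything after it), since KEEP entries carry no |
| // edit to apply. |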
| std::unique_ptr<Correction> Correction::GetApplicableCorrection() { |
| if (kind == Kind::KEEP) { |
| // Because this function eliminates KEEP corrections as the chain is built, |
| // it doesn't need to work recursively; a single elimination is sufficient. |
| DCHECK(!next || next->kind != Kind::KEEP); |
| return next ? std::make_unique<Correction>(*next) : nullptr; |
| } else { |
| // TODO(orinj): Consider a shared ownership model or preallocated pool to |
| // eliminate lots of copy allocations. For now this is kept simple with |
| // direct ownership of the full correction chain. |
| return std::make_unique<Correction>(*this); |
| } |
| } |
| |
| // This operator implementation is for debugging. |
| std::ostream& operator<<(std::ostream& os, const Correction& correction) { |
| os << '{'; |
| switch (correction.kind) { |
| case Correction::Kind::KEEP: { |
| os << 'K'; |
| break; |
| } |
| case Correction::Kind::DELETE: { |
| os << 'D'; |
| break; |
| } |
| case Correction::Kind::INSERT: { |
| os << 'I'; |
| break; |
| } |
| case Correction::Kind::REPLACE: { |
| os << 'R'; |
| break; |
| } |
| default: { |
| NOTREACHED(); |
| break; |
| } |
| } |
| os << "," << correction.at << "," << static_cast<char>(correction.new_char) |
| << "}"; |
| return os; |
| } |
| |
| Node::Node() = default; |
| |
| Node::Node(Node&&) = default; |
| |
| Node::~Node() = default; |
| |
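| // Illustrative example: inserting u"web" and then u"wet" produces a root |
| // with a single 'w' child, whose 'e' child has both 'b' and 't' children; |
| // the node reached at the end of each inserted string gets a nonzero |
| // `relevance`. |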
| void Node::Insert(const std::u16string& text, size_t from) { |
| if (from >= text.length()) { |
| relevance = 1; |
| return; |
| } |
| std::unique_ptr<Node>& node = next[text[from]]; |
| if (!node) { |
| node = std::make_unique<Node>(); |
| } |
| node->Insert(text, from + 1); |
| } |
| |
| bool Node::Delete(const std::u16string& text, size_t from) { |
| if (from >= text.length()) { |
| // This node terminates `text`; clear its mark so it no longer counts as a |
| // stored entry. |
| relevance = 0; |
| } else { |
| auto it = next.find(text[from]); |
| if (it != next.end() && it->second->Delete(text, from + 1)) { |
| next.erase(it); |
| } |
| } |
| // Only prune nodes that neither terminate another stored entry nor lead to |
| // one; otherwise deleting a longer entry could also drop a shorter entry |
| // that shares its prefix. |
| return next.empty() && relevance == 0; |
| } |
| |
| void Node::Clear() { |
| next.clear(); |
| } |
| |
| bool Node::FindCorrections(const std::u16string& text, |
| ToleranceSchedule tolerance_schedule, |
| std::vector<Correction>& corrections) const { |
| DVLOG(1) << "FindCorrections(" << text << ", " << tolerance_schedule.limit |
| << ")"; |
| DCHECK(corrections.empty()); |
| |
| if (text.length() == 0) { |
| return true; |
| } |
| |
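| // Illustrative example: with u"example" stored in the trie, an input of |
| // u"exmple" can be resolved with a single correction equivalent to |
| // {INSERT, 2, 'a'}, producing the corrected text u"example" at edit |
| // distance 1. |
| |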
| // A utility class to track search progression. |
| struct Step { |
| // Walks through trie. |
| raw_ptr<const Node> node; |
| |
| // Edit distance. |
| int distance; |
| |
| // Advances through input text. This effectively tells how much of the |
| // input has been consumed so far, regardless of output text length. |
| size_t index; |
| |
| // Length of corrected text. This tells how long the output string will |
| // be, regardless of input text length. It is independent of `index` |
| // because corrections are not only 1:1 replacements but may involve |
| // insertions or deletions as well. |
| int length; |
| |
| // Backtracking data to enable text correction (from end of string back |
| // to beginning, i.e. correction chains are applied in reverse). |
| // TODO(orinj): This should be optimized in final algorithm; stop copying. |
| Correction correction; |
| |
| // std::priority_queue keeps the greatest element on top, so we want this |
| // operator implementation to make bad steps less than good steps. |
| // Prioritize minimum distance, with index and length to break ties. |
| // The first solutions found are therefore the best, and they are found |
| // fastest in the common case where the input is on or near the trie. |
| bool operator<(const Step& rhs) const { |
| if (distance > rhs.distance) { |
| return true; |
| } else if (distance == rhs.distance) { |
| if (index < rhs.index) { |
| return true; |
| } else if (index == rhs.index) { |
| return length < rhs.length; |
| } |
| } |
| return false; |
| } |
| }; |
| |
| std::priority_queue<Step> pq; |
| pq.push({this, 0, 0, 0, {Correction::Kind::KEEP, 0, '_'}}); |
| |
| Step best{ |
| nullptr, INT_MAX, SIZE_MAX, INT_MAX, {Correction::Kind::KEEP, 0, '_'}}; |
| int i = 0; |
| // Return all results at the minimum distance; the loop stops as soon as the |
| // next candidate's distance exceeds that of the first results found. Length |
| // is also considered to avoid producing shorter substring texts. |
| while (!pq.empty() && pq.top().distance <= best.distance) { |
| i++; |
| Step step = pq.top(); |
| pq.pop(); |
| DVLOG(1) << i << "(" << step.distance << "," << step.index << "," |
| << step.length << "," << step.correction << ")"; |
| // TODO(orinj): Enforce a tolerance schedule with index versus distance; |
| // this would allow more errors for longer inputs and prevent searching |
| // through long corrections near the start of input. |
| // Strictly greater should not be possible for this comparison. |
| if (step.index >= text.length()) { |
| if (step.distance == 0) { |
| // Ideal common case, full input on trie with no correction required. |
| // Because search is directed by priority_queue, we get here before |
| // generating any corrections (straight line to goal is shortest path). |
| DCHECK(corrections.empty()); |
| return true; |
| } |
| // Check `length` to keep longer results. Without this, we could end up |
| // with shorter substring corrections (e.g. both "was" and "wash"). |
| // It may not be necessary to do this if priority_queue keeps results |
| // optimal or returns a first best result immediately. |
| DCHECK(best.distance == INT_MAX || step.distance == best.distance); |
| if (step.distance < best.distance || step.length > best.length) { |
| DVLOG(1) << "new best by " |
| << (step.distance < best.distance ? "distance" : "length"); |
| best = std::move(step); |
| corrections.clear(); |
| // Dereference is safe because nonzero distance implies presence of |
| // nontrivial correction. |
| corrections.emplace_back(*best.correction.GetApplicableCorrection()); |
| } else { |
| // Equal distance. |
| // Strictly greater should not be possible for this comparison. |
| if (step.length >= best.length) { |
| // Dereference is safe because this is another equally |
| // distant correction, necessarily discovered after the first. |
| corrections.emplace_back(*step.correction.GetApplicableCorrection()); |
| } |
| #if DCHECK_IS_ON() |
| std::u16string corrected = text; |
| step.correction.GetApplicableCorrection()->ApplyTo(corrected); |
| DCHECK_EQ(corrected.length(), static_cast<size_t>(step.length)) |
| << corrected; |
| #endif |
| } |
| continue; |
| } |
| int tolerance = tolerance_schedule.ToleranceAt(step.index); |
| if (step.distance < tolerance) { |
| // Delete |
| pq.push({step.node, |
| step.distance + 1, |
| step.index + 1, |
| step.length, |
| {Correction::Kind::DELETE, step.index, '_', |
| step.correction.GetApplicableCorrection()}}); |
| } |
| for (const auto& entry : step.node->next) { |
| if (entry.first == text[step.index]) { |
| // Keep |
| pq.push({entry.second.get(), |
| step.distance, |
| step.index + 1, |
| step.length + 1, |
| {Correction::Kind::KEEP, step.index, '_', |
| step.correction.GetApplicableCorrection()}}); |
| } else if (step.distance < tolerance) { |
| // Insert |
| pq.push({entry.second.get(), |
| step.distance + 1, |
| step.index, |
| step.length + 1, |
| {Correction::Kind::INSERT, step.index, entry.first, |
| step.correction.GetApplicableCorrection()}}); |
| // Replace. Note, we do not replace at the same position as a previous |
| // insertion because doing so could produce unnecessary duplicates. |
| if (step.correction.kind != Correction::Kind::INSERT || |
| step.correction.at != step.index) { |
| pq.push({entry.second.get(), |
| step.distance + 1, |
| step.index + 1, |
| step.length + 1, |
| {Correction::Kind::REPLACE, step.index, entry.first, |
| step.correction.GetApplicableCorrection()}}); |
| } |
| } |
| } |
| } |
| if (!pq.empty()) { |
| DVLOG(1) << "quit early on step with distance " << pq.top().distance; |
| } |
| return false; |
| } |
| |
| void Node::Log(std::u16string built) const { |
| if (relevance > 0) { |
| DVLOG(1) << " <" << built << ">"; |
| } |
| for (const auto& entry : next) { |
| entry.second->Log(built + entry.first); |
| } |
| } |
| |
| size_t Node::EstimateMemoryUsage() const { |
| size_t res = 0; |
| res += base::trace_event::EstimateMemoryUsage(next); |
| return res; |
| } |
| |
| // This task class loads URLs considered significant according to |
| // `HistoryDatabase::InitURLEnumeratorForSignificant` but there's nothing |
| // special about that implementation; we may do something different for |
| // fuzzy matching. The goal in general is to load and keep a reasonably sized |
| // set of likely relevant host names for fast fuzzy correction. |
| class LoadSignificantUrls : public history::HistoryDBTask { |
| public: |
| using Callback = base::OnceCallback<void(Node)>; |
| |
| LoadSignificantUrls(base::WaitableEvent* event, Callback callback) |
| : wait_event_(event), callback_(std::move(callback)) { |
| DVLOG(1) << "LoadSignificantUrls ctor thread " |
| << base::PlatformThread::CurrentId(); |
| } |
| ~LoadSignificantUrls() override = default; |
| |
| bool RunOnDBThread(history::HistoryBackend* backend, |
| history::HistoryDatabase* db) override { |
| DVLOG(1) << "LoadSignificantUrls run on db thread " |
| << base::PlatformThread::CurrentId() << "; db: " << db; |
| history::URLDatabase::URLEnumerator enumerator; |
| if (db && db->InitURLEnumeratorForSignificant(&enumerator)) { |
| DVLOG(1) << "Got InMemoryDatabase"; |
| history::URLRow row; |
| while (enumerator.GetNextURL(&row)) { |
| DVLOG(1) << "url #" << row.id() << ": " << row.url().host(); |
| node_.Insert(UrlDomainReduction(row.url()), 0); |
| } |
| } else { |
| DVLOG(1) << "No significant InMemoryDatabase"; |
| } |
| return true; |
| } |
| |
| void DoneRunOnMainThread() override { |
| DVLOG(1) << "Done thread " << base::PlatformThread::CurrentId(); |
| std::move(callback_).Run(std::move(node_)); |
| wait_event_->Signal(); |
| } |
| |
| private: |
| Node node_; |
| raw_ptr<base::WaitableEvent> wait_event_; |
| Callback callback_; |
| }; |
| |
| } // namespace fuzzy |
| |
| HistoryFuzzyProvider::HistoryFuzzyProvider(AutocompleteProviderClient* client) |
| : HistoryProvider(AutocompleteProvider::TYPE_HISTORY_FUZZY, client) { |
| history_service_observation_.Observe(client->GetHistoryService()); |
| client->GetHistoryService()->ScheduleDBTask( |
| FROM_HERE, |
| std::make_unique<fuzzy::LoadSignificantUrls>( |
| &urls_loaded_event_, |
| base::BindOnce(&HistoryFuzzyProvider::OnUrlsLoaded, |
| weak_ptr_factory_.GetWeakPtr())), |
| &task_tracker_); |
| } |
| |
| void HistoryFuzzyProvider::Start(const AutocompleteInput& input, |
| bool minimal_changes) { |
| TRACE_EVENT0("omnibox", "HistoryFuzzyProvider::Start"); |
| matches_.clear(); |
| if (input.focus_type() != OmniboxFocusType::DEFAULT || |
| input.type() == metrics::OmniboxInputType::EMPTY) { |
| return; |
| } |
| |
| if (!urls_loaded_event_.IsSignaled()) { |
| return; |
| } |
| |
| autocomplete_input_ = input; |
| |
| // Fuzzy matching is intended to correct quick typos, and because it may |
| // involve a compute-intensive search, some conditions are checked to bypass |
| // this provider early. When the cursor has been moved away from the end of |
| // the input string, the user may have slowed down to edit manually. |
| if (autocomplete_input_.cursor_position() == |
| autocomplete_input_.text().length()) { |
| DoAutocomplete(); |
| } |
| } |
| |
| size_t HistoryFuzzyProvider::EstimateMemoryUsage() const { |
| size_t res = HistoryProvider::EstimateMemoryUsage(); |
| res += base::trace_event::EstimateMemoryUsage(autocomplete_input_); |
| res += base::trace_event::EstimateMemoryUsage(root_); |
| return res; |
| } |
| |
| HistoryFuzzyProvider::~HistoryFuzzyProvider() = default; |
| |
| void HistoryFuzzyProvider::DoAutocomplete() { |
| // TODO(orinj): This schedule may want some measurement and tinkering. |
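| // With the values below, roughly: no corrections are tolerated at the very |
| // start of the input, the allowed edit distance steps up as the user types |
| // more characters (one step per `step_length` characters), and it is capped |
| // at `limit`. See ToleranceSchedule::ToleranceAt for the exact rule. |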
| constexpr fuzzy::ToleranceSchedule kToleranceSchedule = { |
| .start_index = 1, |
| .step_length = 4, |
| .limit = 3, |
| }; |
| |
| const std::u16string text = |
| ReduceInputTextForMatching(autocomplete_input_.text()); |
| if (text.length() == 0) { |
| DVLOG(1) << "Skipping fuzzy for input '" << autocomplete_input_.text() |
| << "'"; |
| return; |
| } |
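| // Debug affordance: input ending in '!' is treated as a command rather |
| // than fuzzy matched. "log!" dumps the trie contents to the log; any other |
| // "text!" inserts "text" directly into the trie. |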
| if (text[text.length() - 1] == u'!') { |
| if (text == u"log!") { |
| DVLOG(1) << "Trie Log: !{"; |
| root_.Log(std::u16string()); |
| DVLOG(1) << "}!"; |
| } else { |
| root_.Insert(text.substr(0, text.length() - 1), 0); |
| } |
| } else { |
| std::vector<fuzzy::Correction> corrections; |
| DVLOG(1) << "FindCorrections: <" << text << "> ---> ?{"; |
| if (root_.FindCorrections(text, kToleranceSchedule, corrections)) { |
| DVLOG(1) << "Trie contains input; no fuzzy results needed"; |
| } |
| if (!corrections.empty()) { |
| // Use of `scoped_refptr` is required here because the destructor is private. |
| scoped_refptr<HistoryQuickProvider> history_quick_provider = |
| new HistoryQuickProvider(client()); |
| scoped_refptr<BookmarkProvider> bookmark_provider = |
| new BookmarkProvider(client()); |
| for (const auto& correction : corrections) { |
| std::u16string fixed = text; |
| correction.ApplyTo(fixed); |
| DVLOG(1) << ": " << fixed; |
| AddMatchForText(fixed); |
| |
| // Note the `cursor_position` could be changed by insert or delete |
| // corrections, but this is easy to adapt to since we only fuzzy match when |
| // the cursor is at the end of the input; just move it to the new end. |
| DCHECK_EQ(autocomplete_input_.cursor_position(), |
| autocomplete_input_.text().length()); |
| AutocompleteInput corrected_input( |
| fixed, fixed.length(), |
| autocomplete_input_.current_page_classification(), |
| client()->GetSchemeClassifier()); |
| |
| history_quick_provider->Start(corrected_input, false); |
| DCHECK(history_quick_provider->done()); |
| bookmark_provider->Start(corrected_input, false); |
| DCHECK(bookmark_provider->done()); |
| |
| AddConvertedMatches(history_quick_provider->matches()); |
| AddConvertedMatches(bookmark_provider->matches()); |
| } |
| } |
| DVLOG(1) << "}?"; |
| } |
| } |
| |
| void HistoryFuzzyProvider::AddMatchForText(std::u16string text) { |
| AutocompleteMatch match(this, 10000000, false, |
| AutocompleteMatchType::HISTORY_URL); |
| match.contents = std::move(text); |
| match.contents_class.push_back( |
| {0, AutocompleteMatch::ACMatchClassification::DIM}); |
| matches_.push_back(std::move(match)); |
| } |
| |
| void HistoryFuzzyProvider::AddConvertedMatches(const ACMatches& matches) { |
| // TODO(orinj): Optimize with move not copy; requires provider change. |
| // Consider taking only the most relevant match. |
| for (const auto& original_match : matches) { |
| DVLOG(1) << "Converted match: " << original_match.contents; |
| matches_.push_back(original_match); |
| |
| // Update match in place. |
| AutocompleteMatch& match = matches_.back(); |
| match.provider = this; |
| match.inline_autocompletion.clear(); |
| match.allowed_to_be_default_match = false; |
| // TODO(orinj): Determine suitable relevance penalty; it should |
| // likely take into account the edit distance or size of correction. |
| // Using 9/10 reasonably took a 1334 relevance match down to 1200, |
| // but was harmful to HQP suggestions: as soon as a '.' was |
| // appended, a bunch of ~800 navsuggest results overtook a better |
| // HQP result that was bumped down to ~770. Using 95/100 lets this |
| // result compete in the navsuggest range. |
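| // For example, a 1334 relevance match becomes 1334 * 95 / 100 = 1267. |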
| match.relevance = match.relevance * 95 / 100; |
| match.contents_class.clear(); |
| match.contents_class.push_back( |
| {0, AutocompleteMatch::ACMatchClassification::DIM}); |
| } |
| } |
| |
| void HistoryFuzzyProvider::OnUrlsLoaded(fuzzy::Node node) { |
| root_ = std::move(node); |
| } |
| |
| void HistoryFuzzyProvider::OnURLVisited( |
| history::HistoryService* history_service, |
| ui::PageTransition transition, |
| const history::URLRow& row, |
| base::Time visit_time) { |
| DVLOG(1) << "URL Visit: " << row.url(); |
| root_.Insert(UrlDomainReduction(row.url()), 0); |
| } |
| |
| void HistoryFuzzyProvider::OnURLsDeleted( |
| history::HistoryService* history_service, |
| const history::DeletionInfo& deletion_info) { |
| // Note, this implementation is conservative in terms of user privacy; it |
| // deletes hosts from the trie if any URL with the given host is deleted. |
| if (deletion_info.IsAllHistory()) { |
| root_.Clear(); |
| } else { |
| for (const history::URLRow& row : deletion_info.deleted_rows()) { |
| root_.Delete(UrlDomainReduction(row.url()), 0); |
| } |
| } |
| } |