blob: c585d75d8258cac20f4b8703cf081490f32596de [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/omnibox/browser/actions/omnibox_pedal_provider.h"
#include <numeric>
#include "base/containers/cxx20_erase.h"
#include "base/i18n/case_conversion.h"
#include "base/i18n/char_iterator.h"
#include "base/i18n/rtl.h"
#include "base/json/json_reader.h"
#include "base/metrics/field_trial_params.h"
#include "base/strings/string_tokenizer.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/trace_event/memory_usage_estimator.h"
#include "components/omnibox/browser/actions/omnibox_pedal.h"
#include "components/omnibox/browser/actions/omnibox_pedal_concepts.h"
#include "components/omnibox/browser/autocomplete_input.h"
#include "components/omnibox/browser/autocomplete_provider_client.h"
#include "components/omnibox/browser/omnibox_field_trial.h"
#include "components/omnibox/common/omnibox_features.h"
#include "components/omnibox/resources/grit/omnibox_pedal_synonyms.h"
#include "components/omnibox/resources/grit/omnibox_resources.h"
#include "ui/base/l10n/l10n_util.h"
#include "ui/base/resource/resource_bundle.h"
namespace {

// Tokenizer specialization operating on UTF-16 strings.
using StringTokenizer16 =
    base::StringTokenizerT<std::u16string, std::u16string::const_iterator>;

// This is a hard upper bound on the number of tokens that will be processed.
// It determines resident token sequence allocation size and limits the value
// of |max_tokens_| which may be set smaller to speed up matching.
constexpr size_t kMaximumMaxTokens = 64;

// All characters in this string get removed from text before processing.
// U+200F is a RTL marker punctuation character that seems to throw
// off some triggers in 'ar'.
constexpr char16_t kRemoveChars[] = {0x200F, 0};

}  // namespace
size_t EstimateMemoryUsage(scoped_refptr<OmniboxPedal> pedal) {
// Consider the ref-counted Pedals to be part of the provider's memory usage.
return pedal->EstimateMemoryUsage();
}
OmniboxPedalProvider::OmniboxPedalProvider(
    AutocompleteProviderClient& client,
    std::unordered_map<OmniboxPedalId, scoped_refptr<OmniboxPedal>> pedals)
    : client_(client),
      pedals_(std::move(pedals)),
      ignore_group_(false, false, 0),
      match_tokens_(kMaximumMaxTokens) {
  LoadPedalConcepts();
  // Drop any Pedal whose localized label data is incomplete. Such a Pedal
  // could never trigger anyway, and removing it keeps this frequently
  // iterated collection as small as possible.
  base::EraseIf(pedals_, [](const auto& entry) {
    const OmniboxPedal::LabelStrings& strings =
        entry.second->GetLabelStrings();
    return strings.hint.empty() || strings.suggestion_contents.empty() ||
           strings.accessibility_hint.empty() ||
           strings.accessibility_suffix.empty();
  });
}
OmniboxPedalProvider::~OmniboxPedalProvider() {}
void OmniboxPedalProvider::AddProviderInfo(ProvidersInfo* provider_info) const {
  provider_info->push_back(metrics::OmniboxEventProto_ProviderInfo());
  metrics::OmniboxEventProto_ProviderInfo& entry = provider_info->back();
  // Note: SEARCH is used here because the suggestions that Pedals attach to
  // come almost exclusively from search suggestions (they could in theory
  // attach to others if the match content were a concept match, but in
  // practice only search suggestions have the relevant text). PEDAL is not
  // used because Pedals are not themselves suggestions produced by an
  // autocomplete provider. This may change. See http://cl/327103601 for
  // context and discussion.
  entry.set_provider(metrics::OmniboxEventProto::SEARCH);
  entry.set_provider_done(true);
  if (!field_trial_triggered_ && !field_trial_triggered_in_session_)
    return;
  std::vector<uint32_t> field_trial_hashes;
  OmniboxFieldTrial::GetActiveSuggestFieldTrialHashes(&field_trial_hashes);
  for (uint32_t trial : field_trial_hashes) {
    if (field_trial_triggered_)
      entry.mutable_field_trial_triggered()->Add(trial);
    if (field_trial_triggered_in_session_)
      entry.mutable_field_trial_triggered_in_session()->Add(trial);
  }
}
void OmniboxPedalProvider::ResetSession() {
  // Clear both trigger flags so that a new session starts untriggered.
  field_trial_triggered_ = false;
  field_trial_triggered_in_session_ = false;
}
size_t OmniboxPedalProvider::EstimateMemoryUsage() const {
  // Sum the estimated footprint of each owned data structure.
  return base::trace_event::EstimateMemoryUsage(dictionary_) +
         base::trace_event::EstimateMemoryUsage(ignore_group_) +
         base::trace_event::EstimateMemoryUsage(pedals_) +
         base::trace_event::EstimateMemoryUsage(tokenize_characters_);
}
OmniboxPedal* OmniboxPedalProvider::FindPedalMatch(
    const std::u16string& match_text) {
  Tokenize(match_tokens_, match_text);
  if (match_tokens_.Size() == 0)
    return nullptr;

  // The ignore group is the only group that does full container element
  // erasure. This is necessary to prevent stop words from breaking
  // meaningful token sequences. For example, in the case
  // "make the most of chrome features", "the" must be fully removed
  // so as to not break detection of sequence "make the most of"
  // where "the" is removed by preprocessing. It becomes
  // "make most of" and would not match sequence "make _ most of"
  // where "the" was merely consumed instead of fully removed.
  const bool erased = ignore_group_.EraseMatchesIn(match_tokens_, true);
  if (erased && match_tokens_.Size() == 0) {
    // The text consisted entirely of ignored tokens, all erased. No match.
    return nullptr;
  }

  for (const auto& entry : pedals_) {
    // Restore link validity after the EraseMatchesIn call above, and prepare
    // |match_tokens_| for the next check on each iteration.
    match_tokens_.ResetLinks();
    if (entry.second->IsConceptMatch(match_tokens_))
      return entry.second.get();
  }
  return nullptr;
}
OmniboxPedal* OmniboxPedalProvider::FindReadyPedalMatch(
    const AutocompleteInput& input,
    const std::u16string& match_text) {
  OmniboxPedal* const pedal = FindPedalMatch(match_text);
  if (pedal == nullptr)
    return nullptr;
  if (!pedal->IsReadyToTrigger(input, client_))
    return nullptr;
  // A concept match that is ready to trigger counts as field trial activity.
  field_trial_triggered_ = true;
  field_trial_triggered_in_session_ = true;
  return pedal;
}
void OmniboxPedalProvider::Tokenize(OmniboxPedal::TokenSequence& out_tokens,
                                    const std::u16string& text) const {
  // TODO(orinj): We may want to use FoldCase instead of ToLower here
  // once the JSON data is eliminated (for now it's still needed for tests).
  // See base/i18n/case_conversion.h for advice about unicode case handling.
  // FoldCase is equivalent to lower-casing for ASCII/English, but provides
  // more consistent (canonical) handling in other languages as well.
  std::u16string reduced_text = base::i18n::ToLower(text);
  base::RemoveChars(reduced_text, kRemoveChars, &reduced_text);
  out_tokens.Clear();
  // Looks up |token| in the dictionary and appends its identifier to
  // |out_tokens|. Returns false (after clearing |out_tokens|) when no Pedal
  // can possibly match: either the token is not present in the token
  // dictionary, or the text has too many tokens.
  const auto take_token = [&](const std::u16string& token) {
    const auto iter = dictionary_.find(token);
    if (iter == dictionary_.end() || out_tokens.Size() >= max_tokens_) {
      out_tokens.Clear();
      return false;
    }
    out_tokens.Add(iter->second);
    return true;
  };
  if (tokenize_characters_.empty()) {
    // Tokenize on Unicode character boundaries when we have no delimiters.
    base::i18n::UTF16CharIterator char_iter(reduced_text);
    size_t left = 0;
    while (!char_iter.end()) {
      char_iter.Advance();
      const size_t right = char_iter.array_pos();
      if (right <= left)
        break;
      if (!take_token(reduced_text.substr(left, right - left)))
        break;
      left = right;
    }
  } else {
    // Delimiters will neatly divide the string into tokens.
    StringTokenizer16 tokenizer(reduced_text, tokenize_characters_);
    while (tokenizer.GetNext()) {
      if (!take_token(tokenizer.token()))
        break;
    }
  }
}
void OmniboxPedalProvider::TokenizeAndExpandDictionary(
    OmniboxPedal::TokenSequence& out_tokens,
    const std::u16string& token_sequence_string) {
  out_tokens.Clear();
  // Appends the identifier for |token| to |out_tokens|, growing |dictionary_|
  // with a fresh identifier when the token has not been seen before.
  const auto add_token = [&](std::u16string token) {
    const auto iter = dictionary_.find(token);
    if (iter == dictionary_.end()) {
      // Token not in dictionary; expand dictionary.
      out_tokens.Add(dictionary_.size());
      dictionary_.insert({std::move(token), dictionary_.size()});
    } else {
      // Token in dictionary; add existing token identifier to sequence.
      out_tokens.Add(iter->second);
    }
  };
  if (tokenize_characters_.empty()) {
    // Tokenize on Unicode character boundaries when we have no delimiters.
    base::i18n::UTF16CharIterator char_iter(token_sequence_string);
    size_t left = 0;
    while (!char_iter.end()) {
      char_iter.Advance();
      const size_t right = char_iter.array_pos();
      if (right <= left)
        break;
      if (out_tokens.Size() >= max_tokens_) {
        // Can't take another token; the source data is invalid.
        out_tokens.Clear();
        break;
      }
      const std::u16string raw_token =
          token_sequence_string.substr(left, right - left);
      add_token(base::i18n::FoldCase(raw_token));
      left = right;
    }
  } else {
    // Delimiters will neatly divide the string into tokens.
    StringTokenizer16 tokenizer(token_sequence_string, tokenize_characters_);
    while (tokenizer.GetNext()) {
      if (out_tokens.Size() >= max_tokens_) {
        // Can't take another token; the source data is invalid.
        out_tokens.Clear();
        break;
      }
      // Whitespace may be introduced by translators for readability; trim it
      // before case folding so stored tokens are canonical.
      const std::u16string raw_token = tokenizer.token();
      const base::StringPiece16 trimmed_token =
          base::TrimWhitespace(raw_token, base::TrimPositions::TRIM_ALL);
      add_token(base::i18n::FoldCase(trimmed_token));
    }
  }
}
void OmniboxPedalProvider::LoadPedalConcepts() {
  // The locale is a two-letter language code, possibly followed by a dash and
  // country code. English locales include "en", "en-US", and "en-GB" while
  // non-English locales never start with "en".
  const bool locale_is_english =
      base::i18n::GetConfiguredLocale().substr(0, 2) == "en";

  // Load concept data then parse to base::Value in order to construct Pedals.
  std::string uncompressed_data =
      ui::ResourceBundle::GetSharedInstance().LoadLocalizedResourceString(
          IDR_OMNIBOX_PEDAL_CONCEPTS);
  const auto concept_data = base::JSONReader::Read(uncompressed_data);
  DCHECK(concept_data);
  DCHECK(concept_data->is_dict());

  // The data and binary must agree on version; a mismatch is fatal because
  // the token identifiers below would be meaningless.
  const int data_version = concept_data->FindKey("data_version")->GetInt();
  CHECK_EQ(data_version, OMNIBOX_PEDAL_CONCEPTS_DATA_VERSION);

  max_tokens_ = concept_data->FindKey("max_tokens")->GetInt();
  // It is conceivable that some language may need more here, but the goal is
  // to sanity check input since it is trusted and used for vector reserve.
  DCHECK_LE(max_tokens_, kMaximumMaxTokens);

  // Languages without meaningful delimiters (e.g. character-based scripts)
  // tokenize per character; others split on spaces and hyphens.
  if (concept_data->FindKey("tokenize_each_character")->GetBool()) {
    tokenize_characters_ = u"";
  } else {
    tokenize_characters_ = u" -";
  }

  // Seed the token dictionary: token text maps to its index in the JSON list.
  const auto& dictionary =
      concept_data->FindKey("dictionary")->GetListDeprecated();
  dictionary_.reserve(dictionary.size());
  int token_id = 0;
  for (const auto& token_value : dictionary) {
    std::u16string token;
    if (token_value.is_string())
      token = base::UTF8ToUTF16(token_value.GetString());
    dictionary_.insert({token, token_id});
    ++token_id;
  }

  ignore_group_ = LoadSynonymGroupString(
      false, false, l10n_util::GetStringUTF16(IDS_OMNIBOX_PEDALS_IGNORE_GROUP));
  if (tokenize_characters_.empty()) {
    // Translation console sourced data has lots of spaces, but in practice
    // the ignore group doesn't include a single space sequence. Rather than
    // burden l10n with getting this nuance in the data precisely specified,
    // we simply hardcode to ignore spaces. This applies for all languages
    // that don't tokenize on spaces (see `tokenize_characters_` above).
    ignore_group_.AddSynonym(
        OmniboxPedal::TokenSequence(std::vector<int>({dictionary_[u" "]})));
  }
  ignore_group_.SortSynonyms();

  for (const auto& pedal_value :
       concept_data->FindKey("pedals")->GetListDeprecated()) {
    DCHECK(pedal_value.is_dict());
    const int id = pedal_value.FindIntKey("id").value();
    const auto pedal_iter = pedals_.find(static_cast<OmniboxPedalId>(id));
    if (pedal_iter == pedals_.end()) {
      // Data may exist for Pedals that are intentionally not registered; skip.
      continue;
    }
    OmniboxPedal* pedal = pedal_iter->second.get();
    const base::Value* ui_strings =
        pedal_value.FindDictKey("omnibox_ui_strings");
    if (ui_strings && pedal->GetLabelStrings().hint.empty()) {
      pedal->SetLabelStrings(*ui_strings);
    }
    const std::string* url = pedal_value.FindStringKey("url");
    // FindStringKey returns nullptr when the key is absent, so guard the
    // dereference; previously a missing "url" entry would crash here.
    if (url && !url->empty()) {
      pedal->SetNavigationUrl(GURL(*url));
    }
    // The verbatim sequence is derived from the hint label with ignored
    // tokens fully erased, matching the preprocessing of query text.
    OmniboxPedal::TokenSequence verbatim_sequence(0);
    TokenizeAndExpandDictionary(verbatim_sequence,
                                pedal->GetLabelStrings().hint);
    ignore_group_.EraseMatchesIn(verbatim_sequence, true);
    pedal->AddVerbatimSequence(std::move(verbatim_sequence));
    std::vector<OmniboxPedal::SynonymGroupSpec> specs =
        pedal->SpecifySynonymGroups(locale_is_english);
    // `specs` will be empty for any pedals not yet processed by l10n because
    // the appropriate string names won't be defined. In such cases, we fall
    // back to loading from JSON to robustly handle partial presence of data.
    if (specs.empty()) {
      for (const auto& group_value :
           pedal_value.FindKey("groups")->GetListDeprecated()) {
        // Note, group JSON values are preprocessed by the data generation tool.
        pedal->AddSynonymGroup(LoadSynonymGroupValue(group_value));
      }
    } else {
      for (const auto& spec : specs) {
        // Note, group strings are not preprocessed; they are the raw outputs
        // from translators in the localization pipeline, so we need to remove
        // ignore group sequences and validate remaining data. The groups
        // are sorted *after* erasing the ignore group to ensure no synonym
        // token sequences are made shorter than sequences later in the order,
        // which would break an invariant expected by the matching algorithm.
        OmniboxPedal::SynonymGroup group =
            LoadSynonymGroupString(spec.required, spec.match_once,
                                   l10n_util::GetStringUTF16(spec.message_id));
        group.EraseIgnoreGroup(ignore_group_);
        group.SortSynonyms();
        if (group.IsValid()) {
          pedal->AddSynonymGroup(std::move(group));
        }
      }
    }
  }
}
OmniboxPedal::SynonymGroup OmniboxPedalProvider::LoadSynonymGroupValue(
    const base::Value& group_value) const {
  DCHECK(group_value.is_dict());
  const bool required = group_value.FindKey("required")->GetBool();
  const bool single = group_value.FindKey("single")->GetBool();
  const auto& synonyms = group_value.FindKey("synonyms")->GetListDeprecated();
  OmniboxPedal::SynonymGroup group(required, single, synonyms.size());
  // Each synonym is a preprocessed list of integer token identifiers.
  for (const auto& sequence_value : synonyms) {
    DCHECK(sequence_value.is_list());
    const auto& token_id_list = sequence_value.GetListDeprecated();
    OmniboxPedal::TokenSequence sequence(token_id_list.size());
    for (const auto& token_id_value : token_id_list)
      sequence.Add(token_id_value.GetInt());
    group.AddSynonym(std::move(sequence));
  }
  return group;
}
OmniboxPedal::SynonymGroup OmniboxPedalProvider::LoadSynonymGroupString(
    bool required,
    bool match_once,
    std::u16string synonyms_csv) {
  base::RemoveChars(synonyms_csv, kRemoveChars, &synonyms_csv);
  OmniboxPedal::SynonymGroup group(required, match_once, 0);
  // Note, 'ar' language uses '،' instead of ',' to delimit synonyms and
  // in some cases the 'ja' language data uses '、' to delimit synonyms.
  StringTokenizer16 tokenizer(synonyms_csv, u",،、");
  while (tokenizer.GetNext()) {
    // In some languages where whitespace is significant but not a token
    // delimiter, trim and normalize whitespace that might be added by
    // translators for reading convenience in the translation console.
    OmniboxPedal::TokenSequence sequence(0);
    TokenizeAndExpandDictionary(
        sequence, base::CollapseWhitespace(tokenizer.token(), false));
    group.AddSynonym(std::move(sequence));
  }
  return group;
}