// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/omnibox/browser/actions/omnibox_pedal_provider.h"
#include <numeric>
#include <string_view>
#include <unordered_map>

#include "base/i18n/case_conversion.h"
#include "base/i18n/char_iterator.h"
#include "base/i18n/rtl.h"
#include "base/metrics/field_trial_params.h"
#include "base/strings/string_tokenizer.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/trace_event/memory_usage_estimator.h"
#include "components/omnibox/browser/actions/omnibox_pedal.h"
#include "components/omnibox/browser/actions/omnibox_pedal_concepts.h"
#include "components/omnibox/browser/autocomplete_input.h"
#include "components/omnibox/browser/autocomplete_provider_client.h"
#include "components/omnibox/browser/omnibox_field_trial.h"
#include "components/omnibox/common/omnibox_features.h"
#include "components/omnibox/resources/grit/omnibox_pedal_synonyms.h"
#include "ui/base/l10n/l10n_util.h"

namespace {

typedef base::StringTokenizerT<std::u16string, std::u16string::const_iterator>
StringTokenizer16;

// This is a hard upper bound on the number of tokens that will be processed.
// The value 61 was determined from the original body of translation data,
// but translators gradually increased the length of strings. Each time
// the limit is exceeded, the translation pipeline breaks, so 100 provides
// some cushion; but it shouldn't be unbounded as that would adversely
// affect performance. Pedals are checked on each keystroke in the omnibox.
constexpr size_t kMaxTokens = 100;

// All characters in this string get removed from text before processing.
// U+200F is the right-to-left mark (RLM), an invisible formatting character
// that seems to throw off some triggers in 'ar'.
const char16_t kRemoveChars[] = {0x200F, 0};

}  // namespace

size_t EstimateMemoryUsage(scoped_refptr<OmniboxPedal> pedal) {
// Consider the ref-counted Pedals to be part of the provider's memory usage.
return pedal->EstimateMemoryUsage();
}

OmniboxPedalProvider::OmniboxPedalProvider(
AutocompleteProviderClient& client,
std::unordered_map<OmniboxPedalId, scoped_refptr<OmniboxPedal>> pedals)
: client_(client),
pedals_(std::move(pedals)),
ignore_group_(false, false, 0),
match_tokens_(kMaxTokens) {
LoadPedalConcepts();
// Cull Pedals with incomplete data; they won't trigger if not enabled,
// but there's no need to keep them in the collection (iterated frequently).
std::erase_if(pedals_, [](const auto& it) {
const OmniboxPedal::LabelStrings& labels = it.second->GetLabelStrings();
return labels.hint.empty() || labels.suggestion_contents.empty() ||
labels.accessibility_hint.empty() ||
labels.accessibility_suffix.empty();
});
}

OmniboxPedalProvider::~OmniboxPedalProvider() = default;
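
// Sums the estimated memory usage of the provider-owned containers. The
// ref-counted Pedals are attributed to this provider via the free
// EstimateMemoryUsage() helper above.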
size_t OmniboxPedalProvider::EstimateMemoryUsage() const {
size_t total = 0;
total += base::trace_event::EstimateMemoryUsage(dictionary_);
total += base::trace_event::EstimateMemoryUsage(ignore_group_);
total += base::trace_event::EstimateMemoryUsage(pedals_);
total += base::trace_event::EstimateMemoryUsage(tokenize_characters_);
return total;
}
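
// Tokenizes `match_text` and returns the first Pedal whose concept matches
// the resulting token sequence, or nullptr if no Pedal matches.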
OmniboxPedal* OmniboxPedalProvider::FindPedalMatch(
const std::u16string& match_text) {
Tokenize(match_tokens_, match_text);
if (match_tokens_.Size() == 0) {
return nullptr;
}
// Note the ignore group is the only one that does full container
// element erasure. This is necessary to prevent stop words from
// breaking meaningful token sequences. For example, in the case
// "make the most of chrome features", "the" must be fully
// removed so as to not break detection of sequence "make the most of"
// where "the" is removed by preprocessing. It becomes
// "make most of" and would not match sequence "make _ most of"
// where "the" was merely consumed instead of fully removed.
if (ignore_group_.EraseMatchesIn(match_tokens_, true) &&
match_tokens_.Size() == 0) {
// Only ignored tokens were present, and all tokens were erased. No match.
return nullptr;
}
for (const auto& pedal : pedals_) {
// This restores link validity after the above EraseMatchesIn call and
// prepares |match_tokens_| for the next IsConceptMatch check on each
// iteration.
match_tokens_.ResetLinks();
if (pedal.second->IsConceptMatch(match_tokens_)) {
return pedal.second.get();
}
}
return nullptr;
}
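
// Returns the Pedal found by FindPedalMatch only if it is also ready to
// trigger for `input`; otherwise returns nullptr.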
OmniboxPedal* OmniboxPedalProvider::FindReadyPedalMatch(
const AutocompleteInput& input,
const std::u16string& match_text) {
OmniboxPedal* const found = FindPedalMatch(match_text);
if (found == nullptr || !found->IsReadyToTrigger(input, *client_)) {
return nullptr;
}
return found;
}
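
// Converts `text` to a sequence of token identifiers using the fixed
// `dictionary_`. If a token is missing from the dictionary, or the
// kMaxTokens limit is exceeded, the output is cleared since no Pedal
// could possibly match.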
void OmniboxPedalProvider::Tokenize(OmniboxPedal::TokenSequence& out_tokens,
const std::u16string& text) const {
// Note that FoldCase (not ToLower) is used here and elsewhere in this code.
// See base/i18n/case_conversion.h for advice about unicode case handling.
// FoldCase is equivalent to lower-casing for ASCII/English, but provides
// more consistent (canonical) handling in other languages as well.
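// For instance, FoldCase maps the German sharp s ('ß') to "ss", which plain
// lower-casing would leave unchanged (illustrative example; see ICU case
// folding for the exact rules).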
std::u16string reduced_text = base::i18n::FoldCase(text);
base::RemoveChars(reduced_text, kRemoveChars, &reduced_text);
out_tokens.Clear();
if (tokenize_characters_.empty()) {
// Tokenize on Unicode character boundaries when we have no delimiters.
base::i18n::UTF16CharIterator char_iter(reduced_text);
size_t left = 0;
while (!char_iter.end()) {
char_iter.Advance();
size_t right = char_iter.array_pos();
if (right > left) {
const auto token = reduced_text.substr(left, right - left);
const auto iter = dictionary_.find(token);
if (iter == dictionary_.end() || out_tokens.Size() >= kMaxTokens) {
// No Pedal can possibly match because we found a token not
// present in the token dictionary, or the text has too many tokens.
out_tokens.Clear();
break;
} else {
out_tokens.Add(iter->second);
}
left = right;
} else {
break;
}
}
} else {
// Delimiters will neatly divide the string into tokens.
StringTokenizer16 tokenizer(reduced_text, tokenize_characters_);
while (tokenizer.GetNext()) {
const auto iter = dictionary_.find(tokenizer.token());
if (iter == dictionary_.end() || out_tokens.Size() >= kMaxTokens) {
// No Pedal can possibly match because we found a token not
// present in the token dictionary, or the text has too many tokens.
out_tokens.Clear();
break;
} else {
out_tokens.Add(iter->second);
}
}
}
}
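
// Like Tokenize, but used while loading concept data: tokens not already in
// `dictionary_` are added with the next sequential identifier instead of
// aborting, so the returned sequence always covers the full input string.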
void OmniboxPedalProvider::TokenizeAndExpandDictionary(
OmniboxPedal::TokenSequence& out_tokens,
const std::u16string& token_sequence_string) {
out_tokens.Clear();
if (tokenize_characters_.empty()) {
// Tokenize on Unicode character boundaries when we have no delimiters.
base::i18n::UTF16CharIterator char_iter(token_sequence_string);
size_t left = 0;
while (!char_iter.end()) {
char_iter.Advance();
size_t right = char_iter.array_pos();
if (right > left) {
const std::u16string raw_token =
token_sequence_string.substr(left, right - left);
std::u16string token = base::i18n::FoldCase(raw_token);
const auto iter = dictionary_.find(token);
if (iter == dictionary_.end()) {
// Token not in dictionary; expand dictionary.
out_tokens.Add(dictionary_.size());
dictionary_.insert({std::move(token), dictionary_.size()});
} else {
// Token in dictionary; add existing token identifier to sequence.
out_tokens.Add(iter->second);
}
left = right;
} else {
break;
}
}
} else {
// Delimiters will neatly divide the string into tokens.
StringTokenizer16 tokenizer(token_sequence_string, tokenize_characters_);
while (tokenizer.GetNext()) {
std::u16string raw_token = tokenizer.token();
std::u16string_view trimmed_token =
base::TrimWhitespace(raw_token, base::TrimPositions::TRIM_ALL);
std::u16string token = base::i18n::FoldCase(trimmed_token);
const auto iter = dictionary_.find(token);
if (iter == dictionary_.end()) {
// Token not in dictionary; expand dictionary.
out_tokens.Add(dictionary_.size());
dictionary_.insert({std::move(token), dictionary_.size()});
} else {
// Token in dictionary; add existing token identifier to sequence.
out_tokens.Add(iter->second);
}
}
}
}
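
// Builds the token dictionary and per-Pedal concept data for the current
// locale: selects the tokenization mode, loads the shared ignore group, then
// attaches a verbatim hint sequence and translated synonym groups to each
// Pedal.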
void OmniboxPedalProvider::LoadPedalConcepts() {
// The locale is a two-letter language code, possibly followed by a dash and
// country code. English locales include "en", "en-US", and "en-GB" while
// non-English locales never start with "en".
const std::string locale = base::i18n::GetConfiguredLocale();
const std::string language_code = locale.substr(0, 2);
// According to the pedals localization data, only a few languages
// were set to tokenize each character, so those are checked directly here.
// Note, zh-CN was set to tokenize each character but zh-TW was not, so the
// full locale is checked for that exceptional case.
if (language_code == "ja" || (language_code == "zh" && locale != "zh-TW")) {
tokenize_characters_ = u"";
} else {
tokenize_characters_ = u" -";
}
ignore_group_ = LoadSynonymGroupString(
false, false, l10n_util::GetStringUTF16(IDS_OMNIBOX_PEDALS_IGNORE_GROUP));
if (tokenize_characters_.empty()) {
// Translation console sourced data has lots of spaces, but in practice
// the ignore group doesn't include a single space sequence. Rather than
// burden l10n with getting this nuance in the data precisely specified,
// we simply hardcode to ignore spaces. This applies for all languages
// that don't tokenize on spaces (see `tokenize_characters_` above).
ignore_group_.AddSynonym(
OmniboxPedal::TokenSequence(std::vector<int>({dictionary_[u" "]})));
}
ignore_group_.SortSynonyms();
for (auto& entry : pedals_) {
OmniboxPedal* pedal = entry.second.get();
OmniboxPedal::TokenSequence verbatim_sequence(0);
TokenizeAndExpandDictionary(verbatim_sequence,
pedal->GetLabelStrings().hint);
ignore_group_.EraseMatchesIn(verbatim_sequence, true);
pedal->AddVerbatimSequence(std::move(verbatim_sequence));
std::vector<OmniboxPedal::SynonymGroupSpec> specs =
pedal->SpecifySynonymGroups(language_code == "en");
DCHECK(!specs.empty());
for (const auto& spec : specs) {
// Note, group strings are not preprocessed; they are the raw outputs
// from translators in the localization pipeline, so we need to remove
// ignore group sequences and validate remaining data. The groups
// are sorted *after* erasing the ignore group to ensure no synonym
// token sequences are made shorter than sequences later in the order,
// which would break an invariant expected by the matching algorithm.
OmniboxPedal::SynonymGroup group =
LoadSynonymGroupString(spec.required, spec.match_once,
l10n_util::GetStringUTF16(spec.message_id));
group.EraseIgnoreGroup(ignore_group_);
group.SortSynonyms();
if (group.IsValid()) {
pedal->AddSynonymGroup(std::move(group));
}
}
}
// Give all pedals a final chance to override/mutate based on feature flags.
for (auto& entry : pedals_) {
entry.second->OnLoaded();
}
}
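
// Parses one localized synonym group string, a delimiter-separated list of
// phrases, into a SynonymGroup whose synonyms are token sequences.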
OmniboxPedal::SynonymGroup OmniboxPedalProvider::LoadSynonymGroupString(
bool required,
bool match_once,
std::u16string synonyms_csv) {
base::RemoveChars(synonyms_csv, kRemoveChars, &synonyms_csv);
OmniboxPedal::SynonymGroup group(required, match_once, 0);
// Note, 'ar' language uses '،' instead of ',' to delimit synonyms and
// in some cases the 'ja' language data uses '、' to delimit synonyms.
StringTokenizer16 tokenizer(synonyms_csv, u",،、");
while (tokenizer.GetNext()) {
OmniboxPedal::TokenSequence sequence(0);
// In some languages where whitespace is significant but not a token
// delimiter, we want to trim and normalize whitespace that might be
// added by translators for reading convenience in translation console.
TokenizeAndExpandDictionary(
sequence, base::CollapseWhitespace(tokenizer.token(), false));
// This DCHECK should only trigger for extra-long translation phrases, which
// would need to be fixed in the translation data itself for best efficiency,
// or by adjusting `kMaxTokens` if really necessary.
DCHECK_LE(sequence.Size(), kMaxTokens);
group.AddSynonym(std::move(sequence));
}
return group;
}