// components/autofill/content/renderer/html_based_username_detector.cc - chromium/src - Git at Google

 // Copyright 2017 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "components/autofill/content/renderer/html_based_username_detector.h"

 #include <algorithm>
 #include <array>
 #include <string>
 #include <utility>
 #include <vector>

 #include "base/containers/contains.h"
 #include "base/containers/flat_set.h"
 #include "base/containers/span.h"
 #include "base/i18n/case_conversion.h"
 #include "base/memory/raw_span.h"
 #include "base/strings/string_split.h"
 #include "base/strings/utf_string_conversions.h"
 #include "components/autofill/content/renderer/form_autofill_util.h"
 #include "components/autofill/content/renderer/html_based_username_detector_vocabulary.h"
 #include "components/autofill/core/common/form_data.h"
 #include "third_party/blink/public/web/web_form_element.h"

 using blink::WebFormControlElement;
 using blink::WebFormElement;
 using blink::WebInputElement;

 namespace autofill {

 namespace {

 // Characters treated as separators when an HTML attribute value is split into
 // tokens. Besides punctuation and the space character, the ASCII digits are
 // listed as well, so digits split tokens and never end up inside one.
 constexpr char16_t kDelimiters[] =
     u"$\"\'?%*@!\\/&^#:+~`;,>|<.[](){}-_ 0123456789";

 // Minimum length of a word for it not to be considered a short word. Short
 // words are not searched as substrings of the delimiter-stripped attribute
 // values, because a short word may accidentally be part of another word.
 // Instead, a short word only counts when it appears as a whole token, i.e.
 // enclosed between delimiters.
 constexpr int kMinimumWordLength = 4;

 // For each input element that can be a username, developer and user group
 // values are computed. The user group value includes what a user sees: label,
 // placeholder, aria-label (all are stored in FormFieldData.label). The
 // developer group value consists of name and id attribute values.
 // For each group the set of short tokens (tokens shorter than
 // |kMinimumWordLength|) is computed as well.
 struct UsernameFieldData {
   // Renderer id of the field this data was computed for.
   FieldRendererId renderer_id;
   // Lowercased, delimiter-stripped name and id attribute values, separated by
   // a '$' sentinel.
   std::u16string developer_value;
   // Tokens of the name and id attribute values that are shorter than
   // |kMinimumWordLength|.
   base::flat_set<std::u16string> developer_short_tokens;
   // Lowercased, delimiter-stripped label of the field.
   std::u16string user_value;
   // Tokens of the label that are shorter than |kMinimumWordLength|.
   base::flat_set<std::u16string> user_short_tokens;
 };

 // Words that the algorithm looks for are split into multiple categories based
 // on feature reliability.
 // A category may contain a latin dictionary and a non-latin dictionary. It is
 // mandatory that it has a latin one, but a non-latin one might be missing (in
 // which case |non_latin_dictionary| is an empty span).
 // "Latin" translations are the translations of the words for which the
 // original translation is similar to the romanized translation (translation of
 // the word only using ISO basic Latin alphabet).
 // "Non-latin" translations are the translations of the words that have custom,
 // country specific characters.
 struct CategoryOfWords {
   // Searched in both the user-visible and the developer values of a field.
   const base::raw_span<const std::u16string_view> latin_dictionary;
   // Searched only in the user-visible value of a field.
   const base::raw_span<const std::u16string_view> non_latin_dictionary;
 };

 // Lowercases |raw_value|, splits it on |kDelimiters| and then
 //  * appends the concatenation of all resulting tokens to
 //    |*field_data_value|. If |*field_data_value| already holds text, a '$'
 //    sentinel is appended first so that the end of the previous value and the
 //    start of the new one cannot accidentally form a word;
 //  * adds every token shorter than |kMinimumWordLength| to
 //    |*field_data_short_tokens|.
 void AppendValueAndShortTokens(
     const std::u16string& raw_value,
     std::u16string* field_data_value,
     base::flat_set<std::u16string>* field_data_short_tokens) {
   const std::u16string lowered = base::i18n::ToLower(raw_value);
   const std::vector<std::u16string_view> pieces = base::SplitStringPiece(
       lowered, kDelimiters, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

   std::u16string& joined = *field_data_value;
   if (!joined.empty()) {
     joined.push_back('$');
   }
   joined.reserve(joined.size() + lowered.size());

   // Short tokens are gathered first and inserted in bulk, because a single
   // range insertion suits |base::flat_set| better than repeated single ones.
   std::vector<std::u16string> collected_short;
   for (std::u16string_view piece : pieces) {
     joined.append(piece);
     if (piece.size() < kMinimumWordLength) {
       collected_short.emplace_back(piece);
     }
   }
   field_data_short_tokens->insert(collected_short.begin(),
                                   collected_short.end());
 }

 // Builds the |UsernameFieldData| for |field|: the developer value and short
 // tokens come from the field's name followed by its id attribute, the user
 // value and short tokens come from the field's label.
 UsernameFieldData ComputeUsernameFieldData(const FormFieldData& field) {
   UsernameFieldData result;
   result.renderer_id = field.renderer_id();

   // Developer group: name first, then id.
   std::u16string* const developer_value = &result.developer_value;
   base::flat_set<std::u16string>* const developer_tokens =
       &result.developer_short_tokens;
   AppendValueAndShortTokens(field.name(), developer_value, developer_tokens);
   AppendValueAndShortTokens(field.id_attribute(), developer_value,
                             developer_tokens);

   // User group: the label only.
   AppendValueAndShortTokens(field.label(), &result.user_value,
                             &result.user_short_tokens);
   return result;
 }

 // Appends to |*possible_usernames_data| the |UsernameFieldData| of every field
 // of |form_data| that can be a username candidate, in form order.
 //
 // A field is skipped when it has no name or when it is a password input. The
 // two conditions are independent reasons to skip, so they are combined with
 // ||: with && a named password input would wrongly stay a username candidate.
 void InferUsernameFieldData(
     const FormData& form_data,
     std::vector<UsernameFieldData>* possible_usernames_data) {
   for (const FormFieldData& field : form_data.fields()) {
     if (field.name().empty() ||
         field.form_control_type() == FormControlType::kInputPassword) {
       continue;
     }
     possible_usernames_data->push_back(ComputeUsernameFieldData(field));
   }
 }

 // Returns true if at least one word of |dictionary| matches the computed field
 // information. A word shorter than |kMinimumWordLength| matches only when it
 // is present in |short_tokens| as a whole token; any other word matches when
 // it occurs as a substring of |value|.
 bool CheckFieldWithDictionary(
     const std::u16string& value,
     const base::flat_set<std::u16string>& short_tokens,
     base::span<const std::u16string_view> dictionary) {
   return std::ranges::any_of(
       dictionary, [&value, &short_tokens](std::u16string_view word) {
         const bool is_short_word = word.length() < kMinimumWordLength;
         return is_short_word ? short_tokens.contains(word)
                              : value.find(word) != std::u16string::npos;
       });
 }

 // Check if any word from |category| is encountered in computed field
 // information (|possible_username|).
 bool ContainsWordFromCategory(const UsernameFieldData& possible_username,
                               const CategoryOfWords& category) {
   // For user value, search in latin and non-latin dictionaries, because this
   // value is user visible. For developer value, only look up in latin
   /// dictionaries.
   return CheckFieldWithDictionary(possible_username.user_value,
                                   possible_username.user_short_tokens,
                                   category.latin_dictionary) ||
          CheckFieldWithDictionary(possible_username.user_value,
                                   possible_username.user_short_tokens,
                                   category.non_latin_dictionary) ||
          CheckFieldWithDictionary(possible_username.developer_value,
                                   possible_username.developer_short_tokens,
                                   category.latin_dictionary);
 }

 // Drops from |*possible_usernames_data| every candidate whose computed values
 // contain at least one negative word; such a field is considered to
 // definitely not be a username.
 void RemoveFieldsWithNegativeWords(
     std::vector<UsernameFieldData>* possible_usernames_data) {
   static constexpr CategoryOfWords kNegativeCategory = {kNegativeLatin,
                                                         kNegativeNonLatin};

   const auto has_negative_word = [](const UsernameFieldData& candidate) {
     return ContainsWordFromCategory(candidate, kNegativeCategory);
   };
   std::erase_if(*possible_usernames_data, has_negative_word);
 }

 // Check if any word from the given category (|category|) appears in fields from
 // the form (|possible_usernames_data|). If the category words appear in more
 // than 2 fields, do nothing, because it may just be a prefix. If the words
 // appears in 1 or 2 fields, the first field is added to |username_predictions|.
 void FindWordsFromCategoryInForm(
     const std::vector<UsernameFieldData>& possible_usernames_data,
     const CategoryOfWords& category,
     std::vector<FieldRendererId>* username_predictions) {
   // Auxiliary element that contains the first field (in order of appearance in
   // the form) in which a substring is encountered.
   FieldRendererId chosen_field_renderer_id;

   size_t fields_found = 0;
   for (const UsernameFieldData& field_data : possible_usernames_data) {
     if (ContainsWordFromCategory(field_data, category)) {
       if (fields_found == 0) {
         chosen_field_renderer_id = field_data.renderer_id;
       }
       fields_found++;
     }
   }

   if (fields_found > 0 && fields_found <= 2)
     if (!base::Contains(*username_predictions, chosen_field_renderer_id))
       username_predictions->push_back(chosen_field_renderer_id);
 }

 // Computes username predictions for |form_data| and appends them to
 // |*username_predictions| in the order of decreasing reliability.
 // |username_predictions| must be non-null and empty on entry.
 //
 // Fields containing a negative word are discarded first. The remaining fields
 // are then searched once per positive category; each category contributes at
 // most one prediction.
 void FindUsernameFieldInternal(
     const FormData& form_data,
     std::vector<FieldRendererId>* username_predictions) {
   DCHECK(username_predictions);
   DCHECK(username_predictions->empty());

   // Each category pairs its latin dictionary with its non-latin counterpart.
   // Passing the latin dictionary twice would mean non-latin username words
   // are never searched in user-visible values.
   static constexpr CategoryOfWords kUsernameCategory = {kUsernameLatin,
                                                         kUsernameNonLatin};
   static constexpr CategoryOfWords kUserCategory = {kUserLatin, kUserNonLatin};
   static constexpr CategoryOfWords kTechnicalCategory = {kTechnicalWords, {}};
   static constexpr CategoryOfWords kWeakCategory = {kWeakWords, {}};
   // These categories contain words that point to username field.
   // Order of categories is vital: the detector searches for words in descending
   // order of probability to point to a username field.
   static constexpr auto kPositiveCategories = std::to_array<CategoryOfWords>(
       {kUsernameCategory, kUserCategory, kTechnicalCategory, kWeakCategory});
   std::vector<UsernameFieldData> possible_usernames_data;

   InferUsernameFieldData(form_data, &possible_usernames_data);
   RemoveFieldsWithNegativeWords(&possible_usernames_data);

   // These are the searches performed by the username detector.
   for (const CategoryOfWords& category : kPositiveCategories) {
     FindWordsFromCategoryInForm(possible_usernames_data, category,
                                 username_predictions);
   }
 }

 }  // namespace

 const std::vector<FieldRendererId>& GetPredictionsFieldBasedOnHtmlAttributes(
     const FormData& form_data,
     UsernameDetectorCache* username_detector_cache) {
   // The cache will store the object referenced in the return value, so it must
   // exist. It can be empty.
   DCHECK(username_detector_cache);

   auto [form_position, cache_miss] = username_detector_cache->emplace(
       form_data.renderer_id(), std::vector<FieldRendererId>());

   if (cache_miss) {
     std::vector<FieldRendererId> username_predictions;
     FindUsernameFieldInternal(form_data, &username_predictions);
     if (!username_predictions.empty())
       form_position->second = std::move(username_predictions);
   }
   return form_position->second;
 }

 }  // namespace autofill
	// Copyright 2017 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "components/autofill/content/renderer/html_based_username_detector.h"

	#include <algorithm>
	#include <array>
	#include <string>
	#include <utility>
	#include <vector>

	#include "base/containers/contains.h"
	#include "base/containers/flat_set.h"
	#include "base/containers/span.h"
	#include "base/i18n/case_conversion.h"
	#include "base/memory/raw_span.h"
	#include "base/strings/string_split.h"
	#include "base/strings/utf_string_conversions.h"
	#include "components/autofill/content/renderer/form_autofill_util.h"
	#include "components/autofill/content/renderer/html_based_username_detector_vocabulary.h"
	#include "components/autofill/core/common/form_data.h"
	#include "third_party/blink/public/web/web_form_element.h"

	using blink::WebFormControlElement;
	using blink::WebFormElement;
	using blink::WebInputElement;

	namespace autofill {

	namespace {

	// Characters treated as separators when an HTML attribute value is split into
	// tokens. Besides punctuation and the space character, the ASCII digits are
	// listed as well, so digits split tokens and never end up inside one.
	// The pipe is written unescaped: "\|" is not a valid C++ escape sequence.
	constexpr char16_t kDelimiters[] =
	    u"$\"\'?%*@!\\/&^#:+~`;,>|<.[](){}-_ 0123456789";

	// Minimum length of a word for it not to be considered a short word. Short
	// words are not searched as substrings of the delimiter-stripped attribute
	// values, because a short word may accidentally be part of another word.
	// Instead, a short word only counts when it appears as a whole token, i.e.
	// enclosed between delimiters.
	constexpr int kMinimumWordLength = 4;

	// For each input element that can be a username, developer and user group
	// values are computed. The user group value includes what a user sees: label,
	// placeholder, aria-label (all are stored in FormFieldData.label). The
	// developer group value consists of name and id attribute values.
	// For each group the set of short tokens (tokens shorter than
	// |kMinimumWordLength|) is computed as well.
	struct UsernameFieldData {
	// Renderer id of the field this data was computed for.
	FieldRendererId renderer_id;
	// Lowercased, delimiter-stripped name and id attribute values, separated by
	// a '$' sentinel.
	std::u16string developer_value;
	// Tokens of the name and id attribute values that are shorter than
	// |kMinimumWordLength|.
	base::flat_set<std::u16string> developer_short_tokens;
	// Lowercased, delimiter-stripped label of the field.
	std::u16string user_value;
	// Tokens of the label that are shorter than |kMinimumWordLength|.
	base::flat_set<std::u16string> user_short_tokens;
	};

	// Words that the algorithm looks for are split into multiple categories based
	// on feature reliability.
	// A category may contain a latin dictionary and a non-latin dictionary. It is
	// mandatory that it has a latin one, but a non-latin one might be missing (in
	// which case |non_latin_dictionary| is an empty span).
	// "Latin" translations are the translations of the words for which the
	// original translation is similar to the romanized translation (translation of
	// the word only using ISO basic Latin alphabet).
	// "Non-latin" translations are the translations of the words that have custom,
	// country specific characters.
	struct CategoryOfWords {
	// Searched in both the user-visible and the developer values of a field.
	const base::raw_span<const std::u16string_view> latin_dictionary;
	// Searched only in the user-visible value of a field.
	const base::raw_span<const std::u16string_view> non_latin_dictionary;
	};

	// Lowercases |raw_value|, splits it on |kDelimiters| and then
	//  * appends the concatenation of all resulting tokens to
	//    |*field_data_value|. If |*field_data_value| already holds text, a '$'
	//    sentinel is appended first so that the end of the previous value and the
	//    start of the new one cannot accidentally form a word;
	//  * adds every token shorter than |kMinimumWordLength| to
	//    |*field_data_short_tokens|.
	void AppendValueAndShortTokens(
	    const std::u16string& raw_value,
	    std::u16string* field_data_value,
	    base::flat_set<std::u16string>* field_data_short_tokens) {
	  const std::u16string lowered = base::i18n::ToLower(raw_value);
	  const std::vector<std::u16string_view> pieces = base::SplitStringPiece(
	      lowered, kDelimiters, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

	  std::u16string& joined = *field_data_value;
	  if (!joined.empty()) {
	    joined.push_back('$');
	  }
	  joined.reserve(joined.size() + lowered.size());

	  // Short tokens are gathered first and inserted in bulk, because a single
	  // range insertion suits |base::flat_set| better than repeated single ones.
	  std::vector<std::u16string> collected_short;
	  for (std::u16string_view piece : pieces) {
	    joined.append(piece);
	    if (piece.size() < kMinimumWordLength) {
	      collected_short.emplace_back(piece);
	    }
	  }
	  field_data_short_tokens->insert(collected_short.begin(),
	                                  collected_short.end());
	}

	// Builds the |UsernameFieldData| for |field|: the developer value and short
	// tokens come from the field's name followed by its id attribute, the user
	// value and short tokens come from the field's label.
	UsernameFieldData ComputeUsernameFieldData(const FormFieldData& field) {
	  UsernameFieldData result;
	  result.renderer_id = field.renderer_id();

	  // Developer group: name first, then id.
	  std::u16string* const developer_value = &result.developer_value;
	  base::flat_set<std::u16string>* const developer_tokens =
	      &result.developer_short_tokens;
	  AppendValueAndShortTokens(field.name(), developer_value, developer_tokens);
	  AppendValueAndShortTokens(field.id_attribute(), developer_value,
	                            developer_tokens);

	  // User group: the label only.
	  AppendValueAndShortTokens(field.label(), &result.user_value,
	                            &result.user_short_tokens);
	  return result;
	}

	// Appends to |*possible_usernames_data| the |UsernameFieldData| of every field
	// of |form_data| that can be a username candidate, in form order.
	//
	// A field is skipped when it has no name or when it is a password input. The
	// two conditions are independent reasons to skip, so they are combined with
	// ||: with && a named password input would wrongly stay a username candidate.
	void InferUsernameFieldData(
	    const FormData& form_data,
	    std::vector<UsernameFieldData>* possible_usernames_data) {
	  for (const FormFieldData& field : form_data.fields()) {
	    if (field.name().empty() ||
	        field.form_control_type() == FormControlType::kInputPassword) {
	      continue;
	    }
	    possible_usernames_data->push_back(ComputeUsernameFieldData(field));
	  }
	}

	// Returns true if at least one word of |dictionary| matches the computed field
	// information. A word shorter than |kMinimumWordLength| matches only when it
	// is present in |short_tokens| as a whole token; any other word matches when
	// it occurs as a substring of |value|.
	bool CheckFieldWithDictionary(
	    const std::u16string& value,
	    const base::flat_set<std::u16string>& short_tokens,
	    base::span<const std::u16string_view> dictionary) {
	  return std::ranges::any_of(
	      dictionary, [&value, &short_tokens](std::u16string_view word) {
	        const bool is_short_word = word.length() < kMinimumWordLength;
	        return is_short_word ? short_tokens.contains(word)
	                             : value.find(word) != std::u16string::npos;
	      });
	}

	// Returns true if any word of |category| is found in the computed information
	// of |possible_username|.
	// The user value is what the user sees, so it is checked against both the
	// latin and the non-latin dictionary. The developer value is only checked
	// against the latin dictionary.
	bool ContainsWordFromCategory(const UsernameFieldData& possible_username,
	                              const CategoryOfWords& category) {
	  return CheckFieldWithDictionary(possible_username.user_value,
	                                  possible_username.user_short_tokens,
	                                  category.latin_dictionary) ||
	         CheckFieldWithDictionary(possible_username.user_value,
	                                  possible_username.user_short_tokens,
	                                  category.non_latin_dictionary) ||
	         CheckFieldWithDictionary(possible_username.developer_value,
	                                  possible_username.developer_short_tokens,
	                                  category.latin_dictionary);
	}

	// Drops from |*possible_usernames_data| every candidate whose computed values
	// contain at least one negative word; such a field is considered to
	// definitely not be a username.
	void RemoveFieldsWithNegativeWords(
	    std::vector<UsernameFieldData>* possible_usernames_data) {
	  static constexpr CategoryOfWords kNegativeCategory = {kNegativeLatin,
	                                                        kNegativeNonLatin};

	  const auto has_negative_word = [](const UsernameFieldData& candidate) {
	    return ContainsWordFromCategory(candidate, kNegativeCategory);
	  };
	  std::erase_if(*possible_usernames_data, has_negative_word);
	}

	// Check if any word from the given category (\|category\|) appears in fields from
	// the form (\|possible_usernames_data\|). If the category words appear in more
	// than 2 fields, do nothing, because it may just be a prefix. If the words
	// appears in 1 or 2 fields, the first field is added to \|username_predictions\|.
	void FindWordsFromCategoryInForm(
	const std::vector<UsernameFieldData>& possible_usernames_data,
	const CategoryOfWords& category,
	std::vector<FieldRendererId>* username_predictions) {
	// Auxiliary element that contains the first field (in order of appearance in
	// the form) in which a substring is encountered.
	FieldRendererId chosen_field_renderer_id;

	size_t fields_found = 0;
	for (const UsernameFieldData& field_data : possible_usernames_data) {
	if (ContainsWordFromCategory(field_data, category)) {
	if (fields_found == 0) {
	chosen_field_renderer_id = field_data.renderer_id;
	}
	fields_found++;
	}
	}

	if (fields_found > 0 && fields_found <= 2)
	if (!base::Contains(*username_predictions, chosen_field_renderer_id))
	username_predictions->push_back(chosen_field_renderer_id);
	}

	// Computes username predictions for |form_data| and appends them to
	// |*username_predictions| in the order of decreasing reliability.
	// |username_predictions| must be non-null and empty on entry.
	//
	// Fields containing a negative word are discarded first. The remaining fields
	// are then searched once per positive category; each category contributes at
	// most one prediction.
	void FindUsernameFieldInternal(
	    const FormData& form_data,
	    std::vector<FieldRendererId>* username_predictions) {
	  DCHECK(username_predictions);
	  DCHECK(username_predictions->empty());

	  // Each category pairs its latin dictionary with its non-latin counterpart.
	  // Passing the latin dictionary twice would mean non-latin username words
	  // are never searched in user-visible values.
	  static constexpr CategoryOfWords kUsernameCategory = {kUsernameLatin,
	                                                        kUsernameNonLatin};
	  static constexpr CategoryOfWords kUserCategory = {kUserLatin, kUserNonLatin};
	  static constexpr CategoryOfWords kTechnicalCategory = {kTechnicalWords, {}};
	  static constexpr CategoryOfWords kWeakCategory = {kWeakWords, {}};
	  // These categories contain words that point to username field.
	  // Order of categories is vital: the detector searches for words in descending
	  // order of probability to point to a username field.
	  static constexpr auto kPositiveCategories = std::to_array<CategoryOfWords>(
	      {kUsernameCategory, kUserCategory, kTechnicalCategory, kWeakCategory});
	  std::vector<UsernameFieldData> possible_usernames_data;

	  InferUsernameFieldData(form_data, &possible_usernames_data);
	  RemoveFieldsWithNegativeWords(&possible_usernames_data);

	  // These are the searches performed by the username detector.
	  for (const CategoryOfWords& category : kPositiveCategories) {
	    FindWordsFromCategoryInForm(possible_usernames_data, category,
	                                username_predictions);
	  }
	}

	} // namespace

	const std::vector<FieldRendererId>& GetPredictionsFieldBasedOnHtmlAttributes(
	const FormData& form_data,
	UsernameDetectorCache* username_detector_cache) {
	// The cache will store the object referenced in the return value, so it must
	// exist. It can be empty.
	DCHECK(username_detector_cache);

	auto [form_position, cache_miss] = username_detector_cache->emplace(
	form_data.renderer_id(), std::vector<FieldRendererId>());

	if (cache_miss) {
	std::vector<FieldRendererId> username_predictions;
	FindUsernameFieldInternal(form_data, &username_predictions);
	if (!username_predictions.empty())
	form_position->second = std::move(username_predictions);
	}
	return form_position->second;
	}

	} // namespace autofill