blob: 83506229898541e8dd071c76040db8e34239bc39 [file]
// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/autofill/core/browser/form_parsing/address_field_parser_ng.h"
#include <initializer_list>
#include <ostream>
#include <string_view>
#include <utility>
#include "base/logging.h"
#include "base/strings/string_number_conversions.h"
#include "components/autofill/core/browser/autofill_field.h"
#include "components/autofill/core/browser/data_model/addresses/autofill_i18n_api.h"
#include "components/autofill/core/browser/field_types.h"
#include "components/autofill/core/browser/form_parsing/autofill_scanner.h"
namespace autofill {
namespace {
// Specify `--vmodule=address_field_parser_ng=1` to get insights into the
// classification process. It will produce output describing the recursive
// exploration of field type assignments.
// The address line field types (lines 1, 2 and 3) as an array, used wherever
// the parser needs to iterate over them one by one.
constexpr FieldType kAddressLines[] = {ADDRESS_HOME_LINE1, ADDRESS_HOME_LINE2,
ADDRESS_HOME_LINE3};
// The same address line field types as a `FieldTypeSet`, used for bulk set
// operations (e.g. removing all address lines from another set).
constexpr FieldTypeSet kAddressLinesFieldTypeSet({ADDRESS_HOME_LINE1,
ADDRESS_HOME_LINE2,
ADDRESS_HOME_LINE3});
// Returns a copy of `params` that additionally accepts fields whose form
// control type is `control_type`. The argument is taken by value so the
// caller's MatchParams stay untouched.
MatchParams MatchParamsWithFieldType(MatchParams params,
                                     FormControlType control_type) {
  params.field_types.insert(control_type);
  return params;
}
// Returns a copy of `params` in which `excluded_attribute` is no longer among
// the attributes to match against. The argument is taken by value so the
// caller's MatchParams stay untouched.
MatchParams MatchParamsWithoutAttribute(MatchParams params,
                                        MatchAttribute excluded_attribute) {
  params.attributes.erase(excluded_attribute);
  return params;
}
// Renders `sequence` for log output as
// "<number of contained types>/<score>".
std::string SequenceToScoreString(const ClassifiedFieldSequence& sequence) {
  std::string text = base::NumberToString(sequence.contained_types.size());
  text += "/";
  text += base::NumberToString(sequence.score);
  return text;
}
} // namespace
// This class stores precalculated work for the address hierarchy of a
// specific country which should be precalculated to prevent repetitive work
// during form parsing.
class AddressFieldParserNG::FieldTypeInformation {
public:
// Builds all lookup tables for the address model of `country_code`.
explicit FieldTypeInformation(AddressCountryCode country_code);
~FieldTypeInformation();
// Returns the set of field types supported by the address hierarchy of
// the country passed in the constructor.
FieldTypeSet supported_field_types() const { return supported_field_types_; }
// Returns the set of field types that must not occur if `type` is already
// assigned to a field:
// - If a type T was assigned to a field, no ancestor or descendant of T
// should be assigned to one of the following fields in the parsing run
// because one would contain the other.
// E.g. a street name and a street address are incompatible because a
// street name is contained in the street address.
// - Each type T is incompatible with itself (we want to classify only one
// field as a type in a single parse run).
// E.g. we don't want to classify two postal code fields.
// - For synthesized field types, we also consider two types T and T2 as
// incompatible if T and T2 share any descendants.
// E.g. a landmark+street-location is incompatible to a landmark+locality
// because they share a landmark.
// - Structured address details (street name, house number, ...) are
// incompatible with address lines.
// The function must only be called on `FieldType`s that exist in
// `supported_field_types()`.
FieldTypeSet incompatible_field_types(FieldType type) const {
return incompatible_.at(type);
}
private:
// Recursively walks the address model subtree rooted at `node` and records
// every encountered field type in `supported_field_types_` and
// `descendants_and_self_`.
void InitializeFieldTypesAndDescendants(AddressComponent* node);
// Fills `incompatible_` based on `supported_field_types_` and
// `descendants_and_self_`. Must run after those two are fully populated.
void InitializeIncompatibilities();
// Result of `i18n_model_definition::IsCustomHierarchyAvailableForCountry()`
// for the country passed to the constructor. If true, structured address
// components are treated as incompatible with address lines 1, 2, 3.
const bool is_custom_hierarchy_available_for_country = false;
// All field types existing in the address model of the specified country.
FieldTypeSet supported_field_types_;
// All descendants of each field type, including the key field type itself.
base::flat_map<FieldType, FieldTypeSet> descendants_and_self_;
// All field types that are incompatible with the key field type meaning that
// those field types must not be produced by the same execution of
// `AddressFieldParserNG::Parse()` (e.g. because a ClassifiedFieldSequence
// should not contain multiple instances of the same field type or because
// address lines 1, 2, 3 should not co-exist with a landmark; see
// `incompatible_field_types()`).
base::flat_map<FieldType, FieldTypeSet> incompatible_;
// Streams the contents of `descendants_and_self_` and `incompatible_` for
// debugging; needs access to the private tables.
friend std::ostream& operator<<(std::ostream& os,
const FieldTypeInformation& field_types);
};
// Precomputes, for the address model of `country_code`, the supported field
// types, the descendants of every field type and the pairwise
// incompatibilities between field types.
AddressFieldParserNG::FieldTypeInformation::FieldTypeInformation(
    AddressCountryCode country_code)
    : is_custom_hierarchy_available_for_country(
          i18n_model_definition::IsCustomHierarchyAvailableForCountry(
              country_code)) {
  // Collect everything that is part of the country's address model.
  AddressComponentsStore address_model =
      i18n_model_definition::CreateAddressComponentModel(country_code);
  InitializeFieldTypesAndDescendants(address_model.Root());

  // Registers a field type that is not part of the address model: it becomes
  // a supported type whose descendant set contains (at least) itself.
  const auto register_extra_type = [this](FieldType type) {
    supported_field_types_.insert(type);
    descendants_and_self_[type].insert(type);
  };

  // Address lines 1, 2, 3 are derived from ADDRESS_HOME_STREET_ADDRESS and
  // therefore not nodes of the address model, so the tree walk above does not
  // see them. For classifying form fields they still need to be known, both
  // as types of their own and as descendants of the street address.
  for (FieldType address_line : kAddressLines) {
    register_extra_type(address_line);
    descendants_and_self_[ADDRESS_HOME_STREET_ADDRESS].insert(address_line);
  }

  // COMPANY_NAME does not belong to the address model either, but it is
  // classified by the AddressFieldParser.
  register_extra_type(COMPANY_NAME);
  incompatible_[COMPANY_NAME] = FieldTypeSet({COMPANY_NAME});

  // UNKNOWN_TYPE is a non-standard field type that is repurposed for internal
  // logic: it lets the parser skip fields that may show up anywhere in an
  // address form without belonging to the address hierarchy (e.g. an email
  // field). It is only used inside `AddressFieldParserNG` and never returned
  // to the caller of the parse function.
  register_extra_type(UNKNOWN_TYPE);

  // Every supported type must have a descendants entry by now.
  for (FieldType supported_type : supported_field_types_) {
    CHECK(descendants_and_self_.contains(supported_type))
        << FieldTypeToStringView(supported_type);
  }

  InitializeIncompatibilities();

  DVLOG(1) << "FieldTypeInformation for " << country_code.value();
  DVLOG(1) << *this;
}
// Defaulted destructor, defined out of line.
AddressFieldParserNG::FieldTypeInformation::~FieldTypeInformation() = default;
// Recursive descent through the address model tree of a specific country,
// starting at `node`. Every field type found on the way is added to
// `supported_field_types_`, and for each node the set of its descendants
// (including the node itself) is stored in `descendants_and_self_`.
void AddressFieldParserNG::FieldTypeInformation::
    InitializeFieldTypesAndDescendants(AddressComponent* node) {
  const FieldType node_type = node->GetStorageType();
  // A type that already has an entry was processed before; nothing to do.
  if (descendants_and_self_.contains(node_type)) {
    return;
  }
  supported_field_types_.insert(node_type);
  descendants_and_self_[node_type].insert(node_type);

  // Processes `child` first and then merges everything below `child`
  // (including `child`) into the descendants of the current node.
  const auto absorb_child = [this, node_type](AddressComponent* child) {
    InitializeFieldTypesAndDescendants(child);
    // At this point the child's entry in `descendants_and_self_` is complete.
    // The copy is intentional: chaining two operator[]() calls in a single
    // expression is undefined behavior, because each call may modify the
    // underlying map and invalidate the reference returned by the other.
    const FieldTypeSet child_descendants =
        descendants_and_self_[child->GetStorageType()];
    descendants_and_self_[node_type].insert_all(child_descendants);
  };

  for (AddressComponent* regular_child : node->Subcomponents()) {
    absorb_child(regular_child);
  }
  for (AddressComponent* synthesized_child : node->SynthesizedSubcomponents()) {
    absorb_child(synthesized_child);
  }
}
void AddressFieldParserNG::FieldTypeInformation::InitializeIncompatibilities() {
// Each field type may be assigned only once and is therefore incompatible to
// itself.
for (FieldType field_type : supported_field_types_) {
incompatible_[field_type] = FieldTypeSet();
}
// The descendants of a ADDRESS_HOME_STREET_ADDRESS in the address model are
// (except for address lines 1, 2, 3) the structured street address components
// like street name, house number, etc.
// If either kAutofillStructuredFieldsDisableAddressLines is enabled or a
// country is explicitly modeled for the i18n address hierarchy, these
// structured address components are incompatible with address lines 1, 2, 3.
// For the legacy model we allow an address line 2 to be paired with a street
// name and house number.
const bool autofill_structured_fields_disable_address_lines =
base::FeatureList::IsEnabled(
features::kAutofillStructuredFieldsDisableAddressLines) ||
is_custom_hierarchy_available_for_country;
// Because address lines 1, 2, 3 are already attached as children of
// ADDRESS_HOME_STREET_ADDRESS, they need to be removed first to get true set
// of structured address components.
FieldTypeSet structured_address_components =
descendants_and_self_[ADDRESS_HOME_STREET_ADDRESS];
structured_address_components.erase_all(kAddressLinesFieldTypeSet);
for (FieldType child_type : structured_address_components) {
if (autofill_structured_fields_disable_address_lines) {
for (FieldType address_line : kAddressLines) {
incompatible_[child_type].insert(address_line);
incompatible_[address_line].insert(child_type);
}
} else {
if (supported_field_types_.contains(ADDRESS_HOME_STREET_NAME)) {
incompatible_[ADDRESS_HOME_LINE1].insert(ADDRESS_HOME_STREET_NAME);
incompatible_[ADDRESS_HOME_STREET_NAME].insert(ADDRESS_HOME_LINE1);
}
if (supported_field_types_.contains(ADDRESS_HOME_HOUSE_NUMBER)) {
incompatible_[ADDRESS_HOME_LINE1].insert(ADDRESS_HOME_HOUSE_NUMBER);
incompatible_[ADDRESS_HOME_HOUSE_NUMBER].insert(ADDRESS_HOME_LINE1);
}
}
}
// A field type T is incompatible with its ancestors and descendants. For
// synthesized nodes it's also possible that a field type T is incompatible
// with a type T2 that is neither an ancestor nor a descendant but shares
// some descendants.
for (FieldType c1 : supported_field_types_) {
for (FieldType c2 : supported_field_types_) {
// Comparing the underlying integer values of c1 and c2 is a speed
// optimization to avoid redundant work that would happen due to symmetry.
if (std::to_underlying(c1) > std::to_underlying(c2)) {
continue;
}
// Note: Don't inline the following line because two operator[]() calls
// lead to undefined behavior. Each one may modify the underlying map and
// return invalid references.
FieldTypeSet c2_values = descendants_and_self_[c2];
if (descendants_and_self_[c1].contains_any(c2_values)) {
incompatible_[c1].insert(c2);
incompatible_[c2].insert(c1);
}
}
}
}
// Debug output of a FieldTypeInformation: first the descendants table, then
// the incompatibility table. Each table prints one line per supported field
// type in the form "<type>: <value> <value> ..."; types whose value set is
// empty are omitted.
std::ostream& operator<<(
    std::ostream& os,
    const AddressFieldParserNG::FieldTypeInformation& field_types) {
  const FieldTypeSet& supported_types = field_types.supported_field_types_;

  // Prints the non-empty rows of `table` for all supported field types.
  const auto print_table = [&os, &supported_types](const auto& table) {
    for (FieldType key : supported_types) {
      const FieldTypeSet& values = table.at(key);
      if (values.empty()) {
        continue;
      }
      os << FieldTypeToStringView(key) << ":";
      for (FieldType value : values) {
        os << " " << FieldTypeToStringView(value);
      }
      os << "\n";
    }
  };

  os << "FieldTypeInformation::descendants_and_self_:\n";
  print_table(field_types.descendants_and_self_);
  os << "FieldTypeInformation::incompatible_:\n";
  print_table(field_types.incompatible_);
  return os;
}
// Defaulted constructor and destructor of ClassifiedFieldSequence, defined
// out of line.
ClassifiedFieldSequence::ClassifiedFieldSequence() = default;
ClassifiedFieldSequence::~ClassifiedFieldSequence() = default;
// Returns true if this sequence should be preferred over `other`. The number
// of classified field types is the primary criterion (more is better); the
// score only breaks ties between sequences with the same number of types.
bool ClassifiedFieldSequence::BetterThan(
    const ClassifiedFieldSequence& other) const {
  const auto own_type_count = contained_types.size();
  const auto other_type_count = other.contained_types.size();
  return own_type_count == other_type_count
             ? score > other.score
             : own_type_count > other_type_count;
}
// Creates a parser for `client_country`. A fresh `FieldTypeInformation` is
// built for the country on every construction.
// TODO(crbug.com/328954153): This initialization of prepared work could be
// cached in a registry to prevent the repetitive creation effort.
AddressFieldParserNG::AddressFieldParserNG(AddressCountryCode client_country)
: field_types_(std::make_unique<FieldTypeInformation>(client_country)) {}
// Defaulted destructor, defined out of line.
AddressFieldParserNG::~AddressFieldParserNG() = default;
// static
// Tries to classify a sequence of address fields starting at the scanner's
// current position. On success, returns the parser holding the best
// classification and leaves the scanner one past the last classified field.
// On failure (or if the scanner is already at the end), returns nullptr and
// leaves the scanner where it was.
std::unique_ptr<FormFieldParser> AddressFieldParserNG::Parse(
    ParsingContext& context,
    AutofillScanner& scanner) {
  if (scanner.IsEnd()) {
    return nullptr;
  }

  const AutofillScanner::Position start_position = scanner.GetPosition();
  std::unique_ptr<AddressFieldParserNG> parser(new AddressFieldParserNG(
      AddressCountryCode(context.client_country.value())));

  // Wire up the state that is only valid for the duration of this parse run.
  parser->context_ = &context;
  parser->scanner_ = &scanner;
  parser->initial_field_ = &scanner.Cursor();

  DVLOG(1) << "Parse recursively starting at " << scanner.GetOffset() << " "
           << scanner.Cursor().label();
  parser->ParseRecursively();

  // Drop the per-run pointers again: no life-cycle assumptions can be made
  // about them once Parse() has returned.
  parser->context_ = nullptr;
  parser->scanner_ = nullptr;
  parser->initial_field_ = nullptr;

  // Contract of parse functions: without a viable classification the scanner
  // is handed back in its initial state ...
  if (parser->best_classification_.assignments.empty()) {
    scanner.Restore(start_position);
    return nullptr;
  }

  // ... otherwise the cursor is placed on the last classified field + 1.
  scanner.Restore(*parser->best_classification_.last_classified_field_index);
  scanner.Advance();
  return parser;
}
// Reports every (field type, field) pair of the best classification found by
// the parse run to `field_candidates`. Entries without a field are skipped.
void AddressFieldParserNG::AddClassifications(
    FieldCandidatesMap& field_candidates) const {
  for (auto [assigned_type, assigned_field] :
       best_classification_.assignments) {
    if (assigned_field) {
      // TODO(crbug.com/320965828): Support MatchInfo. The NG parser doesn't
      // track how matches are found. `kHighQualityLabel` is merely a
      // placeholder.
      AddClassification(
          FieldAndMatchInfo(assigned_field,
                            {.matched_attribute =
                                 MatchInfo::MatchAttribute::kHighQualityLabel}),
          assigned_type, kBaseAddressParserScore, field_candidates);
    }
  }
}
// Checks whether the field at the scanner's current cursor can be classified
// as `field_type` and, if so, returns the score of the matching rule. Returns
// std::nullopt if no rule matches, if a precondition on the already assigned
// types (`partial_classification_`) is not met, or if `field_type` is not
// handled by this parser.
std::optional<double> AddressFieldParserNG::FindScoreOfBestMatchingRule(
FieldType field_type) {
// Naming convention: In the following code,
// auto r =
// is used consistently as a shortcut for
// std::optional<double> result =
// to keep the code readable.
// Give the label priority over the name to avoid misclassifications when the
// name has a misleading value (e.g. in MX the input field for
// "Municipio/Delegación" is sometimes named "city" even though that should be
// mapped to a "Ciudad").
bool prefer_label = context_->client_country == GeoIpCountryCode("MX");
// Projection that restricts matching to the label (drops the name attribute).
auto MatchOnlyLabel = [](const MatchParams& p) {
return MatchParamsWithoutAttribute(p, MatchAttribute::kName);
};
// Projection that restricts matching to the name (drops the label attribute).
auto MatchOnlyName = [](const MatchParams& p) {
return MatchParamsWithoutAttribute(p, MatchAttribute::kLabel);
};
// Returns `score` if the regex pattern identified by `pattern_name` matches
// against the label or name of a field. In some countries we prefer matches
// to labels over matches to field names; in other countries we prefer matches
// to field names. If a match happens on the preferred attribute, the score is
// boosted by 0.05.
// `match_pattern_projection` is an optional additional projection that is
// passed on to the matcher together with the label/name projection.
auto Match = [&](std::string_view pattern_name, double score,
MatchParams (*match_pattern_projection)(const MatchParams&) =
nullptr) -> std::optional<double> {
// Helper function to consecutively match the regex against the label and
// the name attribute in the desired order, adding a boost in case the
// preferred attribute matches.
auto MatchAttribute = [&](bool match_label) -> std::optional<double> {
if (FieldMatchesMatchPatternRef(
*context_, scanner_->Cursor(), pattern_name,
{match_label ? MatchOnlyLabel : MatchOnlyName,
match_pattern_projection})) {
return score + (match_label == prefer_label ? 0.05 : 0.0);
}
return std::nullopt;
};
// Try the preferred attribute first and fall back to the other one.
if (prefer_label) {
auto r = MatchAttribute(/*match_label=*/true);
return r ? r : MatchAttribute(/*match_label=*/false);
} else {
auto r = MatchAttribute(/*match_label=*/false);
return r ? r : MatchAttribute(/*match_label=*/true);
}
};
// TODO(crbug.com/328954153) Consider whether it makes sense to pull the
// country specific rules out of this big switch statement.
switch (field_type) {
case UNKNOWN_TYPE:
// The following are field types that may occur interspersed in an
// address form but matches are ignored. Email fields are reported by a
// different FormFieldParser. The other fields are just ignored.
for (const char* type : {"ADDRESS_LOOKUP", "ADDRESS_NAME_IGNORED",
"EMAIL_ADDRESS", "ATTENTION_IGNORED"}) {
if (Match(type, 10.0)) {
return 10;
}
}
return std::nullopt;
case ADDRESS_HOME_STREET_ADDRESS:
// The score is a bit higher than the score of an address line 1.
// This ensures that
// score(ADDRESS_HOME_STREET_ADDRESS) > score(ADDRESS_HOME_LINE1)
// but
// score(ADDRESS_HOME_STREET_ADDRESS) < score(ADDRESS_HOME_LINE1) +
// score(ADDRESS_HOME_LINE2)
return Match("ADDRESS_LINE_1", 1.6, [](const MatchParams& p) {
return MatchParamsWithFieldType(p, FormControlType::kTextArea);
});
case ADDRESS_HOME_LINE1:
return Match("ADDRESS_LINE_1", 1.0);
case ADDRESS_HOME_LINE2:
// Address lines 2 can follow address lines 1 - and, if
// kAutofillStructuredFieldsDisableAddressLines is disabled, a street name
// and house number. If kAutofillStructuredFieldsDisableAddressLines is
// enabled, `incompatible_` will suppress a combination of street
// name/house number and address line 2.
if (partial_classification_.contained_types.contains(
ADDRESS_HOME_LINE1) ||
partial_classification_.contained_types.contains_all(
{ADDRESS_HOME_STREET_NAME, ADDRESS_HOME_HOUSE_NUMBER})) {
// If the country model does not contain support for an apartment
// number, we treat a match for the apartment number regex as an
// address line 2.
if (!field_types_->supported_field_types().contains(
ADDRESS_HOME_APT_NUM)) {
if (auto r = Match("ADDRESS_HOME_APT_NUM", 1.0)) {
return r;
}
}
return Match("ADDRESS_LINE_2", 1.0);
}
return std::nullopt;
case ADDRESS_HOME_LINE3:
// An address line 3 can only directly follow an address line 2.
if (partial_classification_.contained_types.contains_all(
{ADDRESS_HOME_LINE1, ADDRESS_HOME_LINE2}) &&
partial_classification_.assignments[ADDRESS_HOME_LINE2] ==
scanner_->Predecessor()) {
if (auto r = Match("ADDRESS_LINE_2", 1.0)) {
return r;
}
return Match("ADDRESS_LINE_EXTRA", 1.0);
}
return std::nullopt;
case ADDRESS_HOME_APT_NUM:
return Match("ADDRESS_HOME_APT_NUM", 1.0);
case ADDRESS_HOME_APT:
case ADDRESS_HOME_APT_TYPE:
case ADDRESS_HOME_HOUSE_NUMBER_AND_APT:
// ADDRESS_HOME_APT, ADDRESS_HOME_APT_TYPE and
// ADDRESS_HOME_HOUSE_NUMBER_AND_APT are currently internal nodes of the
// address hierarchy that only exist to parse and format an address. They
// don't exist as recognized field types.
return std::nullopt;
case ADDRESS_HOME_CITY:
return Match("CITY", 1.0);
case ADDRESS_HOME_STATE:
return Match("STATE", 1.0);
case ADDRESS_HOME_ZIP:
return Match("ZIP_CODE", 1.0);
// TODO(crbug.com/328954153): ZIP4
case ADDRESS_HOME_COUNTRY:
// A bit >1.0 to prefer country over state in "country/region"
if (auto r = Match("COUNTRY", 1.1)) {
return r;
}
// The occasional page (e.g. google account registration page) calls
// this a "location". However, this only makes sense for select tags, so
// a different PatternRef is used.
return Match("COUNTRY_LOCATION", 1.1);
case ADDRESS_HOME_DEPENDENT_LOCALITY:
// In India a special regex is used for the locality (dependent locality).
if (context_->client_country == GeoIpCountryCode("IN")) {
return Match("IN_DEPENDENT_LOCALITY", 1.0);
}
return Match("ADDRESS_HOME_DEPENDENT_LOCALITY", 1.0);
case ADDRESS_HOME_STREET_NAME:
// A bit >1.0 to prefer a street name over address line 1.
return Match("ADDRESS_HOME_STREET_NAME", 1.1);
case ADDRESS_HOME_HOUSE_NUMBER:
return Match("ADDRESS_HOME_HOUSE_NUMBER", 1.1);
case ADDRESS_HOME_STREET_LOCATION:
// In India a special regex is used for the street location.
if (context_->client_country == GeoIpCountryCode("IN")) {
return Match("IN_STREET_LOCATION", 1.0);
}
// In most countries, street location is a combination of multiple
// fields. Therefore, the score is higher than the score of each compound.
return Match("ADDRESS_HOME_STREET_LOCATION", 1.5);
case ADDRESS_HOME_LANDMARK:
return Match("LANDMARK", 1.0);
case ADDRESS_HOME_BETWEEN_STREETS:
return Match("BETWEEN_STREETS", 1.5);
case ADDRESS_HOME_BETWEEN_STREETS_1:
// These are scored a bit higher than ADDRESS_HOME_STREET_NAME to give
// priority to ADDRESS_HOME_BETWEEN_STREETS_1/2. This is fine because the
// regex is more specific than the regex for ADDRESS_HOME_STREET_NAME.
return Match("BETWEEN_STREETS_LINE_1", 1.2);
case ADDRESS_HOME_BETWEEN_STREETS_2:
// Only considered after a "between streets 1" field has been assigned.
if (partial_classification_.contained_types.contains(
ADDRESS_HOME_BETWEEN_STREETS_1)) {
return Match("BETWEEN_STREETS_LINE_2", 1.2);
}
return std::nullopt;
case ADDRESS_HOME_ADMIN_LEVEL2:
// The score is a bit higher than city for MX because the term
// "Municipio/Delegación" should take precedence.
return Match("ADMIN_LEVEL_2", 1.1);
case ADDRESS_HOME_OVERFLOW:
return Match("OVERFLOW", 1.0);
case ADDRESS_HOME_BETWEEN_STREETS_OR_LANDMARK:
// Higher score because the field needs to contain hints for both
// between streets and landmark.
return Match("BETWEEN_STREETS_OR_LANDMARK", 1.7);
case ADDRESS_HOME_OVERFLOW_AND_LANDMARK:
// Higher score because the field needs to contain hints for both
// overflow and landmark.
return Match("OVERFLOW_AND_LANDMARK", 1.7);
case COMPANY_NAME:
// A bit less than 1.0 to prioritize an address line 2 interpretation.
// score(street address) + score(company name) <
// score(address line 1) + score(address line 2)
// A field that looks like an address line or apartment number is never
// classified as a company name.
if (!Match("ADDRESS_LINE_1", 1.0) && !Match("ADDRESS_LINE_2", 1.0) &&
!Match("ADDRESS_HOME_APT_NUM", 1.0)) {
return Match("COMPANY_NAME", 0.8);
}
return std::nullopt;
case ADDRESS_HOME_STREET_LOCATION_AND_LOCALITY:
// Only for IN: the field must match both component patterns.
if (context_->client_country == GeoIpCountryCode("IN") &&
Match("IN_STREET_LOCATION", 1.0) &&
Match("IN_DEPENDENT_LOCALITY", 1.0)) {
return 1.5;
}
return std::nullopt;
case ADDRESS_HOME_STREET_LOCATION_AND_LANDMARK:
// Only for IN: the field must match both component patterns.
if (context_->client_country == GeoIpCountryCode("IN") &&
Match("IN_STREET_LOCATION", 1.0) && Match("LANDMARK", 1.0)) {
return 1.5;
}
// Some Location and landmark fields are labeled "Address". We give it
// a 0.8 score to prefer a classification as a STREET_ADDRESS or
// ADDRESS_LINE_1, but allow a higher score when combined with a
// landmark.
if (context_->client_country == GeoIpCountryCode("IN") &&
Match("ADDRESS_LINE_1", 1.0)) {
return 0.8;
}
return std::nullopt;
case ADDRESS_HOME_DEPENDENT_LOCALITY_AND_LANDMARK:
// Only for IN: the field must match both component patterns.
if (context_->client_country == GeoIpCountryCode("IN") &&
Match("IN_DEPENDENT_LOCALITY", 1.0) && Match("LANDMARK", 1.0)) {
return 1.5;
}
return std::nullopt;
case ADDRESS_HOME_ZIP_AND_CITY:
// Only for FR: the field must match both the zip and the city pattern.
if (context_->client_country == GeoIpCountryCode("FR") &&
Match("ZIP_CODE", 1.0) && Match("CITY", 1.0)) {
return 1.5;
}
return std::nullopt;
// Address related fields that we don't parse (yet).
case DELIVERY_INSTRUCTIONS:
case ADDRESS_HOME_SUBPREMISE:
case ADDRESS_HOME_OTHER_SUBUNIT:
case ADDRESS_HOME_ADDRESS:
case ADDRESS_HOME_ADDRESS_WITH_NAME:
case ADDRESS_HOME_FLOOR:
case ADDRESS_HOME_SORTING_CODE:
return std::nullopt;
// Fields that are not processed by the AddressFieldParserNG.
case NAME_HONORIFIC_PREFIX:
case NAME_FIRST:
case NAME_MIDDLE:
case NAME_LAST:
case NAME_LAST_FIRST:
case NAME_LAST_CONJUNCTION:
case NAME_LAST_SECOND:
case NAME_MIDDLE_INITIAL:
case NAME_FULL:
case NAME_SUFFIX:
case ALTERNATIVE_FULL_NAME:
case ALTERNATIVE_GIVEN_NAME:
case ALTERNATIVE_FAMILY_NAME:
case EMAIL_ADDRESS:
case USERNAME_AND_EMAIL_ADDRESS:
case PHONE_HOME_NUMBER:
case PHONE_HOME_NUMBER_PREFIX:
case PHONE_HOME_NUMBER_SUFFIX:
case PHONE_HOME_CITY_CODE:
case PHONE_HOME_CITY_CODE_WITH_TRUNK_PREFIX:
case PHONE_HOME_COUNTRY_CODE:
case PHONE_HOME_CITY_AND_NUMBER:
case PHONE_HOME_CITY_AND_NUMBER_WITHOUT_TRUNK_PREFIX:
case PHONE_HOME_WHOLE_NUMBER:
case PHONE_HOME_EXTENSION:
case CREDIT_CARD_NAME_FULL:
case CREDIT_CARD_NAME_FIRST:
case CREDIT_CARD_NAME_LAST:
case CREDIT_CARD_NUMBER:
case CREDIT_CARD_EXP_MONTH:
case CREDIT_CARD_EXP_2_DIGIT_YEAR:
case CREDIT_CARD_EXP_4_DIGIT_YEAR:
case CREDIT_CARD_EXP_DATE_2_DIGIT_YEAR:
case CREDIT_CARD_EXP_DATE_4_DIGIT_YEAR:
case CREDIT_CARD_TYPE:
case CREDIT_CARD_VERIFICATION_CODE:
case CREDIT_CARD_STANDALONE_VERIFICATION_CODE:
case IBAN_VALUE:
case MERCHANT_PROMO_CODE:
case USERNAME:
case PASSWORD:
case ACCOUNT_CREATION_PASSWORD:
case CONFIRMATION_PASSWORD:
case SINGLE_USERNAME:
case SINGLE_USERNAME_FORGOT_PASSWORD:
case SINGLE_USERNAME_WITH_INTERMEDIATE_VALUES:
case NOT_PASSWORD:
case NOT_USERNAME:
case NOT_ACCOUNT_CREATION_PASSWORD:
case NEW_PASSWORD:
case PROBABLY_NEW_PASSWORD:
case NOT_NEW_PASSWORD:
case ONE_TIME_CODE:
case NO_SERVER_DATA:
case EMPTY_TYPE:
case AMBIGUOUS_TYPE:
case MERCHANT_EMAIL_SIGNUP:
case PRICE:
case NUMERIC_QUANTITY:
case SEARCH_TERM:
case PASSPORT_NUMBER:
case PASSPORT_ISSUING_COUNTRY:
case PASSPORT_EXPIRATION_DATE:
case PASSPORT_ISSUE_DATE:
case LOYALTY_MEMBERSHIP_PROGRAM:
case LOYALTY_MEMBERSHIP_PROVIDER:
case LOYALTY_MEMBERSHIP_ID:
case VEHICLE_LICENSE_PLATE:
case VEHICLE_VIN:
case VEHICLE_MAKE:
case VEHICLE_MODEL:
case VEHICLE_YEAR:
case VEHICLE_PLATE_STATE:
case DRIVERS_LICENSE_REGION:
case DRIVERS_LICENSE_NUMBER:
case DRIVERS_LICENSE_EXPIRATION_DATE:
case DRIVERS_LICENSE_ISSUE_DATE:
case EMAIL_OR_LOYALTY_MEMBERSHIP_ID:
case NATIONAL_ID_CARD_NUMBER:
case NATIONAL_ID_CARD_EXPIRATION_DATE:
case NATIONAL_ID_CARD_ISSUE_DATE:
case NATIONAL_ID_CARD_ISSUING_COUNTRY:
case REDRESS_NUMBER:
case KNOWN_TRAVELER_NUMBER:
case KNOWN_TRAVELER_NUMBER_EXPIRATION_DATE:
case ADDRESS_HOME_ZIP_PREFIX:
case ADDRESS_HOME_ZIP_SUFFIX:
case FLIGHT_RESERVATION_FLIGHT_NUMBER:
case FLIGHT_RESERVATION_TICKET_NUMBER:
case FLIGHT_RESERVATION_CONFIRMATION_CODE:
case FLIGHT_RESERVATION_DEPARTURE_DATE:
case ORDER_ID:
case ORDER_DATE:
case ORDER_MERCHANT_NAME:
case SHIPMENT_TRACKING_NUMBER:
case MAX_VALID_FIELD_TYPE:
return std::nullopt;
}
}
void AddressFieldParserNG::ParseRecursively() {
auto log_prefix = [&]() { return std::string(scanner_->GetOffset(), ' '); };
if (scanner_->IsEnd()) {
DVLOG(1) << log_prefix() << "END of input";
DVLOG(1) << log_prefix()
<< "score=" << SequenceToScoreString(partial_classification_)
<< ", best_score_so_far="
<< SequenceToScoreString(best_classification_)
<< ", plausible=" << IsClassificationPlausible();
// Store classification if it's better.
if (partial_classification_.BetterThan(best_classification_) &&
IsClassificationPlausible()) {
DVLOG(1) << log_prefix() << "NEW BEST SOLUTION";
best_classification_ = partial_classification_;
}
return;
}
// UNKNOWN_TYPE should always be the first element. If we have a match,
// we skip all other field types.
CHECK_EQ(*field_types_->supported_field_types().begin(), UNKNOWN_TYPE);
// Whether any field type could be assigned for the current scanner position.
bool found_extra_assignment = false;
for (FieldType field_type : field_types_->supported_field_types()) {
// Skip trying field_type if it's incompatible with already assigned types.
if (partial_classification_.contained_types.contains_any(
field_types_->incompatible_field_types(field_type))) {
DVLOG(1) << log_prefix() << "---- " << FieldTypeToStringView(field_type)
<< " conflict.";
continue;
}
std::optional<double> extra_score = FindScoreOfBestMatchingRule(field_type);
if (!extra_score) {
DVLOG(1) << log_prefix() << "---- " << FieldTypeToStringView(field_type)
<< " non-match.";
continue;
}
found_extra_assignment = true;
// Perform new assignment.
const double old_score = partial_classification_.score;
const std::optional<AutofillScanner::Position>
old_last_classified_field_index =
partial_classification_.last_classified_field_index;
if (field_type != UNKNOWN_TYPE) {
partial_classification_.contained_types.insert(field_type);
partial_classification_.assignments[field_type] = &scanner_->Cursor();
partial_classification_.last_classified_field_index =
scanner_->GetPosition();
}
partial_classification_.score += *extra_score;
DVLOG(1) << log_prefix() << "++++ " << FieldTypeToStringView(field_type)
<< " match. new score is " << partial_classification_.score;
const AutofillScanner::Position old_position = scanner_->GetPosition();
scanner_->Advance();
ParseRecursively();
scanner_->Restore(old_position);
// Revert new assignment.
if (field_type != UNKNOWN_TYPE) {
partial_classification_.contained_types.erase(field_type);
partial_classification_.assignments.erase(field_type);
partial_classification_.last_classified_field_index =
old_last_classified_field_index;
}
partial_classification_.score = old_score;
// If we had a match on UNKNOWN_TYPE (i.e. an email field, address lookup
// field, etc.), we don't want to try other field types. E.g. "address" is
// a substring of "email address" and should not be considered.
if (field_type == UNKNOWN_TYPE && extra_score) {
break;
}
}
if (!found_extra_assignment) {
DVLOG(1) << log_prefix() << "END did not find another classification.";
DVLOG(1) << log_prefix()
<< "score=" << SequenceToScoreString(partial_classification_)
<< ", best_score_so_far="
<< SequenceToScoreString(best_classification_)
<< ", plausible=" << IsClassificationPlausible();
if (partial_classification_.BetterThan(best_classification_) &&
IsClassificationPlausible()) {
DVLOG(1) << log_prefix() << "NEW BEST SOLUTION";
best_classification_ = partial_classification_;
}
}
}
bool AddressFieldParserNG::IsClassificationPlausible() const {
// The house number is easy to guess wrong (e.g. to mix up with a CC number).
// Therefore, we require extra evidence.
const FieldTypeSet& contained_types = partial_classification_.contained_types;
if (contained_types.contains(ADDRESS_HOME_HOUSE_NUMBER) &&
!contained_types.contains_any(
{ADDRESS_HOME_STREET_NAME, ADDRESS_HOME_OVERFLOW,
ADDRESS_HOME_LANDMARK, ADDRESS_HOME_OVERFLOW_AND_LANDMARK})) {
return false;
}
if (contained_types.contains(ADDRESS_HOME_APT_NUM) &&
!contained_types.contains_all(
{ADDRESS_HOME_STREET_NAME, ADDRESS_HOME_HOUSE_NUMBER})) {
return false;
}
return true;
}
} // namespace autofill