// blob: 244f7782b6c02b33151ddcdc812131d3677adc7f [file] [log] [blame]
// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_COMPONENT_H_
#define COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_COMPONENT_H_
#include <map>
#include <string>
#include <vector>
#include "base/memory/raw_ptr.h"
#include "base/strings/string_piece.h"
#include "components/autofill/core/browser/field_types.h"
#include "third_party/abseil-cpp/absl/types/optional.h"
// Forward declaration of the RE2 regular expression class. This header only
// refers to it through |const re2::RE2*| pointers, so the complete type is not
// required here.
namespace re2 {
class RE2;
} // namespace re2
namespace autofill {
// Forward declarations of helper types that are defined outside of this
// header. |AddressToken| is the element type of the sorted token vector of an
// AddressComponent; |SortedTokenComparisonResult| is passed by const reference
// to |AddressComponent::MergeSubsetComponent()|.
struct AddressToken;
struct SortedTokenComparisonResult;
// Represents the verification status of a value stored in the
// AutofillProfile.
// The associated integer values are used to store the verification status in
// SQL and must not be modified.
// GENERATED_JAVA_ENUM_PACKAGE: org.chromium.chrome.browser.autofill
enum class VerificationStatus {
// No verification status assigned.
kNoStatus = 0,
// The value token was parsed from a parent token.
kParsed = 1,
// Value was built from its subcomponents.
kFormatted = 2,
// The value was observed in a form transmission.
kObserved = 3,
// The user used the autofill settings to verify and store this token.
kUserVerified = 4,
// The token was parsed by the server.
kServerParsed = 5,
// Alias of the enumerator with the highest numeric value. Must be updated
// whenever a new enumerator is appended.
kMaxValue = kServerParsed
};
// Prints the string representation of |status| to |os|.
// NOTE(review): |std::ostream| is named here, but this header does not include
// <iosfwd> or <ostream> directly; the declaration presumably arrives through
// another include. Consider including <iosfwd> explicitly.
std::ostream& operator<<(std::ostream& os, VerificationStatus status);
// Returns true if |left| has a less significant verification status compared to
// |right|.
// NOTE(review): the significance order is defined by the implementation and
// is not visible in this header; do not assume it equals the numeric order of
// the |VerificationStatus| enumerators without checking.
bool IsLessSignificantVerificationStatus(VerificationStatus left,
VerificationStatus right);
// Returns the more significant of the two verification statuses |left| and
// |right|, where significance is determined by
// |IsLessSignificantVerificationStatus()|.
VerificationStatus GetMoreSignificantVerificationStatus(
VerificationStatus left,
VerificationStatus right);
// Bit flags that control if and how two components are merged. Several flags
// can be combined into a single merge mode value.
// The merge operations are attempted in the order in which they are listed
// below, and the first operation that succeeds ends the merge: later
// operations are not tested anymore. As a consequence, if
// |kUseBetterOrMostRecentIfDifferent| is active,
// |kMergeChildrenAndReformatIfNeeded| is never applied, because
// |kUseBetterOrMostRecentIfDifferent| is always applicable.
enum MergeMode {
  // When one of the components has an empty value, take the non-empty one.
  kReplaceEmpty = 1 << 0,
  // Merge two components recursively when both consist of the same tokens,
  // regardless of the token order. This is the default merge mode.
  kRecursivelyMergeTokenEquivalentValues = 1 << 1,
  // When both tokens share the same normalized value, keep the one with the
  // better verification status; on equal statuses keep the newer one.
  kUseBetterOrNewerForSameValue = 1 << 2,
  // When one component is a superset of the other, keep the subset.
  kReplaceSuperset = 1 << 3,
  // When one component is a subset of the other, keep the superset.
  kReplaceSubset = 1 << 4,
  // When the two components have different values, use the newer one.
  kUseNewerIfDifferent = 1 << 5,
  // When the newer component contains exactly one additional token, merge the
  // tokens with a recursive strategy.
  kRecursivelyMergeSingleTokenSubset = 1 << 6,
  // When one value is a substring of the other, use the most recent one.
  kUseMostRecentSubstring = 1 << 7,
  // When the tokens match or one is a subset of the other, pick the shorter
  // one.
  kPickShorterIfOneContainsTheOther = 1 << 8,
  // When the normalized values differ, keep the one with the better
  // verification score, or the most recent one if both scores are equal.
  kUseBetterOrMostRecentIfDifferent = 1 << 9,
  // Merge the child nodes and, if the value changed, reformat the node from
  // its children after the merge.
  kMergeChildrenAndReformatIfNeeded = 1 << 10,
  // Base the merge decision on canonicalized values.
  kMergeBasedOnCanonicalizedValues = 1 << 11,
  // The merging behavior that is used by default.
  kDefault = kRecursivelyMergeTokenEquivalentValues
};
// An AddressComponent is a tree structure that represents a semi-structured
// address token. Such an address token can either be an atomic leaf node or
// have a set of children, each representing a more granular subtoken of the
// component.
//
// An AddressComponent has a string representation stored in |value_| and a
// VerificationStatus stored in |value_verification_status_|.
// The latter indicates if the value was user-verified, observed in a form
// submission event, parsed from its parent component or was formatted from its
// child components.
//
// In a proper component tree, each AddressComponent has a unique
// ServerFieldType. Additionally, an AddressComponent may be associated with a
// list of additional field types that allow for retrieving and setting the
// Component's value in specific formats. For example, NAME_MIDDLE may be the
// storage type and NAME_MIDDLE_INITIAL is an additional field type.
//
// The usage pattern of such an address tree is as follows:
//
// * Create a tree from an observed form submission or a profile editing or
// creation event in the Chrome settings. It is assumed that the created
// tree does not have values for competing field types. Two types are competing
// iff they are on a common root-to-leaf path. For example, an imported profile
// with a value for NAME_FULL and NAME_LAST has conflicting types that
// carry redundant information.
//
// * After the creation of the tree, the values of unassigned nodes in the tree
// are deduced from the values of assigned nodes. This happens by parsing
// (taking a string and splitting it into components) or by formatting (taking
// one or multiple strings and combining them into one string).
//
// * After the completion, there should be no need to modify the tree.
//
// * A tree may be mergeable with another tree of the same type. This
// operation incorporates complementing observations. For example, in the first
// tree NAME_FIRST, NAME_MIDDLE and NAME_LAST may be parsed from an observed
// unstructured name (NAME_FULL). The second tree may be built from observing
// the structured name, and contain observed NAME_FIRST, NAME_MIDDLE and
// NAME_LAST values but only a formatted NAME_FULL value.
class AddressComponent {
public:
// Constructor for a compound child node.
// |storage_type| is the storable Autofill type of the component, |parent| is
// the parent node (nullptr for the root node of a tree), and |merge_mode|
// defines if and how this component can be merged with another one.
// NOTE(review): |merge_mode| is presumably a bitwise combination of
// |MergeMode| flags - confirm against the implementation. It is declared as
// |unsigned int| here while |merge_mode_| and |SetMergeModeForTesting()| use
// |int|.
AddressComponent(ServerFieldType storage_type,
AddressComponent* parent,
unsigned int merge_mode);
// Disallows copies and direct assignments since they are not needed in the
// current Autofill design. Use |CopyFrom()| to copy values and statuses.
AddressComponent(const AddressComponent& other) = delete;
AddressComponent& operator=(const AddressComponent& right) = delete;
virtual ~AddressComponent();
// Migrates from a legacy structure in which tokens are imported without
// a status. The default implementation does nothing.
virtual void MigrateLegacyStructure() {}
// Comparison operators are deleted in favor of |SameAs()|.
bool operator==(const AddressComponent& right) const = delete;
bool operator!=(const AddressComponent& right) const = delete;
// Compares the values and verification statuses with |other| recursively down
// the tree. Returns true iff all values and verification statuses of this
// node and its subtree and |other| with its subtree are the same.
bool SameAs(const AddressComponent& other) const;
// Copies the values and verification statuses from |other| recursively down
// the tree.
void CopyFrom(const AddressComponent& other);
// Returns the autofill storage type stored in |storage_type_|.
ServerFieldType GetStorageType() const;
// Returns the string representation of |storage_type_|.
std::string GetStorageTypeName() const;
// Returns the verification status of the component's value.
VerificationStatus GetVerificationStatus() const;
// Returns true if the component has no subcomponents.
bool IsAtomic() const;
// Returns a constant reference to |value_.value()|. If the value is not
// assigned, an empty string is returned.
const std::u16string& GetValue() const;
// Returns a canonicalized version of the value or absl::nullopt if
// canonicalization is not possible or not implemented.
virtual absl::optional<std::u16string> GetCanonicalizedValue() const;
// Returns true if the value of this AddressComponent is assigned.
bool IsValueAssigned() const;
// Sets the value corresponding to the storage type of this AddressComponent,
// with |status| as its verification status.
virtual void SetValue(std::u16string value, VerificationStatus status);
// Wrapper function around
// SetValueForTypeIfPossible(/*invalidate_child_nodes=*/false);
// NOTE(review): |SetValueForTypeIfPossible| is not declared in this header.
// The returned bool presumably reports whether |field_type| was supported
// and the value was set - confirm against the implementation.
bool SetValueForType(ServerFieldType field_type,
const std::u16string& value,
const VerificationStatus& status);
// Wrapper function around
// SetValueForTypeIfPossible(/*invalidate_child_nodes=*/true);
// See the note on |SetValueForType()| regarding the return value.
// TODO(1440504): Remove and merge with SetValueForType.
bool SetValueForTypeAndResetSubstructure(ServerFieldType field_type,
const std::u16string& value,
const VerificationStatus& status);
// Sets the value to an empty string, marks it unassigned and sets the
// verification status to |kNoStatus|.
virtual void UnsetValue();
// Convenience method to get the value of |field_type|.
// Returns an empty string if |field_type| is not supported.
std::u16string GetValueForType(ServerFieldType field_type) const;
// Convenience method to get the value of `field_type` to be used for
// comparison with `other`. Returns an empty string if `field_type` is not
// supported.
std::u16string GetValueForComparisonForType(
ServerFieldType field_type,
const AddressComponent& other) const;
// Convenience method to get the verification status of `field_type`.
// Returns |VerificationStatus::kNoStatus| if `field_type` is not supported.
VerificationStatus GetVerificationStatusForType(
ServerFieldType field_type) const;
// Returns true if the value and the verification status were successfully
// unset for |field_type|.
bool UnsetValueForTypeIfSupported(ServerFieldType field_type);
// Parses |value_| to assign values to the subcomponents.
// The method uses 3 stages:
//
// * Use |ParseValueAndAssignSubcomponentsByMethod()|. This stage exists
// to catch special cases and may fail. The method is virtual and can be
// implemented on the type level.
//
// * Use |ParseValueAndAssignSubcomponentsByRegularExpressions()|. This stage
// uses a list of regular expressions acquired by the virtual method
// |GetParseRegularExpressionsByRelevance()|. This stage may fail.
//
// * Use |ParseValueAndAssignSubcomponentsByFallbackMethod()| as the last
// resort to parse |value_|. This method must produce a valid result.
void ParseValueAndAssignSubcomponents();
// This method populates the unassigned entries in the subtree of this node
// by either parsing unknown values for subcomponents from their parents, or
// vice versa, formatting unknown values from known subcomponents. The method
// is virtual and can be reimplemented on the type level.
virtual void RecursivelyCompleteTree();
// Completes the full tree by calling |RecursivelyCompleteTree()| starting
// from the root node. Returns true if the completion was successful.
virtual bool CompleteFullTree();
// Checks if a tree is completable in the sense that there are no conflicting
// observed or verified types. This means that there is not more than one
// observed or verified node on any root-to-leaf path in the tree.
bool IsTreeCompletable();
// Recursively adds the supported types to the set. Calls
// |GetAdditionalSupportedFieldTypes()| to add field types.
void GetSupportedTypes(ServerFieldTypeSet* supported_types) const;
// Returns the additional field types supported by this node. The result is
// used by |GetSupportedTypes()|, which adds these types to its
// |supported_types| set. The returned types should not be part of that set
// yet; this should be enforced with a DCHECK.
virtual const ServerFieldTypeSet GetAdditionalSupportedFieldTypes() const;
// Unassigns all nodes with parsed or formatted values in the entire tree.
void UnsetParsedAndFormattedValuesInEntireTree();
// Unassigns all nodes with parsed or formatted values.
// NOTE(review): judging by the name, this presumably covers this node and its
// subtree only, in contrast to |UnsetParsedAndFormattedValuesInEntireTree()|
// - confirm against the implementation.
void RecursivelyUnsetParsedAndFormattedValues();
// Returns true if both components are mergeable.
virtual bool IsMergeableWithComponent(
const AddressComponent& newer_component) const;
// Recursively updates the verification statuses to the higher one, for nodes
// in |newer_component| that have the same values as the nodes in |this|.
virtual void MergeVerificationStatuses(
const AddressComponent& newer_component);
// Merge |newer_component| into this AddressComponent.
// Returns false if the merging is not possible.
// The state of the component is not altered by a failed merging attempt.
// |newer_was_more_recently_used| indicates that the newer component was also
// more recently used for filling a form.
// NOTE(review): this virtual function has a default argument. Default
// arguments are chosen by the static type of the object expression at the
// call site, so overrides should repeat the same default value.
virtual bool MergeWithComponent(const AddressComponent& newer_component,
bool newer_was_more_recently_used = true);
// Merge |newer_component| into this AddressComponent.
// The merging is possible iff the value of both root nodes is token
// equivalent, meaning they contain the same tokens in an arbitrary order.
// Returns false if the merging is not possible.
// The state of the component is not altered by a failed merging attempt.
bool MergeTokenEquivalentComponent(const AddressComponent& newer_component);
// Returns a constant reference to the vector of pointers to the child nodes
// of the component.
const std::vector<AddressComponent*>& Subcomponents() const {
return subcomponents_;
}
// Returns a vector containing sorted normalized tokens of the
// value of the component. The vector is returned by value.
// NOTE(review): the tokens were documented here as "lazily calculated when
// first needed", whereas the comment on |sorted_normalized_tokens_| states
// that they are calculated when |value_| is set. Confirm which one is true.
const std::vector<AddressToken> GetSortedTokens() const;
// Recursively unsets all subcomponents.
void RecursivelyUnsetSubcomponents();
// Returns whether the value associated with |field_type| is valid.
// If |wipe_if_not| is true, the value is unset if it is invalid.
bool IsValueForTypeValid(ServerFieldType field_type,
bool wipe_if_not = false);
// While processing two structured addresses, if only one of them has their
// country set, the other should assume the non-empty one while merging. This
// is required to do consistent address rewriting.
// Returns the common country to be used.
std::u16string GetCommonCountry(const AddressComponent& other) const;
// Deletes the stored structure and returns true if |IsStructureValid()|
// returns false.
virtual bool WipeInvalidStructure();
// Returns if the structure in the tree below this node is valid. A structure
// becomes invalid when it contains information that is not contained in the
// value of this node.
bool IsStructureValid() const;
// Returns true if all values of all descendant nodes are empty.
bool AllDescendantsAreEmpty() const;
// The following helpers expose protected and private functionality to tests.
// They are only compiled when UNIT_TEST is defined.
#ifdef UNIT_TEST
// Initiates the formatting of the values from the subcomponents.
void FormatValueFromSubcomponentsForTesting() {
FormatValueFromSubcomponents();
}
// Returns the best format string for testing.
std::u16string GetBestFormatStringForTesting() {
return GetBestFormatString();
}
// Returns the parse expressions by relevance for testing.
std::vector<const re2::RE2*>
GetParseRegularExpressionsByRelevanceForTesting() {
return GetParseRegularExpressionsByRelevance();
}
// Returns a reference to the root node of the tree for testing.
AddressComponent& GetRootNodeForTesting() { return GetRootNode(); }
// Replaces placeholder values in the best format string with the
// corresponding values.
std::u16string GetReplacedPlaceholderTypesWithValuesForTesting() const {
return ReplacePlaceholderTypesWithValues(GetBestFormatString());
}
// Returns a vector containing the |storage_type_| of all direct
// subcomponents.
std::vector<ServerFieldType> GetSubcomponentTypesForTesting() const {
return GetSubcomponentTypes();
}
// Sets the merge mode for testing purposes by overwriting |merge_mode_|.
void SetMergeModeForTesting(int merge_mode) { merge_mode_ = merge_mode; }
// Returns the value used for comparison for testing purposes.
std::u16string GetValueForComparisonForTesting(
const AddressComponent& other) const {
return GetValueForComparison(other);
}
#endif
protected:
// Returns the verification score of this component and its substructure.
// Each observed node contributes to the verification score by 1.
virtual int GetStructureVerificationScore() const;
// Returns whether `field_type` is a supported type for the current node.
bool IsSupportedType(ServerFieldType field_type) const;
// Returns a vector containing the |storage_type_| of all direct
// subcomponents.
std::vector<ServerFieldType> GetSubcomponentTypes() const;
// Heuristic method to get the best suited format string.
// This method is virtual and can be reimplemented for each type.
virtual std::u16string GetBestFormatString() const;
// Returns pointers to regular expressions sorted by their relevance.
// This method is virtual and can be reimplemented for each type.
virtual std::vector<const re2::RE2*> GetParseRegularExpressionsByRelevance()
const;
// Method to parse |value_| into the values of |subcomponents_|. The
// purpose of this method is to cover special cases. This method returns true
// on success and is allowed to fail. On failure, the |subcomponents_| are not
// altered.
virtual bool ParseValueAndAssignSubcomponentsByMethod();
// This method parses |value_| to assign values to the subcomponents.
// The method is virtual and can be reimplemented per type.
// It must succeed.
virtual void ParseValueAndAssignSubcomponentsByFallbackMethod();
// This method is used to set the value given by a type different than the
// storage type. It must implement the conversion logic specific to each type.
virtual void SetValueForOtherSupportedType(ServerFieldType field_type,
const std::u16string& value,
const VerificationStatus& status);
// This method is used to retrieve the value for a supported field type
// different from the storage type. It must implement the conversion logic
// specific to each type.
virtual std::u16string GetValueForOtherSupportedType(
ServerFieldType field_type) const;
// This method is used to retrieve the value for a supported field type
// different from the storage type, and rewrites it for comparison with
// `other`. It must implement the conversion logic specific to each type.
virtual std::u16string GetValueForComparisonForOtherSupportedType(
ServerFieldType field_type,
const AddressComponent& other) const;
// Clears all parsed and formatted values.
void ClearAllParsedAndFormattedValues();
// Merges |subset_component|, a component that has exactly one token less.
// |token_comparison_result| is the result of comparing the sorted tokens of
// both components. Returns true on success (presumably - confirm against the
// implementation).
bool MergeSubsetComponent(
const AddressComponent& subset_component,
const SortedTokenComparisonResult& token_comparison_result);
// Consumes an additional token into the most appropriate subcomponent.
// Can be implemented by the specific node types.
// The fall-back solution uses the first empty node.
// If no empty node is available, it appends the value to the first node.
virtual void ConsumeAdditionalToken(const std::u16string& token_value);
// Returns a reference to the root node of the tree.
AddressComponent& GetRootNode();
// Const version of |GetRootNode()|.
const AddressComponent& GetRootNode() const;
// Function to determine if the value stored in this component is valid.
// Returns true by default but can be overridden by a subclass.
virtual bool IsValueValid() const;
// Function to be called post assign to do sanitization. The default
// implementation does nothing.
virtual void PostAssignSanitization() {}
// Returns a normalized value for comparison.
// In the default implementation, this converts the value to lower case and
// removes white spaces. This function may be reimplemented to perform
// different normalization operations.
virtual std::u16string GetNormalizedValue() const;
// Returns a value used for comparison.
// In the default implementation this is just the normalized value but this
// function can be overridden in subclasses to apply further operations on
// the normalized value.
// |other| represents the component we are comparing with and is required
// for consistent rewriting rules.
virtual std::u16string GetValueForComparison(
const AddressComponent& other) const;
// Returns true if the merging of two token identical values should give
// precedence to the newer value.
virtual bool HasNewerValuePrecedenceInMerging(
const AddressComponent& newer_component) const;
// Parses |value| by using |parse_expression| and assigns the values.
// Returns true on success.
bool ParseValueAndAssignSubcomponentsByRegularExpression(
const std::u16string& value,
const re2::RE2* parse_expression);
// Determines and sets a formatted value using
// |GetFormattedValueFromSubcomponents|.
void FormatValueFromSubcomponents();
// Returns the maximum number of components with assigned values on the path
// from the component to a leaf node.
int MaximumNumberOfAssignedAddressComponentsOnNodeToLeafPaths() const;
private:
// Returns the node in the tree that supports `field_type`. This node, if it
// exists, is unique by definition. Returns nullptr if no such node exists.
AddressComponent* GetNodeForType(ServerFieldType field_type);
// const version of GetNodeForType.
const AddressComponent* GetNodeForType(ServerFieldType field_type) const;
// Function to be called by child nodes on construction to register
// themselves as child nodes.
void RegisterChildNode(AddressComponent* child);
// Unsets the node and all of its children.
void UnsetAddressComponentAndItsSubcomponents();
// Unsets the children of a node.
void UnsetSubcomponents();
// In cases where the tree has been initially completed, there might still be
// nodes that are empty (e.g. a new leaf or internal node got recently
// introduced). Gap filling addresses all those cases.
// TODO(crbug.com/1440168) Include also parsing as a strategy for gap filling.
void FillTreeGaps();
// Determines a value from the subcomponents by using the
// most suitable format string determined by |GetBestFormatString()|.
std::u16string GetFormattedValueFromSubcomponents();
// Replaces the placeholder types in |format| with the corresponding values.
std::u16string ReplacePlaceholderTypesWithValues(
const std::u16string& format) const;
// Replaces the placeholder types in |format| with the corresponding values.
// NOTE(review): judging by the name, this is a regular-expression-based
// variant of |ReplacePlaceholderTypesWithValues()| - confirm how the two
// differ in the implementation.
std::u16string ReplacePlaceholderTypesWithValuesRegexVersion(
const std::u16string& format) const;
// This method uses regular expressions acquired by
// |GetParseRegularExpressionsByRelevance| to parse |value_| into the values
// of the subcomponents. Returns true on success and is allowed to fail.
bool ParseValueAndAssignSubcomponentsByRegularExpressions();
// The unstructured value of this component. It holds no value while the
// value is unassigned.
absl::optional<std::u16string> value_;
// The verification status of |value_| indicates the certainty of the value
// to be correct.
VerificationStatus value_verification_status_;
// The storable Autofill type of the component.
const ServerFieldType storage_type_;
// A vector of pointers to the subcomponents.
// NOTE(review): these are raw pointers and this header does not show who
// owns the child nodes; children register themselves through
// |RegisterChildNode()| - confirm ownership and lifetime in the subclasses.
std::vector<AddressComponent*> subcomponents_;
// A vector that contains the tokens of |value_| after normalization,
// meaning that it was converted to lower case and diacritics have been
// removed. |value_| is tokenized by splitting the string by white spaces and
// commas. It is calculated when |value_| is set.
// NOTE(review): the comment on |GetSortedTokens()| described the tokens as
// lazily calculated instead - confirm which description is correct.
absl::optional<std::vector<AddressToken>> sorted_normalized_tokens_;
// A pointer to the parent node. It is set to nullptr if the node is the root
// node of the AddressComponent tree.
const raw_ptr<AddressComponent> parent_;
// Defines if and how two components can be merged.
// NOTE(review): stored as |int| although the constructor receives it as
// |unsigned int|.
int merge_mode_;
};
} // namespace autofill
#endif // COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_COMPONENT_H_