Improve `StructuredAddressesRegExProvider` readability
This CL improves the readability of the
`StructuredAddressesRegExProvider` by making the following changes:
- Replaced kOptional... regexes with non-optional versions,
using the `MatchQuantifier::kOptional` argument instead.
- Created `CaptureTypeWithPatternOptional` and
`NoCapturePatternOptional` wrappers and used them where applicable.
- Removed duplicate entries from the LastNamePrefix regex and
improved the comprehensiveness of the Dutch regexes by adding
missing prefixes such as "von der".
Bug: 386916943
Change-Id: I9756d1e9663266e3ab1519af5c450e2a40dbfb1c
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6218544
Commit-Queue: Piotr Kotynia <piotrkotynia@google.com>
Reviewed-by: Dominic Battré <battre@chromium.org>
Reviewed-by: Norge Vizcay <vizcay@google.com>
Cr-Commit-Position: refs/heads/main@{#1413983}
diff --git a/components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.cc b/components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.cc
index 3d1330b0..9a48a86 100644
--- a/components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.cc
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.cc
@@ -68,10 +68,10 @@
"|lt|ltc|ltg|ltjg|maj|major|mg|pastor|prof|rep|reverend"
"|rev|sen|st)";
-// Regular expression pattern for an optional last name suffix.
-const char kOptionalLastNameSuffixRe[] =
- "(?:(?:(?:b\\.a|ba|d\\.d\\.s|dds|ii|iii|iv|ix|jr|m\\.a|m\\.d|md|ms|"
- "ph\\.?d|sr|v|vi|vii|viii|x)\\.?)?)";
+// Regular expression pattern for a last name suffix.
+const char kLastNameSuffixRe[] =
+ "(?:(?:b\\.a|ba|d\\.d\\.s|dds|ii|iii|iv|ix|jr|m\\.a|m\\.d|md|ms|"
+ "ph\\.?d|sr|v|vi|vii|viii|x)\\.?)";
// Regular expression pattern for a CJK character.
const char kCjkCharacterRe[] =
@@ -140,17 +140,16 @@
// "le"/"la" as used in Hispanic/Latinx names.
// * The matching of "i" is made lazy to give the last name conjunction
// precedence.
-const char kOptionalLastNamePrefixRe[] =
- "(?:(?:"
- "a|ab|af|av|ap|abu|aït|al|ālam|aust|austre|bar|bath|bat|ben|bin|ibn|bet|"
+const char kLastNamePrefixRe[] =
+ "(?:a|ab|af|av|ap|abu|aït|al|ālam|aust|austre|bar|bath|bat|ben|bin|ibn|bet|"
"bint|binti|binte|da|das|de|degli|dele|del|du|della|der|di|dos|du|e|el|"
"fetch|vetch|fitz|i??|kil|gil|de le|de "
"la|la|le|lille|lu|m|mac|mc|mck|mhic|mic|mala|"
"mellom|myljom|na|ned|nedre|neder|nic|ni|nin|nord|norr|ny|o|ua|"
- "ui|opp|upp|öfver|ost|öst|öster|øst|øst|østre|över|øvste|øvre|øver|öz|pour|"
- "putra|putri|setia|tor|söder|sør|sønder|sør|syd|søndre|syndre|søre|ter|ter|"
- "tre|van|van der|väst|väster|verch|erch|vest|vestre|vesle|vetle|von|zu|von "
- "und zu)\\s)?";
+ "ui|opp|upp|öfver|ost|öst|öster|øst|østre|över|øvste|øvre|øver|öz|pour|"
+ "putra|putri|setia|tor|söder|sør|sønder|sør|syd|søndre|syndre|søre|ter|"
+ "tre|väst|väster|verch|erch|vest|vestre|vesle|vetle|zu|von und zu|"
+ "v[ao]n de[nr]?|v[ao]n)";
// Regular expression to match the affixes that indicate the floor an
// apartment is located in.
@@ -185,9 +184,8 @@
return CaptureTypeWithPattern(
NAME_FULL,
{// Parse one or more CJK characters into the last name.
- CaptureTypeWithPattern(
- NAME_LAST, kCjkCharactersRe,
- {.separator = kCjkNameSeperatorsRe}),
+ CaptureTypeWithPattern(NAME_LAST, kCjkCharactersRe,
+ {.separator = kCjkNameSeperatorsRe}),
// Parse the remaining CJK characters into the first name.
CaptureTypeWithPattern(NAME_FIRST, kCjkCharactersRe)});
}
@@ -196,9 +194,8 @@
return CaptureTypeWithPattern(
ALTERNATIVE_FULL_NAME,
{// Parse one or more CJK characters into the last name.
- CaptureTypeWithPattern(
- ALTERNATIVE_FAMILY_NAME, kCjkCharactersRe,
- {.separator = kCjkNameSeperatorsRe}),
+ CaptureTypeWithPattern(ALTERNATIVE_FAMILY_NAME, kCjkCharactersRe,
+ {.separator = kCjkNameSeperatorsRe}),
// Parse the remaining CJK characters into the first name.
CaptureTypeWithPattern(ALTERNATIVE_GIVEN_NAME, kCjkCharactersRe)});
}
@@ -255,16 +252,16 @@
"^", kCjkCharactersRe,
// Followed by an optional separator with one
// or more additional CJK characters.
- "(", kCjkNameSeperatorsRe,
- kCjkCharactersRe, ")?$"});
+ "(", kCjkNameSeperatorsRe, kCjkCharactersRe, ")?$"});
}
// Returns an expression to parse a full name that contains only a last name.
std::string ParseOnlyLastNameExpression() {
return CaptureTypeWithPattern(
NAME_FULL, {CaptureTypeWithPattern(
- NAME_LAST, {kOptionalLastNamePrefixRe, kSingleWordRe}),
- kOptionalLastNameSuffixRe});
+ NAME_LAST, {NoCapturePatternOptional(kLastNamePrefixRe),
+ kSingleWordRe}),
+ NoCapturePatternOptional(kLastNameSuffixRe)});
}
// Returns an expression to parse a name that consists of a first, middle and
@@ -278,18 +275,15 @@
std::string ParseFirstMiddleLastNameExpression() {
return CaptureTypeWithPattern(
NAME_FULL,
- {NoCapturePattern(
- kHonorificPrefixRe,
- CaptureOptions{.quantifier = MatchQuantifier::kOptional}),
- CaptureTypeWithPattern(
- NAME_FIRST, kSingleWordRe,
- CaptureOptions{.quantifier = MatchQuantifier::kOptional}),
+ {NoCapturePatternOptional(kHonorificPrefixRe),
+ CaptureTypeWithPatternOptional(NAME_FIRST, kSingleWordRe),
CaptureTypeWithPattern(
NAME_MIDDLE, kMultipleLazyWordsRe,
CaptureOptions{.quantifier = MatchQuantifier::kLazyOptional}),
- CaptureTypeWithPattern(NAME_LAST,
- {kOptionalLastNamePrefixRe, kSingleWordRe}),
- kOptionalLastNameSuffixRe});
+ CaptureTypeWithPattern(
+ NAME_LAST,
+ {NoCapturePatternOptional(kLastNamePrefixRe), kSingleWordRe}),
+ NoCapturePatternOptional(kLastNameSuffixRe)});
}
// Returns an expression to parse a name that starts with the last name,
@@ -302,15 +296,12 @@
std::string ParseLastCommaFirstMiddleExpression() {
return CaptureTypeWithPattern(
NAME_FULL,
- {NoCapturePattern(
- kHonorificPrefixRe,
- CaptureOptions{.quantifier = MatchQuantifier::kOptional}),
- CaptureTypeWithPattern(NAME_LAST,
- {kOptionalLastNamePrefixRe, kSingleWordRe},
- {.separator = "\\s*,\\s*"}),
+ {NoCapturePatternOptional(kHonorificPrefixRe),
CaptureTypeWithPattern(
- NAME_FIRST, kSingleWordRe,
- CaptureOptions{.quantifier = MatchQuantifier::kOptional}),
+ NAME_LAST,
+ {NoCapturePatternOptional(kLastNamePrefixRe), kSingleWordRe},
+ {.separator = "\\s*,\\s*"}),
+ CaptureTypeWithPatternOptional(NAME_FIRST, kSingleWordRe),
CaptureTypeWithPattern(
NAME_MIDDLE, kMultipleLazyWordsRe,
CaptureOptions{.quantifier = MatchQuantifier::kLazyOptional})});
@@ -327,13 +318,14 @@
std::string ParseHispanicLastNameExpression() {
return CaptureTypeWithPattern(
NAME_LAST,
- {CaptureTypeWithPattern(NAME_LAST_FIRST,
- {kOptionalLastNamePrefixRe, kSingleWordRe}),
+ {CaptureTypeWithPattern(
+ NAME_LAST_FIRST,
+ {NoCapturePatternOptional(kLastNamePrefixRe), kSingleWordRe}),
+ CaptureTypeWithPatternOptional(NAME_LAST_CONJUNCTION,
+ kHispanicLastNameConjunctionsRe),
CaptureTypeWithPattern(
- NAME_LAST_CONJUNCTION, kHispanicLastNameConjunctionsRe,
- CaptureOptions{.quantifier = MatchQuantifier::kOptional}),
- CaptureTypeWithPattern(NAME_LAST_SECOND,
- {kOptionalLastNamePrefixRe, kSingleWordRe})});
+ NAME_LAST_SECOND,
+ {NoCapturePatternOptional(kLastNamePrefixRe), kSingleWordRe})});
}
// Returns an expression to parse a full Hispanic/Latinx name that
@@ -342,9 +334,7 @@
std::string ParseHispanicFullNameExpression() {
return CaptureTypeWithPattern(
NAME_FULL,
- {NoCapturePattern(
- kHonorificPrefixRe,
- CaptureOptions{.quantifier = MatchQuantifier::kOptional}),
+ {NoCapturePatternOptional(kHonorificPrefixRe),
CaptureTypeWithPattern(
NAME_FIRST, kMultipleLazyWordsRe,
CaptureOptions{.quantifier = MatchQuantifier::kLazyOptional}),
@@ -376,7 +366,7 @@
kHouseNumberOptionalPrefixRe,
"(?:\\d+\\w?)", "(th\\.|\\.)?")},
CaptureOptions{.separator = ""}),
- CaptureTypeWithPattern(
+ CaptureTypeWithPatternOptional(
ADDRESS_HOME_SUBPREMISE,
{
CaptureTypeWithPrefixedPattern(
@@ -386,8 +376,7 @@
ADDRESS_HOME_APT_NUM, kApartmentNumberPrefix,
"(?:(\\d{1,3}\\w?|\\w))",
CaptureOptions{.quantifier = MatchQuantifier::kOptional}),
- },
- CaptureOptions{.quantifier = MatchQuantifier::kOptional})});
+ })});
}
// Returns an expression to parse a street address into the street name, the
@@ -409,7 +398,7 @@
"(?:\\d+\\w?)", "(th\\.|\\.)?")},
CaptureOptions{.separator = ""}),
- CaptureTypeWithPattern(
+ CaptureTypeWithPatternOptional(
ADDRESS_HOME_SUBPREMISE,
{
CaptureTypeWithSuffixedPattern(
@@ -419,8 +408,7 @@
ADDRESS_HOME_APT_NUM, "(-\\s*)?", "(?:(\\d{1,3}\\w?|\\w))",
kApartmentNumberSuffix,
CaptureOptions{.quantifier = MatchQuantifier::kOptional}),
- },
- CaptureOptions{.quantifier = MatchQuantifier::kOptional})});
+ })});
}
// Returns an expression to parse a street address into the street name, the
@@ -441,7 +429,7 @@
"(?:\\d+\\w?)", "(th\\.|\\.)?")},
{.separator = ""}),
- CaptureTypeWithPattern(
+ CaptureTypeWithPatternOptional(
ADDRESS_HOME_SUBPREMISE,
{
CaptureTypeWithSuffixedPattern(
@@ -451,8 +439,7 @@
ADDRESS_HOME_APT_NUM, kApartmentNumberPrefix,
"(?:(\\d{0,3}\\w?))",
CaptureOptions{.quantifier = MatchQuantifier::kOptional}),
- },
- CaptureOptions{.quantifier = MatchQuantifier::kOptional})});
+ })});
}
// Returns an expression to parse a street address into the street name, the
@@ -472,7 +459,7 @@
CaptureTypeWithPattern(ADDRESS_HOME_STREET_NAME,
kMultipleLazyWordsRe)},
{.separator = ""}),
- CaptureTypeWithPattern(
+ CaptureTypeWithPatternOptional(
ADDRESS_HOME_SUBPREMISE,
{
CaptureTypeWithPrefixedPattern(
@@ -482,8 +469,7 @@
ADDRESS_HOME_APT_NUM, kApartmentNumberPrefix,
"(?:(\\d{0,3}\\w?))",
CaptureOptions{.quantifier = MatchQuantifier::kOptional}),
- },
- CaptureOptions{.quantifier = MatchQuantifier::kOptional})});
+ })});
}
} // namespace
diff --git a/components/autofill/core/browser/data_model/autofill_structured_address_utils.cc b/components/autofill/core/browser/data_model/autofill_structured_address_utils.cc
index 9ad8f4b..b1440959 100644
--- a/components/autofill/core/browser/data_model/autofill_structured_address_utils.cc
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_utils.cc
@@ -256,6 +256,11 @@
{"(?i:", pattern, "(?:", options.separator, ")+)", quantifier});
}
+std::string NoCapturePatternOptional(const std::string& pattern) {
+ return NoCapturePattern(
+ pattern, CaptureOptions{.quantifier = MatchQuantifier::kOptional});
+}
+
std::string CaptureTypeWithAffixedPattern(const FieldType& type,
const std::string& prefix,
const std::string& pattern,
@@ -306,6 +311,19 @@
std::string(), options);
}
+std::string CaptureTypeWithPatternOptional(const FieldType& type,
+ const std::string& pattern) {
+ return CaptureTypeWithPattern(type, pattern,
+ {.quantifier = MatchQuantifier::kOptional});
+}
+
+std::string CaptureTypeWithPatternOptional(
+ const FieldType& type,
+ std::initializer_list<std::string_view> pattern_span_initializer_list) {
+ return CaptureTypeWithPatternOptional(
+ type, base::StrCat(base::span(pattern_span_initializer_list)));
+}
+
std::u16string NormalizeAndRewrite(const AddressCountryCode& country_code,
const std::u16string& text,
bool keep_white_space) {
diff --git a/components/autofill/core/browser/data_model/autofill_structured_address_utils.h b/components/autofill/core/browser/data_model/autofill_structured_address_utils.h
index 154810c..e6be957 100644
--- a/components/autofill/core/browser/data_model/autofill_structured_address_utils.h
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_utils.h
@@ -207,6 +207,9 @@
std::string NoCapturePattern(const std::string& pattern,
const CaptureOptions& options = CaptureOptions());
+// A wrapper for NoCapturePattern() that makes the match optional.
+std::string NoCapturePatternOptional(const std::string& pattern);
+
// Returns a capture group named by the string representation of |type| that
// matches |pattern| with an additional uncaptured |prefix_pattern| and
// |suffix_pattern|.
@@ -240,6 +243,16 @@
const std::string& pattern,
const CaptureOptions options = CaptureOptions());
+// A wrapper for CaptureTypeWithPattern() that makes the match optional.
+std::string CaptureTypeWithPatternOptional(const FieldType& type,
+ const std::string& pattern);
+
+// Calls CaptureTypeWithPatternOptional with a pattern created by the
+// concatenation of the string_views in |pattern_span_initializer_list|.
+std::string CaptureTypeWithPatternOptional(
+ const FieldType& type,
+ std::initializer_list<std::string_view> pattern_span_initializer_list);
+
// Normalizes and rewrites `text` using the rules for `country_code`.
// If `country_code` is empty, it defaults to US.
std::u16string NormalizeAndRewrite(const AddressCountryCode& country_code,
diff --git a/components/autofill/core/browser/data_model/autofill_structured_address_utils_unittest.cc b/components/autofill/core/browser/data_model/autofill_structured_address_utils_unittest.cc
index 131156e9..b380b4dbc 100644
--- a/components/autofill/core/browser/data_model/autofill_structured_address_utils_unittest.cc
+++ b/components/autofill/core/browser/data_model/autofill_structured_address_utils_unittest.cc
@@ -208,6 +208,12 @@
EXPECT_EQ("(?i:abs\\w(?:,|\\s+|$)+)", NoCapturePattern("abs\\w"));
EXPECT_EQ("(?i:abs\\w(?:_)+)",
NoCapturePattern("abs\\w", {.separator = "_"}));
+ EXPECT_EQ(
+ NoCapturePattern("abs\\w", {.quantifier = MatchQuantifier::kOptional}),
+ NoCapturePatternOptional("abs\\w"));
+ EXPECT_EQ(CaptureTypeWithPattern(NAME_FULL, "abs\\w",
+ {.quantifier = MatchQuantifier::kOptional}),
+ CaptureTypeWithPatternOptional(NAME_FULL, "abs\\w"));
}
TEST(AutofillStructuredAddressUtils, TokenizeValue) {