| // Copyright 2013 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "components/autofill/core/browser/address_field.h" |
| |
| #include <stddef.h> |
| |
| #include <memory> |
| #include <utility> |
| |
| #include "base/logging.h" |
| #include "base/strings/string16.h" |
| #include "base/strings/string_util.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "components/autofill/core/browser/autofill_field.h" |
| #include "components/autofill/core/browser/autofill_scanner.h" |
| #include "components/autofill/core/browser/field_types.h" |
| #include "components/autofill/core/common/autofill_regex_constants.h" |
| |
| using base::UTF8ToUTF16; |
| |
| namespace autofill { |
| |
| namespace { |
| |
| bool SetFieldAndAdvanceCursor(AutofillScanner* scanner, AutofillField** field) { |
| *field = scanner->Cursor(); |
| scanner->Advance(); |
| return true; |
| } |
| |
| } // namespace |
| |
| // Some sites use type="tel" for zip fields (to get a numerical input). |
| // http://crbug.com/426958 |
| const int AddressField::kZipCodeMatchType = |
| MATCH_DEFAULT | MATCH_TELEPHONE | MATCH_NUMBER; |
| |
| // Select fields are allowed here. This occurs on top-100 site rediff.com. |
| const int AddressField::kCityMatchType = MATCH_DEFAULT | MATCH_SELECT; |
| |
| const int AddressField::kStateMatchType = MATCH_DEFAULT | MATCH_SELECT; |
| |
| std::unique_ptr<FormField> AddressField::Parse(AutofillScanner* scanner) { |
| if (scanner->IsEnd()) |
| return nullptr; |
| |
| std::unique_ptr<AddressField> address_field(new AddressField); |
| const AutofillField* const initial_field = scanner->Cursor(); |
| size_t saved_cursor = scanner->SaveCursor(); |
| |
| base::string16 attention_ignored = UTF8ToUTF16(kAttentionIgnoredRe); |
| base::string16 region_ignored = UTF8ToUTF16(kRegionIgnoredRe); |
| |
| // Allow address fields to appear in any order. |
| size_t begin_trailing_non_labeled_fields = 0; |
| bool has_trailing_non_labeled_fields = false; |
| while (!scanner->IsEnd()) { |
| const size_t cursor = scanner->SaveCursor(); |
| // Ignore "Address Lookup" field. http://crbug.com/427622 |
| if (ParseField(scanner, base::UTF8ToUTF16(kAddressLookupRe), nullptr) || |
| ParseField(scanner, base::UTF8ToUTF16(kAddressNameIgnoredRe), |
| nullptr)) { |
| continue; |
| // Ignore email addresses. |
| } else if (ParseFieldSpecifics(scanner, base::UTF8ToUTF16(kEmailRe), |
| MATCH_DEFAULT | MATCH_TEXT_AREA, nullptr)) { |
| continue; |
| } else if (address_field->ParseAddressLines(scanner) || |
| address_field->ParseCityStateZipCode(scanner) || |
| address_field->ParseCountry(scanner) || |
| address_field->ParseCompany(scanner)) { |
| has_trailing_non_labeled_fields = false; |
| continue; |
| } else if (ParseField(scanner, attention_ignored, nullptr) || |
| ParseField(scanner, region_ignored, nullptr)) { |
| // We ignore the following: |
| // * Attention. |
| // * Province/Region/Other. |
| continue; |
| } else if (scanner->Cursor() != initial_field && |
| ParseEmptyLabel(scanner, nullptr)) { |
| // Ignore non-labeled fields within an address; the page |
| // MapQuest Driving Directions North America.html contains such a field. |
| // We only ignore such fields after we've parsed at least one other field; |
| // otherwise we'd effectively parse address fields before other field |
| // types after any non-labeled fields, and we want email address fields to |
| // have precedence since some pages contain fields labeled |
| // "Email address". |
| if (!has_trailing_non_labeled_fields) { |
| has_trailing_non_labeled_fields = true; |
| begin_trailing_non_labeled_fields = cursor; |
| } |
| |
| continue; |
| } else { |
| // No field found. |
| break; |
| } |
| } |
| |
| // If we have identified any address fields in this field then it should be |
| // added to the list of fields. |
| if (address_field->company_ || address_field->address1_ || |
| address_field->address2_ || address_field->address3_ || |
| address_field->street_address_ || address_field->city_ || |
| address_field->state_ || address_field->zip_ || address_field->zip4_ || |
| address_field->country_) { |
| // Don't slurp non-labeled fields at the end into the address. |
| if (has_trailing_non_labeled_fields) |
| scanner->RewindTo(begin_trailing_non_labeled_fields); |
| return std::move(address_field); |
| } |
| |
| scanner->RewindTo(saved_cursor); |
| return nullptr; |
| } |
| |
| AddressField::AddressField() |
| : company_(nullptr), |
| address1_(nullptr), |
| address2_(nullptr), |
| address3_(nullptr), |
| street_address_(nullptr), |
| city_(nullptr), |
| state_(nullptr), |
| zip_(nullptr), |
| zip4_(nullptr), |
| country_(nullptr) {} |
| |
| void AddressField::AddClassifications( |
| FieldCandidatesMap* field_candidates) const { |
| // The page can request the address lines as a single textarea input or as |
| // multiple text fields (or not at all), but it shouldn't be possible to |
| // request both. |
| DCHECK(!(address1_ && street_address_)); |
| DCHECK(!(address2_ && street_address_)); |
| DCHECK(!(address3_ && street_address_)); |
| |
| AddClassification(company_, COMPANY_NAME, kBaseAddressParserScore, |
| field_candidates); |
| AddClassification(address1_, ADDRESS_HOME_LINE1, kBaseAddressParserScore, |
| field_candidates); |
| AddClassification(address2_, ADDRESS_HOME_LINE2, kBaseAddressParserScore, |
| field_candidates); |
| AddClassification(address3_, ADDRESS_HOME_LINE3, kBaseAddressParserScore, |
| field_candidates); |
| AddClassification(street_address_, ADDRESS_HOME_STREET_ADDRESS, |
| kBaseAddressParserScore, field_candidates); |
| AddClassification(city_, ADDRESS_HOME_CITY, kBaseAddressParserScore, |
| field_candidates); |
| AddClassification(state_, ADDRESS_HOME_STATE, kBaseAddressParserScore, |
| field_candidates); |
| AddClassification(zip_, ADDRESS_HOME_ZIP, kBaseAddressParserScore, |
| field_candidates); |
| AddClassification(country_, ADDRESS_HOME_COUNTRY, kBaseAddressParserScore, |
| field_candidates); |
| } |
| |
| bool AddressField::ParseCompany(AutofillScanner* scanner) { |
| if (company_) |
| return false; |
| |
| return ParseField(scanner, UTF8ToUTF16(kCompanyRe), &company_); |
| } |
| |
| bool AddressField::ParseAddressLines(AutofillScanner* scanner) { |
| // We only match the string "address" in page text, not in element names, |
| // because sometimes every element in a group of address fields will have |
| // a name containing the string "address"; for example, on the page |
| // Kohl's - Register Billing Address.html the text element labeled "city" |
| // has the name "BILL_TO_ADDRESS<>city". We do match address labels |
| // such as "address1", which appear as element names on various pages (eg |
| // AmericanGirl-Registration.html, BloomingdalesBilling.html, |
| // EBay Registration Enter Information.html). |
| if (address1_ || street_address_) |
| return false; |
| |
| base::string16 pattern = UTF8ToUTF16(kAddressLine1Re); |
| base::string16 label_pattern = UTF8ToUTF16(kAddressLine1LabelRe); |
| if (!ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT, &address1_) && |
| !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT, |
| &address1_) && |
| !ParseFieldSpecifics(scanner, pattern, MATCH_DEFAULT | MATCH_TEXT_AREA, |
| &street_address_) && |
| !ParseFieldSpecifics(scanner, label_pattern, |
| MATCH_LABEL | MATCH_TEXT_AREA, |
| &street_address_)) |
| return false; |
| |
| if (street_address_) |
| return true; |
| |
| // This code may not pick up pages that have an address field consisting of a |
| // sequence of unlabeled address fields. If we need to add this, see |
| // discussion on https://codereview.chromium.org/741493003/ |
| pattern = UTF8ToUTF16(kAddressLine2Re); |
| label_pattern = UTF8ToUTF16(kAddressLine2LabelRe); |
| if (!ParseField(scanner, pattern, &address2_) && |
| !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT, |
| &address2_)) |
| return true; |
| |
| // Optionally parse address line 3. This uses the same label regexp as |
| // address 2 above. |
| pattern = UTF8ToUTF16(kAddressLinesExtraRe); |
| if (!ParseField(scanner, pattern, &address3_) && |
| !ParseFieldSpecifics(scanner, label_pattern, MATCH_LABEL | MATCH_TEXT, |
| &address3_)) |
| return true; |
| |
| // Try for surplus lines, which we will promptly discard. Some pages have 4 |
| // address lines (e.g. uk/ShoesDirect2.html)! |
| // |
| // Since these are rare, don't bother considering unlabeled lines as extra |
| // address lines. |
| pattern = UTF8ToUTF16(kAddressLinesExtraRe); |
| while (ParseField(scanner, pattern, nullptr)) { |
| // Consumed a surplus line, try for another. |
| } |
| return true; |
| } |
| |
| bool AddressField::ParseCountry(AutofillScanner* scanner) { |
| if (country_) |
| return false; |
| |
| scanner->SaveCursor(); |
| if (ParseFieldSpecifics(scanner, |
| UTF8ToUTF16(kCountryRe), |
| MATCH_DEFAULT | MATCH_SELECT, |
| &country_)) { |
| return true; |
| } |
| |
| // The occasional page (e.g. google account registration page) calls this a |
| // "location". However, this only makes sense for select tags. |
| scanner->Rewind(); |
| return ParseFieldSpecifics(scanner, |
| UTF8ToUTF16(kCountryLocationRe), |
| MATCH_LABEL | MATCH_NAME | MATCH_SELECT, |
| &country_); |
| } |
| |
| bool AddressField::ParseZipCode(AutofillScanner* scanner) { |
| if (zip_) |
| return false; |
| |
| if (!ParseFieldSpecifics(scanner, |
| UTF8ToUTF16(kZipCodeRe), |
| kZipCodeMatchType, |
| &zip_)) { |
| return false; |
| } |
| |
| // Look for a zip+4, whose field name will also often contain |
| // the substring "zip". |
| ParseFieldSpecifics(scanner, UTF8ToUTF16(kZip4Re), kZipCodeMatchType, &zip4_); |
| return true; |
| } |
| |
| bool AddressField::ParseCity(AutofillScanner* scanner) { |
| if (city_) |
| return false; |
| |
| return ParseFieldSpecifics(scanner, |
| UTF8ToUTF16(kCityRe), |
| kCityMatchType, |
| &city_); |
| } |
| |
| bool AddressField::ParseState(AutofillScanner* scanner) { |
| if (state_) |
| return false; |
| |
| return ParseFieldSpecifics(scanner, |
| UTF8ToUTF16(kStateRe), |
| kStateMatchType, |
| &state_); |
| } |
| |
| AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelSeparately( |
| AutofillScanner* scanner, |
| const base::string16& pattern, |
| int match_type, |
| AutofillField** match) { |
| if (scanner->IsEnd()) |
| return RESULT_MATCH_NONE; |
| |
| AutofillField* cur_match = nullptr; |
| size_t saved_cursor = scanner->SaveCursor(); |
| bool parsed_name = ParseFieldSpecifics(scanner, |
| pattern, |
| match_type & ~MATCH_LABEL, |
| &cur_match); |
| scanner->RewindTo(saved_cursor); |
| bool parsed_label = ParseFieldSpecifics(scanner, |
| pattern, |
| match_type & ~MATCH_NAME, |
| &cur_match); |
| if (parsed_name && parsed_label) { |
| if (match) |
| *match = cur_match; |
| return RESULT_MATCH_NAME_LABEL; |
| } |
| |
| scanner->RewindTo(saved_cursor); |
| if (parsed_name) |
| return RESULT_MATCH_NAME; |
| if (parsed_label) |
| return RESULT_MATCH_LABEL; |
| return RESULT_MATCH_NONE; |
| } |
| |
| bool AddressField::ParseCityStateZipCode(AutofillScanner* scanner) { |
| // Simple cases. |
| if (scanner->IsEnd()) |
| return false; |
| if (city_ && state_ && zip_) |
| return false; |
| if (state_ && zip_) |
| return ParseCity(scanner); |
| if (city_ && zip_) |
| return ParseState(scanner); |
| if (city_ && state_) |
| return ParseZipCode(scanner); |
| |
| // Check for matches to both name and label. |
| ParseNameLabelResult city_result = ParseNameAndLabelForCity(scanner); |
| if (city_result == RESULT_MATCH_NAME_LABEL) |
| return true; |
| ParseNameLabelResult state_result = ParseNameAndLabelForState(scanner); |
| if (state_result == RESULT_MATCH_NAME_LABEL) |
| return true; |
| ParseNameLabelResult zip_result = ParseNameAndLabelForZipCode(scanner); |
| if (zip_result == RESULT_MATCH_NAME_LABEL) |
| return true; |
| |
| // Check if there is only one potential match. |
| bool maybe_city = city_result != RESULT_MATCH_NONE; |
| bool maybe_state = state_result != RESULT_MATCH_NONE; |
| bool maybe_zip = zip_result != RESULT_MATCH_NONE; |
| if (maybe_city && !maybe_state && !maybe_zip) |
| return SetFieldAndAdvanceCursor(scanner, &city_); |
| if (maybe_state && !maybe_city && !maybe_zip) |
| return SetFieldAndAdvanceCursor(scanner, &state_); |
| if (maybe_zip && !maybe_city && !maybe_state) |
| return ParseZipCode(scanner); |
| |
| // Otherwise give name priority over label. |
| if (city_result == RESULT_MATCH_NAME) |
| return SetFieldAndAdvanceCursor(scanner, &city_); |
| if (state_result == RESULT_MATCH_NAME) |
| return SetFieldAndAdvanceCursor(scanner, &state_); |
| if (zip_result == RESULT_MATCH_NAME) |
| return ParseZipCode(scanner); |
| |
| if (city_result == RESULT_MATCH_LABEL) |
| return SetFieldAndAdvanceCursor(scanner, &city_); |
| if (state_result == RESULT_MATCH_LABEL) |
| return SetFieldAndAdvanceCursor(scanner, &state_); |
| if (zip_result == RESULT_MATCH_LABEL) |
| return ParseZipCode(scanner); |
| |
| return false; |
| } |
| |
| AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForZipCode( |
| AutofillScanner* scanner) { |
| if (zip_) |
| return RESULT_MATCH_NONE; |
| |
| ParseNameLabelResult result = ParseNameAndLabelSeparately( |
| scanner, UTF8ToUTF16(kZipCodeRe), kZipCodeMatchType, &zip_); |
| |
| if (result != RESULT_MATCH_NAME_LABEL || scanner->IsEnd()) |
| return result; |
| |
| size_t saved_cursor = scanner->SaveCursor(); |
| bool found_non_zip4 = ParseCity(scanner); |
| if (found_non_zip4) |
| city_ = nullptr; |
| scanner->RewindTo(saved_cursor); |
| if (!found_non_zip4) { |
| found_non_zip4 = ParseState(scanner); |
| if (found_non_zip4) |
| state_ = nullptr; |
| scanner->RewindTo(saved_cursor); |
| } |
| |
| if (!found_non_zip4) { |
| // Look for a zip+4, whose field name will also often contain |
| // the substring "zip". |
| ParseFieldSpecifics(scanner, |
| UTF8ToUTF16(kZip4Re), |
| kZipCodeMatchType, |
| &zip4_); |
| } |
| return result; |
| } |
| |
| AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForCity( |
| AutofillScanner* scanner) { |
| if (city_) |
| return RESULT_MATCH_NONE; |
| |
| return ParseNameAndLabelSeparately( |
| scanner, UTF8ToUTF16(kCityRe), kCityMatchType, &city_); |
| } |
| |
| AddressField::ParseNameLabelResult AddressField::ParseNameAndLabelForState( |
| AutofillScanner* scanner) { |
| if (state_) |
| return RESULT_MATCH_NONE; |
| |
| return ParseNameAndLabelSeparately( |
| scanner, UTF8ToUTF16(kStateRe), kStateMatchType, &state_); |
| } |
| |
| } // namespace autofill |