components/autofill/core/browser/autofill_profile_comparator.cc - chromium/src - Git at Google

 // Copyright 2016 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "components/autofill/core/browser/autofill_profile_comparator.h"

 #include <algorithm>
 #include <vector>

 #include "base/i18n/case_conversion.h"
 #include "base/i18n/char_iterator.h"
 #include "base/i18n/unicodestring.h"
 #include "base/strings/string_split.h"
 #include "base/strings/string_util.h"
 #include "base/strings/utf_string_conversion_utils.h"
 #include "base/strings/utf_string_conversions.h"
 #include "components/autofill/core/browser/address_rewriter.h"
 #include "components/autofill/core/browser/autofill_country.h"
 #include "components/autofill/core/browser/autofill_data_util.h"
 #include "components/autofill/core/browser/state_names.h"
 #include "third_party/libphonenumber/phonenumber_api.h"

 using i18n::phonenumbers::PhoneNumberUtil;
 using base::UTF16ToUTF8;
 using base::UTF8ToUTF16;

 namespace autofill {
 namespace {

 // A NUL-terminated UTF-16 string holding a single space. Used in this file as
 // the delimiter when splitting text into tokens and as the separator when
 // joining name parts back together.
 const base::char16 kSpace[] = {L' ', L'\0'};

 // Returns true if |text| contains at least one line-feed ('\n') character.
 bool ContainsNewline(base::StringPiece16 text) {
   const bool has_line_feed = text.find('\n') != base::StringPiece16::npos;
   return has_line_feed;
 }

 // Debug-logging helper that streams the fields of a libphonenumber
 // PhoneNumber to |os|. The country code and national number are always
 // written; each optional field is written only when it is present.
 std::ostream& operator<<(std::ostream& os,
                          const ::i18n::phonenumbers::PhoneNumber& n) {
   os << "country_code: " << n.country_code() << " ";
   os << "national_number: " << n.national_number();

   if (n.has_extension()) {
     os << " extension: \"" << n.extension() << "\"";
   }

   if (n.has_italian_leading_zero()) {
     os << " italian_leading_zero: " << n.italian_leading_zero();
   }

   if (n.has_number_of_leading_zeros()) {
     os << " number_of_leading_zeros: " << n.number_of_leading_zeros();
   }

   if (n.has_raw_input()) {
     os << " raw_input: \"" << n.raw_input() << "\"";
   }

   return os;
 }

 }  // namespace

 // Stores a copy of |app_locale| and tries to create the ICU transliterator
 // used by NormalizeForComparison(). A creation failure is logged but is not
 // fatal; whatever createInstance() returned (possibly null) is kept.
 AutofillProfileComparator::AutofillProfileComparator(
     const base::StringPiece& app_locale)
     : app_locale_(app_locale.data(), app_locale.size()) {
   // The rule set decomposes the text (NFD), removes non-spacing marks
   // (diacritics), lower-cases it and recomposes it (NFC).
   // See http://userguide.icu-project.org/transforms/general
   UErrorCode icu_status = U_ZERO_ERROR;
   std::unique_ptr<icu::Transliterator> created(
       icu::Transliterator::createInstance(
           "NFD; [:Nonspacing Mark:] Remove; Lower; NFC", UTRANS_FORWARD,
           icu_status));

   const bool creation_failed = U_FAILURE(icu_status) || created == nullptr;
   if (creation_failed) {
     // TODO(rogerm): Add a histogram to count how often this happens.
     LOG(ERROR) << "Failed to create ICU Transliterator: "
                << u_errorName(icu_status);
   }

   transliterator_ = std::move(created);
 }

 // Nothing to release explicitly; members clean up after themselves.
 AutofillProfileComparator::~AutofillProfileComparator() = default;

 // Returns a canonical form of |text| suitable for equality/token comparison.
 //
 // This is intentionally a simple, domain-agnostic canonicalization that
 // covers the common cases for both names and addresses:
 //
 // 1. Punctuation, control characters and all kinds of whitespace are treated
 //    as separators. With RETAIN_WHITESPACE each run of separators becomes a
 //    single space ("Mid-Island Plz." -> "Mid Island Plz"); with any other
 //    |whitespace_spec| separators are dropped entirely.
 //
 // 2. Leading and trailing separators never make it into the result.
 //
 // 3. If a transliterator is available, diacritics (non-spacing marks) are
 //    removed and the text is folded to lower case.
 base::string16 AutofillProfileComparator::NormalizeForComparison(
     base::StringPiece16 text,
     AutofillProfileComparator::WhitespaceSpec whitespace_spec) const {
   const bool keep_separators = (whitespace_spec == RETAIN_WHITESPACE);

   base::string16 normalized;
   normalized.reserve(text.length());

   // When separators are kept, pretend the text was preceded by one so that
   // leading separators are swallowed instead of emitting a leading space.
   bool last_was_separator = keep_separators;

   for (base::i18n::UTF16CharIterator it(text.data(), text.length());
        !it.end(); it.Advance()) {
     // Classify the current code point.
     bool is_separator = false;
     switch (u_charType(it.get())) {
       // Punctuation
       case U_DASH_PUNCTUATION:
       case U_START_PUNCTUATION:
       case U_END_PUNCTUATION:
       case U_CONNECTOR_PUNCTUATION:
       case U_OTHER_PUNCTUATION:
       // Whitespace
       case U_CONTROL_CHAR:  // Covers the '\n' character.
       case U_SPACE_SEPARATOR:
       case U_LINE_SEPARATOR:
       case U_PARAGRAPH_SEPARATOR:
         is_separator = true;
         break;

       default:
         break;
     }

     if (!is_separator) {
       last_was_separator = false;
       base::WriteUnicodeCharacter(it.get(), &normalized);
       continue;
     }

     // Emit at most one space per run of separators, and only when the caller
     // asked for whitespace to be retained.
     if (keep_separators && !last_was_separator) {
       normalized.push_back(' ');
       last_was_separator = true;
     }
   }

   // Drop the single trailing space that a final run of separators left.
   if (last_was_separator && !normalized.empty())
     normalized.resize(normalized.size() - 1);

   // Without a transliterator the diacritic/case folding step is skipped.
   if (transliterator_ == nullptr)
     return normalized;

   icu::UnicodeString icu_text(normalized.data(), normalized.length());
   transliterator_->transliterate(icu_text);
   return base::i18n::UnicodeStringToString16(icu_text);
 }

 // Returns true if every comparable part of |p1| and |p2| (email address,
 // company name, phone number, name and address) is mergeable. The first
 // non-mergeable part short-circuits the comparison and is reported via DVLOG.
 bool AutofillProfileComparator::AreMergeable(const AutofillProfile& p1,
                                              const AutofillProfile& p2) const {
   // The checks are ordered by their relative expense so that the comparison
   // fails as early and as cheaply as possible.
   DVLOG(1) << "Comparing profiles:\np1 = " << p1 << "\np2 = " << p2;

   if (!HaveMergeableEmailAddresses(p1, p2)) {
     DVLOG(1) << "Different email addresses.";
     return false;
   }

   if (!HaveMergeableCompanyNames(p1, p2)) {
     DVLOG(1) << "Different company names.";
     return false;
   }

   if (!HaveMergeablePhoneNumbers(p1, p2)) {
     DVLOG(1) << "Different phone numbers.";
     return false;
   }

   if (!HaveMergeableNames(p1, p2)) {
     DVLOG(1) << "Different names.";
     return false;
   }

   if (!HaveMergeableAddresses(p1, p2)) {
     DVLOG(1) << "Different addresses.";
     return false;
   }

   DVLOG(1) << "Profiles are mergeable.";
   return true;
 }

 bool AutofillProfileComparator::MergeNames(const AutofillProfile& p1,
                                            const AutofillProfile& p2,
                                            NameInfo* name_info) const {
   DCHECK(HaveMergeableNames(p1, p2));

   const AutofillType kFullName(NAME_FULL);
   const base::string16& full_name_1 = p1.GetInfo(kFullName, app_locale_);
   const base::string16& full_name_2 = p2.GetInfo(kFullName, app_locale_);

   const base::string16& normalized_full_name_1 =
       NormalizeForComparison(full_name_1);
   const base::string16& normalized_full_name_2 =
       NormalizeForComparison(full_name_2);

   const base::string16* best_name = nullptr;
   if (normalized_full_name_1.empty()) {
     // p1 has no name, so use the name from p2.
     best_name = &full_name_2;
   } else if (normalized_full_name_2.empty()) {
     // p2 has no name, so use the name from p1.
     best_name = &full_name_1;
   } else if (data_util::IsCJKName(full_name_1) &&
              data_util::IsCJKName(full_name_2)) {
     // Use a separate logic for CJK names.
     return MergeCJKNames(p1, p2, name_info);
   } else if (IsNameVariantOf(normalized_full_name_1, normalized_full_name_2)) {
     // full_name_2 is a variant of full_name_1.
     best_name = &full_name_1;
   } else {
     // If the assertion that p1 and p2 have mergeable names is true, then
     // full_name_1 must be a name variant of full_name_2;
     best_name = &full_name_2;
   }

   name_info->SetInfo(AutofillType(NAME_FULL), *best_name, app_locale_);
   return true;
 }

 // Merges the (CJK) names of |p1| and |p2| into |info|. Both full names are
 // expected to be CJK names (DCHECKed). Always returns true.
 bool AutofillProfileComparator::MergeCJKNames(
     const AutofillProfile& p1,
     const AutofillProfile& p2,
     NameInfo* info) const {
   DCHECK(data_util::IsCJKName(p1.GetInfo(NAME_FULL, app_locale_)));
   DCHECK(data_util::IsCJKName(p2.GetInfo(NAME_FULL, app_locale_)));

   // Raw (as stored) name fields of one profile.
   struct Name {
     base::string16 given;
     base::string16 surname;
     base::string16 full;
   };

   // True if |name| carries both a given name and a surname.
   const auto has_both_parts = [](const Name& name) {
     return !name.given.empty() && !name.surname.empty();
   };

   const Name name1 = {
     p1.GetRawInfo(NAME_FIRST),
     p1.GetRawInfo(NAME_LAST),
     p1.GetRawInfo(NAME_FULL)
   };
   const Name name2 = {
     p2.GetRawInfo(NAME_FIRST),
     p2.GetRawInfo(NAME_LAST),
     p2.GetRawInfo(NAME_FULL)
   };

   // Ties on the use date go to p2.
   const Name* const most_recent_name =
       p2.use_date() >= p1.use_date() ? &name2 : &name1;

   // The profiles may disagree on the full name. If only one of them has an
   // explicit full name, that one is the ground truth; otherwise the most
   // recently used profile wins.
   const Name* const full_name_candidate =
       name1.full.empty() ? &name2
                          : (name2.full.empty() ? &name1 : most_recent_name);

   // The profiles may also disagree on the given/surname split. If only one
   // of them has an explicit given/surname pair, that one is the ground truth;
   // otherwise the most recently used profile wins.
   const Name* const name_parts_candidate =
       !has_both_parts(name1)
           ? &name2
           : (!has_both_parts(name2) ? &name1 : most_recent_name);

   if (!has_both_parts(*name_parts_candidate)) {
     // There is no usable given/surname split, so set the full name through
     // SetInfo() and let it derive the parts (the logic from |SplitName()|).
     info->SetInfo(AutofillType(NAME_FULL), full_name_candidate->full,
                   app_locale_);
     return true;
   }

   // A given/surname split already exists, so keep it intact and only store
   // the full name if there is one.
   if (!full_name_candidate->full.empty()) {
     info->SetRawInfo(NAME_FULL, full_name_candidate->full);
   }
   info->SetRawInfo(NAME_FIRST, name_parts_candidate->given);
   info->SetRawInfo(NAME_LAST, name_parts_candidate->surname);
   return true;
 }

 // Returns true if |full_name_2| equals one of the variants that can be
 // generated from |full_name_1|. Variants are built by combining every variant
 // of the given name and of the middle name of |full_name_1| with its
 // (unmodified) family name, plus the "initials + family name" form.
 bool AutofillProfileComparator::IsNameVariantOf(
     const base::string16& full_name_1,
     const base::string16& full_name_2) const {
   const data_util::NameParts parts = data_util::SplitName(full_name_1);
   const base::StringPiece16 family = parts.family;

   // Variants of the given and middle names of full_name_1.
   //
   // TODO(rogerm): Figure out whether or not we should break apart a compound
   // family name into variants (crbug/619051)
   const std::set<base::string16> given_variants =
       GetNamePartVariants(parts.given);
   const std::set<base::string16> middle_variants =
       GetNamePartVariants(parts.middle);

   // Try every given/middle combination of full_name_1 against full_name_2.
   for (const base::string16& given : given_variants) {
     for (const base::string16& middle : middle_variants) {
       const base::string16 joined =
           base::JoinString({given, middle, family}, kSpace);
       if (base::CollapseWhitespace(joined, true) == full_name_2)
         return true;
     }
   }

   // The remaining form needs both a given and a middle name: the two initials
   // written together, followed by the family name. For example,
   // "thomas jefferson miller" could be written as "tj miller".
   if (parts.given.empty() || parts.middle.empty())
     return false;

   base::string16 initials(1, parts.given[0]);
   initials.push_back(parts.middle[0]);
   const base::string16 joined = base::JoinString({initials, family}, kSpace);
   return base::CollapseWhitespace(joined, true) == full_name_2;
 }

 bool AutofillProfileComparator::MergeEmailAddresses(
     const AutofillProfile& p1,
     const AutofillProfile& p2,
     EmailInfo* email_info) const {
   DCHECK(HaveMergeableEmailAddresses(p1, p2));

   const AutofillType kEmailAddress(EMAIL_ADDRESS);
   const base::string16& e1 = p1.GetInfo(kEmailAddress, app_locale_);
   const base::string16& e2 = p2.GetInfo(kEmailAddress, app_locale_);
   const base::string16* best = nullptr;

   if (e1.empty()) {
     best = &e2;
   } else if (e2.empty()) {
     best = &e1;
   } else {
     best = p2.use_date() > p1.use_date() ? &e2 : &e1;
   }

   email_info->SetInfo(kEmailAddress, *best, app_locale_);
   return true;
 }

 bool AutofillProfileComparator::MergeCompanyNames(
     const AutofillProfile& p1,
     const AutofillProfile& p2,
     CompanyInfo* company_info) const {
   const AutofillType kCompanyName(COMPANY_NAME);
   const base::string16& c1 = p1.GetInfo(kCompanyName, app_locale_);
   const base::string16& c2 = p2.GetInfo(kCompanyName, app_locale_);
   const base::string16* best = nullptr;

   DCHECK(HaveMergeableCompanyNames(p1, p2))
       << "Company names are not mergeable: '" << c1 << "' vs '" << c2 << "'";

   CompareTokensResult result =
       CompareTokens(NormalizeForComparison(c1), NormalizeForComparison(c2));
   switch (result) {
     case DIFFERENT_TOKENS:
     default:
       NOTREACHED() << "Unexpected mismatch: '" << c1 << "' vs '" << c2 << "'";
       return false;
     case S1_CONTAINS_S2:
       best = &c1;
       break;
     case S2_CONTAINS_S1:
       best = &c2;
       break;
     case SAME_TOKENS:
       best = p2.use_date() > p1.use_date() ? &c2 : &c1;
       break;
   }

   company_info->SetInfo(kCompanyName, *best, app_locale_);
   return true;
 }

 bool AutofillProfileComparator::MergePhoneNumbers(
     const AutofillProfile& p1,
     const AutofillProfile& p2,
     PhoneNumber* phone_number) const {
   const ServerFieldType kWholePhoneNumber = PHONE_HOME_WHOLE_NUMBER;
   const base::string16& s1 = p1.GetRawInfo(kWholePhoneNumber);
   const base::string16& s2 = p2.GetRawInfo(kWholePhoneNumber);

   DCHECK(HaveMergeablePhoneNumbers(p1, p2))
       << "Phone numbers are not mergeable: '" << s1 << "' vs '" << s2 << "'";

   if (s1.empty()) {
     phone_number->SetRawInfo(kWholePhoneNumber, s2);
     return true;
   }

   if (s2.empty() || s1 == s2) {
     phone_number->SetRawInfo(kWholePhoneNumber, s1);
     return true;
   }

   // Figure out a country code hint.
   const AutofillType kCountryCode(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE);
   std::string region = UTF16ToUTF8(GetNonEmptyOf(p1, p2, kCountryCode));
   if (region.empty())
     region = AutofillCountry::CountryCodeForLocale(app_locale_);

   // Parse the phone numbers.
   PhoneNumberUtil* phone_util = PhoneNumberUtil::GetInstance();

   ::i18n::phonenumbers::PhoneNumber n1;
   if (phone_util->ParseAndKeepRawInput(UTF16ToUTF8(s1), region, &n1) !=
       PhoneNumberUtil::NO_PARSING_ERROR) {
     return false;
   }

   ::i18n::phonenumbers::PhoneNumber n2;
   if (phone_util->ParseAndKeepRawInput(UTF16ToUTF8(s2), region, &n2) !=
       PhoneNumberUtil::NO_PARSING_ERROR) {
     return false;
   }

   ::i18n::phonenumbers::PhoneNumber merged_number;
   DCHECK_EQ(n1.country_code(), n2.country_code());
   merged_number.set_country_code(n1.country_code());
   merged_number.set_national_number(
       std::max(n1.national_number(), n2.national_number()));
   if (n1.has_extension() && !n1.extension().empty()) {
     merged_number.set_extension(n1.extension());
   } else if (n2.has_extension() && !n2.extension().empty()) {
     merged_number.set_extension(n2.extension());
   }
   if (n1.has_italian_leading_zero() || n2.has_italian_leading_zero()) {
     merged_number.set_italian_leading_zero(n1.italian_leading_zero() ||
                                            n2.italian_leading_zero());
   }
   if (n1.has_number_of_leading_zeros() || n2.has_number_of_leading_zeros()) {
     merged_number.set_number_of_leading_zeros(
         std::max(n1.number_of_leading_zeros(), n2.number_of_leading_zeros()));
   }

   PhoneNumberUtil::PhoneNumberFormat format =
       region.empty() ? PhoneNumberUtil::NATIONAL
                      : PhoneNumberUtil::INTERNATIONAL;

   std::string new_number;
   phone_util->Format(merged_number, format, &new_number);

   DVLOG(2) << "n1 = {" << n1 << "}";
   DVLOG(2) << "n2 = {" << n2 << "}";
   DVLOG(2) << "merged_number = {" << merged_number << "}";
   DVLOG(2) << "new_number = \"" << new_number << "\"";

   // Check if it's a North American number that's missing the area code.
   // Libphonenumber doesn't know how to format short numbers; it will still
   // include the country code prefix.
   if (merged_number.country_code() == 1 &&
       merged_number.national_number() <= 9999999 &&
       base::StartsWith(new_number, "+1", base::CompareCase::SENSITIVE)) {
     size_t offset = 2;  // The char just after "+1".
     while (offset < new_number.size() &&
            base::IsAsciiWhitespace(new_number[offset])) {
       ++offset;
     }
     new_number = new_number.substr(offset);
   }

   phone_number->SetRawInfo(kWholePhoneNumber, UTF8ToUTF16(new_number));

   return true;
 }

 // Merges the address fields of |p1| and |p2| into |address|, one field at a
 // time: country, zip, state, city, dependent locality, sorting code and street
 // address. The addresses are expected to be mergeable (DCHECKed).
 //
 // For every field, if only one profile has a value, that value is used. When
 // both have a value, the tie-break depends on the field (see the comments
 // below). Returns false only if a token comparison unexpectedly reports that
 // two values are not mergeable; returns true otherwise.
 bool AutofillProfileComparator::MergeAddresses(const AutofillProfile& p1,
                                                const AutofillProfile& p2,
                                                Address* address) const {
   DCHECK(HaveMergeableAddresses(p1, p2));

   // One of the countries is empty or they are the same modulo case, so we just
   // have to find the non-empty one, if any.
   const AutofillType kCountryCode(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE);
   const base::string16& country_code =
       base::i18n::ToUpper(GetNonEmptyOf(p1, p2, kCountryCode));
   address->SetInfo(kCountryCode, country_code, app_locale_);

   // One of the zip codes is empty, they are the same, or one is a substring
   // of the other. We prefer the most recently used zip code.
   const AutofillType kZipCode(ADDRESS_HOME_ZIP);
   const base::string16& zip1 = p1.GetInfo(kZipCode, app_locale_);
   const base::string16& zip2 = p2.GetInfo(kZipCode, app_locale_);
   if (zip1.empty()) {
     address->SetInfo(kZipCode, zip2, app_locale_);
   } else if (zip2.empty()) {
     address->SetInfo(kZipCode, zip1, app_locale_);
   } else {
     address->SetInfo(kZipCode, (p2.use_date() > p1.use_date() ? zip2 : zip1),
                      app_locale_);
   }

   // One of the states is empty or one of the states has a subset of tokens from
   // the other. Pick the non-empty state that is shorter. This is usually the
   // abbreviated one.
   const AutofillType kState(ADDRESS_HOME_STATE);
   const base::string16& state1 = p1.GetInfo(kState, app_locale_);
   const base::string16& state2 = p2.GetInfo(kState, app_locale_);
   if (state1.empty()) {
     address->SetInfo(kState, state2, app_locale_);
   } else if (state2.empty()) {
     address->SetInfo(kState, state1, app_locale_);
   } else {
     address->SetInfo(kState, (state2.size() < state1.size() ? state2 : state1),
                      app_locale_);
   }

   // The rewriter for the merged country is shared by the city, dependent
   // locality and street address comparisons below.
   AddressRewriter rewriter = AddressRewriter::ForCountryCode(country_code);

   // One of the cities is empty or one of the cities has a subset of tokens from
   // the other. Pick the city name with more tokens; this is usually the most
   // explicit one.
   const AutofillType kCity(ADDRESS_HOME_CITY);
   const base::string16& city1 = p1.GetInfo(kCity, app_locale_);
   const base::string16& city2 = p2.GetInfo(kCity, app_locale_);
   if (city1.empty()) {
     address->SetInfo(kCity, city2, app_locale_);
   } else if (city2.empty()) {
     address->SetInfo(kCity, city1, app_locale_);
   } else {
     // Prefer the one with more tokens, making sure to apply address
     // normalization and rewriting before doing the comparison.
     CompareTokensResult result =
         CompareTokens(rewriter.Rewrite(NormalizeForComparison(city1)),
                       rewriter.Rewrite(NormalizeForComparison(city2)));
     switch (result) {
       case SAME_TOKENS:
         // They have the same set of unique tokens. Let's pick the more recently
         // used one.
         address->SetInfo(kCity, (p2.use_date() > p1.use_date() ? city2 : city1),
                          app_locale_);
         break;
       case S1_CONTAINS_S2:
         // city1 has more unique tokens than city2.
         address->SetInfo(kCity, city1, app_locale_);
         break;
       case S2_CONTAINS_S1:
         // city2 has more unique tokens than city1.
         address->SetInfo(kCity, city2, app_locale_);
         break;
       case DIFFERENT_TOKENS:
       default:
         // The cities aren't mergeable and we shouldn't be doing any of
         // this.
         NOTREACHED() << "Unexpected mismatch: '" << city1 << "' vs '" << city2
                      << "'";
         return false;
     }
   }

   // One of the dependent localities is empty or one of the localities has a
   // subset of tokens from the other. Pick the locality name with more tokens;
   // this is usually the most explicit one.
   const AutofillType kDependentLocality(ADDRESS_HOME_DEPENDENT_LOCALITY);
   const base::string16& locality1 = p1.GetInfo(kDependentLocality, app_locale_);
   const base::string16& locality2 = p2.GetInfo(kDependentLocality, app_locale_);
   if (locality1.empty()) {
     address->SetInfo(kDependentLocality, locality2, app_locale_);
   } else if (locality2.empty()) {
     address->SetInfo(kDependentLocality, locality1, app_locale_);
   } else {
     // Prefer the one with more tokens, making sure to apply address
     // normalization and rewriting before doing the comparison.
     CompareTokensResult result =
         CompareTokens(rewriter.Rewrite(NormalizeForComparison(locality1)),
                       rewriter.Rewrite(NormalizeForComparison(locality2)));
     switch (result) {
       case SAME_TOKENS:
         // They have the same set of unique tokens. Let's pick the more recently
         // used one.
         address->SetInfo(
             kDependentLocality,
             (p2.use_date() > p1.use_date() ? locality2 : locality1),
             app_locale_);
         break;
       case S1_CONTAINS_S2:
         // locality1 has more unique tokens than locality2.
         address->SetInfo(kDependentLocality, locality1, app_locale_);
         break;
       case S2_CONTAINS_S1:
         // locality2 has more unique tokens than locality1.
         address->SetInfo(kDependentLocality, locality2, app_locale_);
         break;
       case DIFFERENT_TOKENS:
       default:
         // The localities aren't mergeable and we shouldn't be doing any of
         // this.
         NOTREACHED() << "Unexpected mismatch: '" << locality1 << "' vs '"
                      << locality2 << "'";
         return false;
     }
   }

   // One of the sorting codes is empty, they are the same, or one is a substring
   // of the other. We prefer the most recently used sorting code.
   const AutofillType kSortingCode(ADDRESS_HOME_SORTING_CODE);
   const base::string16& sorting1 = p1.GetInfo(kSortingCode, app_locale_);
   const base::string16& sorting2 = p2.GetInfo(kSortingCode, app_locale_);
   if (sorting1.empty()) {
     address->SetInfo(kSortingCode, sorting2, app_locale_);
   } else if (sorting2.empty()) {
     address->SetInfo(kSortingCode, sorting1, app_locale_);
   } else {
     address->SetInfo(kSortingCode,
                      (p2.use_date() > p1.use_date() ? sorting2 : sorting1),
                      app_locale_);
   }

   // One of the addresses is empty or one of the addresses has a subset of
   // tokens from the other. Prefer the more verbosely expressed one.
   const AutofillType kStreetAddress(ADDRESS_HOME_STREET_ADDRESS);
   const base::string16& address1 = p1.GetInfo(kStreetAddress, app_locale_);
   const base::string16& address2 = p2.GetInfo(kStreetAddress, app_locale_);
   // If one of the addresses is empty then use the other.
   if (address1.empty()) {
     address->SetInfo(kStreetAddress, address2, app_locale_);
   } else if (address2.empty()) {
     address->SetInfo(kStreetAddress, address1, app_locale_);
   } else {
     // Prefer the multi-line address if one is multi-line and the other isn't.
     bool address1_multiline = ContainsNewline(address1);
     bool address2_multiline = ContainsNewline(address2);
     if (address1_multiline && !address2_multiline) {
       address->SetInfo(kStreetAddress, address1, app_locale_);
     } else if (address2_multiline && !address1_multiline) {
       address->SetInfo(kStreetAddress, address2, app_locale_);
     } else {
       // Prefer the one with more tokens if they're both single-line or both
       // multi-line addresses, making sure to apply address normalization and
       // rewriting before doing the comparison.
       CompareTokensResult result =
           CompareTokens(rewriter.Rewrite(NormalizeForComparison(address1)),
                         rewriter.Rewrite(NormalizeForComparison(address2)));
       switch (result) {
         case SAME_TOKENS:
           // They have the same set of unique tokens. Let's pick the more
           // recently used one.
           address->SetInfo(
               kStreetAddress,
               (p2.use_date() > p1.use_date() ? address2 : address1),
               app_locale_);
           break;
         case S1_CONTAINS_S2:
           // address1 has more unique tokens than address2.
           address->SetInfo(kStreetAddress, address1, app_locale_);
           break;
         case S2_CONTAINS_S1:
           // address2 has more unique tokens than address1.
           address->SetInfo(kStreetAddress, address2, app_locale_);
           break;
         case DIFFERENT_TOKENS:
         default:
           // The addresses aren't mergeable and we shouldn't be doing any of
           // this.
           NOTREACHED() << "Unexpected mismatch: '" << address1 << "' vs '"
                        << address2 << "'";
           return false;
       }
     }
   }
   return true;
 }

 // static
 // Splits |s| on spaces and returns the distinct non-empty tokens as an
 // ordered set. The returned pieces are views, not copies of the text.
 std::set<base::StringPiece16> AutofillProfileComparator::UniqueTokens(
     base::StringPiece16 s) {
   const std::vector<base::StringPiece16> pieces = base::SplitStringPiece(
       s, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

   // The set drops duplicates and keeps the tokens sorted.
   std::set<base::StringPiece16> unique_pieces(pieces.begin(), pieces.end());
   return unique_pieces;
 }

 // static
 // Compares the sets of unique tokens of |s1| and |s2| and reports whether
 // they are identical (SAME_TOKENS), whether one set contains the other
 // (S1_CONTAINS_S2 / S2_CONTAINS_S1), or neither (DIFFERENT_TOKENS).
 AutofillProfileComparator::CompareTokensResult
 AutofillProfileComparator::CompareTokens(base::StringPiece16 s1,
                                          base::StringPiece16 s2) {
   using TokenSet = std::set<base::StringPiece16>;

   // std::includes() requires both ranges to be sorted, which is why the tokens
   // are held in a std::set<> rather than a std::unordered_set<>.
   const TokenSet tokens_1 = UniqueTokens(s1);
   const TokenSet tokens_2 = UniqueTokens(s2);

   // True if every token of |needles| is also in |haystack|.
   const auto contains_all = [](const TokenSet& haystack,
                                const TokenSet& needles) {
     return std::includes(haystack.begin(), haystack.end(), needles.begin(),
                          needles.end());
   };

   if (contains_all(tokens_1, tokens_2)) {
     // A superset of the same size is the same set.
     if (tokens_1.size() == tokens_2.size())
       return SAME_TOKENS;
     return S1_CONTAINS_S2;
   }

   // s1 is missing a token of s2; the only relation left to check is the
   // reverse containment.
   return contains_all(tokens_2, tokens_1) ? S2_CONTAINS_S1 : DIFFERENT_TOKENS;
 }

 // Returns the value of type |t| from |p1| if it is non-empty; otherwise
 // returns the value from |p2|, which may itself be empty.
 base::string16 AutofillProfileComparator::GetNonEmptyOf(
     const AutofillProfile& p1,
     const AutofillProfile& p2,
     AutofillType t) const {
   base::string16 value = p1.GetInfo(t, app_locale_);
   if (value.empty())
     value = p2.GetInfo(t, app_locale_);
   return value;
 }

 // static
 // Returns the set of variants of |name_part|. |name_part| is split on spaces
 // into sub-names; a variant is built by, for each sub-name, either skipping
 // it, keeping it, or reducing it to its first character. The concatenation of
 // all first characters (without spaces) and the empty string are variants
 // too. If there are more than 8 sub-names, only |name_part| itself is
 // returned.
 std::set<base::string16> AutofillProfileComparator::GetNamePartVariants(
     const base::string16& name_part) {
   const size_t kMaxSupportedSubNames = 8;

   const std::vector<base::StringPiece16> pieces = base::SplitStringPiece(
       name_part, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

   // The number of variants grows with every sub-name, so cap the input size
   // to constrain memory usage.
   if (pieces.size() > kMaxSupportedSubNames)
     return std::set<base::string16>{name_part};

   // The empty string is the seed variant.
   std::set<base::string16> variants;
   variants.insert(base::string16());

   // Collected alongside: the first characters of all sub-names, in order.
   base::string16 all_initials;

   for (const base::StringPiece16& piece : pieces) {
     if (piece.empty())
       continue;

     const base::StringPiece16 initial = piece.substr(0, 1);
     all_initials.push_back(piece[0]);

     // Extend every variant known so far twice: once with the whole sub-name
     // and once with only its initial. The extensions are buffered because the
     // set must not be modified while it is being iterated; duplicates vanish
     // when the buffer is inserted into the set.
     std::vector<base::string16> extensions;
     for (const base::string16& existing : variants) {
       extensions.push_back(base::CollapseWhitespace(
           base::JoinString({existing, piece}, kSpace), true));
       extensions.push_back(base::CollapseWhitespace(
           base::JoinString({existing, initial}, kSpace), true));
     }
     variants.insert(extensions.begin(), extensions.end());
   }

   // As a common case, also offer all of the initials written together.
   variants.insert(all_initials);

   return variants;
 }

 bool AutofillProfileComparator::HaveMergeableNames(
     const AutofillProfile& p1,
     const AutofillProfile& p2) const {
   base::string16 full_name_1 =
       NormalizeForComparison(p1.GetInfo(NAME_FULL, app_locale_));
   base::string16 full_name_2 =
       NormalizeForComparison(p2.GetInfo(NAME_FULL, app_locale_));

   if (full_name_1.empty() || full_name_2.empty() ||
       full_name_1 == full_name_2) {
     return true;
   }

   if (data_util::IsCJKName(full_name_1) && data_util::IsCJKName(full_name_2)) {
     return HaveMergeableCJKNames(p1, p2);
   }

   // Is it reasonable to merge the names from p1 and p2.
   return IsNameVariantOf(full_name_1, full_name_2) ||
          IsNameVariantOf(full_name_2, full_name_1);
 }

 bool AutofillProfileComparator::HaveMergeableCJKNames(
     const AutofillProfile& p1,
     const AutofillProfile& p2) const {
   base::string16 name_1 = NormalizeForComparison(
       p1.GetInfo(NAME_FULL, app_locale_), DISCARD_WHITESPACE);
   base::string16 name_2 = NormalizeForComparison(
       p2.GetInfo(NAME_FULL, app_locale_), DISCARD_WHITESPACE);
   return name_1 == name_2;
 }

 bool AutofillProfileComparator::HaveMergeableEmailAddresses(
     const AutofillProfile& p1,
     const AutofillProfile& p2) const {
   const base::string16& email_1 = p1.GetInfo(EMAIL_ADDRESS, app_locale_);
   const base::string16& email_2 = p2.GetInfo(EMAIL_ADDRESS, app_locale_);
   return email_1.empty() || email_2.empty() ||
          case_insensitive_compare_.StringsEqual(email_1, email_2);
 }

 bool AutofillProfileComparator::HaveMergeableCompanyNames(
     const AutofillProfile& p1,
     const AutofillProfile& p2) const {
   const base::string16& company_name_1 =
       NormalizeForComparison(p1.GetInfo(COMPANY_NAME, app_locale_));
   const base::string16& company_name_2 =
       NormalizeForComparison(p2.GetInfo(COMPANY_NAME, app_locale_));
   return company_name_1.empty() || company_name_2.empty() ||
          CompareTokens(company_name_1, company_name_2) != DIFFERENT_TOKENS;
 }

 bool AutofillProfileComparator::HaveMergeablePhoneNumbers(
     const AutofillProfile& p1,
     const AutofillProfile& p2) const {
   // We work with the raw phone numbers to avoid losing any helpful information
   // as we parse.
   const base::string16& raw_phone_1 = p1.GetRawInfo(PHONE_HOME_WHOLE_NUMBER);
   const base::string16& raw_phone_2 = p2.GetRawInfo(PHONE_HOME_WHOLE_NUMBER);

   // Are the two phone numbers trivially mergeable?
   if (raw_phone_1.empty() || raw_phone_2.empty() ||
       raw_phone_1 == raw_phone_2) {
     return true;
   }

   // TODO(rogerm): Modify ::autofill::i18n::PhoneNumbersMatch to support
   // SHORT_NSN_MATCH and just call that instead of accessing the underlying
   // utility library directly?

   // The phone number util library needs the numbers in utf8.
   const std::string phone_1 = base::UTF16ToUTF8(raw_phone_1);
   const std::string phone_2 = base::UTF16ToUTF8(raw_phone_2);

   // Parse and compare the phone numbers.
   PhoneNumberUtil* phone_util = PhoneNumberUtil::GetInstance();
   switch (phone_util->IsNumberMatchWithTwoStrings(phone_1, phone_2)) {
     case PhoneNumberUtil::SHORT_NSN_MATCH:
     case PhoneNumberUtil::NSN_MATCH:
     case PhoneNumberUtil::EXACT_MATCH:
       return true;
     case PhoneNumberUtil::INVALID_NUMBER:
     case PhoneNumberUtil::NO_MATCH:
       return false;
     default:
       NOTREACHED();
       return false;
   }
 }

 bool AutofillProfileComparator::HaveMergeableAddresses(
     const AutofillProfile& p1,
     const AutofillProfile& p2) const {
   // If the address are not in the same country, then they're not the same. If
   // one of the address countries is unknown/invalid the comparison continues.
   const AutofillType kCountryCode(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE);
   const base::string16& country1 = p1.GetInfo(kCountryCode, app_locale_);
   const base::string16& country2 = p2.GetInfo(kCountryCode, app_locale_);
   if (!country1.empty() && !country2.empty() &&
       !case_insensitive_compare_.StringsEqual(country1, country2)) {
     return false;
   }

   // Zip
   // ----
   // If the addresses are definitely not in the same zip/area code then we're
   // done. Otherwise,the comparison continues.
   const AutofillType kZipCode(ADDRESS_HOME_ZIP);
   const base::string16& zip1 = NormalizeForComparison(
       p1.GetInfo(kZipCode, app_locale_), DISCARD_WHITESPACE);
   const base::string16& zip2 = NormalizeForComparison(
       p2.GetInfo(kZipCode, app_locale_), DISCARD_WHITESPACE);
   if (!zip1.empty() && !zip2.empty() &&
       zip1.find(zip2) == base::string16::npos &&
       zip2.find(zip1) == base::string16::npos) {
     return false;
   }

   // Use the token rewrite rules for the (common) country of the address to
   // transform equivalent substrings to a representative token for comparison.
   AddressRewriter rewriter =
       AddressRewriter::ForCountryCode(country1.empty() ? country2 : country1);

   // State
   // ------
   // Heuristic: States are mergeable if one is a (possibly empty) bag of words
   // subset of the other.
   //
   // TODO(rogerm): If the match is between non-empty zip codes then we can infer
   // that the two state strings are intended to have the same meaning. This
   // handles the cases where we have invalid or poorly formed data in one of the
   // state values (like "Select one", or "CA - California").
   const AutofillType kState(ADDRESS_HOME_STATE);
   const base::string16& state1 =
       rewriter.Rewrite(NormalizeForComparison(p1.GetInfo(kState, app_locale_)));
   const base::string16& state2 =
       rewriter.Rewrite(NormalizeForComparison(p2.GetInfo(kState, app_locale_)));
   if (CompareTokens(state1, state2) == DIFFERENT_TOKENS) {
     return false;
   }

   // City
   // ------
   // Heuristic: Cities are mergeable if one is a (possibly empty) bag of words
   // subset of the other.
   //
   // TODO(rogerm): If the match is between non-empty zip codes then we can infer
   // that the two city strings are intended to have the same meaning. This
   // handles the cases where we have a city vs one of its suburbs.
   const AutofillType kCity(ADDRESS_HOME_CITY);
   const base::string16& city1 =
       rewriter.Rewrite(NormalizeForComparison(p1.GetInfo(kCity, app_locale_)));
   const base::string16& city2 =
       rewriter.Rewrite(NormalizeForComparison(p2.GetInfo(kCity, app_locale_)));
   if (CompareTokens(city1, city2) == DIFFERENT_TOKENS) {
     return false;
   }

   // Dependent Locality
   // -------------------
   // Heuristic: Dependent Localities are mergeable if one is a (possibly empty)
   // bag of words subset of the other.
   const AutofillType kDependentLocality(ADDRESS_HOME_DEPENDENT_LOCALITY);
   const base::string16& locality1 = rewriter.Rewrite(
       NormalizeForComparison(p1.GetInfo(kDependentLocality, app_locale_)));
   const base::string16& locality2 = rewriter.Rewrite(
       NormalizeForComparison(p2.GetInfo(kDependentLocality, app_locale_)));
   if (CompareTokens(locality1, locality2) == DIFFERENT_TOKENS) {
     return false;
   }

   // Sorting Code
   // -------------
   // Heuristic: Sorting codes are mergeable if one is empty or one is a
   // substring of the other, post normalization and whitespace removed. This
   // is similar to postal/zip codes.
   const AutofillType kSortingCode(ADDRESS_HOME_SORTING_CODE);
   const base::string16& sorting1 = NormalizeForComparison(
       p1.GetInfo(kSortingCode, app_locale_), DISCARD_WHITESPACE);
   const base::string16& sorting2 = NormalizeForComparison(
       p2.GetInfo(kSortingCode, app_locale_), DISCARD_WHITESPACE);
   if (!sorting1.empty() && !sorting2.empty() &&
       sorting1.find(sorting2) == base::string16::npos &&
       sorting2.find(sorting1) == base::string16::npos) {
     return false;
   }

   // Address
   // --------
   // Heuristic: Street addresses are mergeable if one is a (possibly empty) bag
   // of words subset of the other.
   const base::string16& address1 = rewriter.Rewrite(NormalizeForComparison(
       p1.GetInfo(ADDRESS_HOME_STREET_ADDRESS, app_locale_)));
   const base::string16& address2 = rewriter.Rewrite(NormalizeForComparison(
       p2.GetInfo(ADDRESS_HOME_STREET_ADDRESS, app_locale_)));
   if (CompareTokens(address1, address2) == DIFFERENT_TOKENS) {
     return false;
   }

   return true;
 }

 }  // namespace autofill