| // Copyright 2020 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "components/url_formatter/spoof_checks/skeleton_generator.h" |
| |
| #include <ostream> |
| #include <queue> |
| #include <string_view> |
| |
| #include "base/i18n/unicodestring.h" |
| #include "base/memory/ptr_util.h" |
| #include "base/strings/string_util.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "third_party/icu/source/i18n/unicode/regex.h" |
| #include "third_party/icu/source/i18n/unicode/translit.h" |
| #include "third_party/icu/source/i18n/unicode/uspoof.h" |
| |
| namespace { |
| |
| using QueueItem = std::vector<std::u16string>; |
| |
| // Maximum length of a hostname whose supplemental hostnames we'll calculate. |
| // For hostnames longer than this length, the supplemental hostnames will be |
| // empty. |
| const size_t kMaxHostnameLengthToComputeSupplementalHostnames = 32; |
| |
| // Maximum number of supplemental hostname to generate for a given input. |
| // If this number is too high, we may end up DOSing the browser process. |
| // If it's too low, we may not be able to cover some lookalike URLs. |
| const size_t kMaxSupplementalHostnames = 128; |
| |
| // Maximum number of characters with multiple skeletons in a hostname (i.e. |
| // interesting characters). The number of interesting characters directly affect |
| // how many supplemental hostnames are generated. Assuming an interesting |
| // character has 3 skeletons (1 original skeleton, 2 supplemental skeletons), |
| // this will generate pow(3, kMaxCharactersWithMultipleSkeletons) supplemental |
| // hostnames, so we cap it. |
| // If a hostname has too many interesting characters, it's unlikely to be a |
| // convincing spoof. |
| const size_t kMaxCharactersWithMultipleSkeletons = 5; |
| |
| // Limit the number of maximum supplemental skeletons for a given character to a |
| // reasonable number. This can be adjusted in the future as needed. |
| const size_t kMaxSupplementalSkeletonsPerCharacter = 3; |
| |
| } // namespace |
| |
| SkeletonGenerator::SkeletonGenerator(const USpoofChecker* checker) |
| : checker_(checker) { |
| UErrorCode status = U_ZERO_ERROR; |
| // Used for diacritics-removal before the skeleton calculation. Add |
| // "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark |
| // removal; NFC". |
| // TODO(jshin): Revisit "ł > l; ø > o" mapping. |
| UParseError parse_error; |
| diacritic_remover_ = base::WrapUnique(icu::Transliterator::createFromRules( |
| UNICODE_STRING_SIMPLE("DropAcc"), |
| icu::UnicodeString::fromUTF8("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;" |
| " ł > l; ø > o; đ > d;"), |
| UTRANS_FORWARD, parse_error, status)); |
| |
| // This set is used to determine whether or not to apply a slow |
| // transliteration to remove diacritics to a given hostname before the |
| // confusable skeleton calculation for comparison with top domain names. If |
| // it has any character outside the set, the expensive step will be skipped |
| // because it cannot match any of top domain names. |
| // The last ([\u0300-\u0339] is a shorthand for "[:Identifier_Status=Allowed:] |
| // & [:Script_Extensions=Inherited:] - [\\u200C\\u200D]". The latter is a |
| // subset of the former but it does not matter because hostnames with |
| // characters outside the latter set would be rejected in an earlier step. |
| lgc_letters_n_ascii_ = icu::UnicodeSet( |
| UNICODE_STRING_SIMPLE("[[:Latin:][:Greek:][:Cyrillic:][0-9\\u002e_" |
| "\\u002d][\\u0300-\\u0339]]"), |
| status); |
| lgc_letters_n_ascii_.freeze(); |
| |
| // Supplement the Unicode confusable list by the following mapping. |
| // IMPORTANT: Adding a digit-lookalike? Add it to digit_lookalikes_ in |
| // idn_spoof_checker.cc, too. |
| // - {U+00E6 (æ), U+04D5 (ӕ)} => "ae" |
| // - {U+03FC (ϼ), U+048F (ҏ)} => p |
| // - {U+0127 (ħ), U+043D (н), U+045B (ћ), U+04A3 (ң), U+04A5 (ҥ), |
| // U+04C8 (ӈ), U+04CA (ӊ), U+050B (ԋ), U+0527 (ԧ), U+0529 (ԩ)} => h |
| // - {U+0138 (ĸ), U+03BA (κ), U+043A (к), U+049B (қ), U+049D (ҝ), |
| // U+049F (ҟ), U+04A1(ҡ), U+04C4 (ӄ), U+051F (ԟ)} => k |
| // - {U+014B (ŋ), U+043F (п), U+0525 (ԥ), U+0E01 (ก), U+05D7 (ח)} => n |
| // TODO(crbug.com/40091387): Handle multiple skeletons for U+0525 and U+0153. |
| // - {U+0167 (ŧ), U+0442 (т), U+04AD (ҭ), U+050F (ԏ), U+4E03 (七), |
| // U+4E05 (丅), U+4E06 (丆), U+4E01 (丁)} => t |
| // - {U+0185 (ƅ), U+044C (ь), U+048D (ҍ), U+0432 (в)} => b |
| // - {U+03C9 (ω), U+0448 (ш), U+0449 (щ), U+0E1E (พ), |
| // U+0E1F (ฟ), U+0E9E (ພ), U+0E9F (ຟ)} => w |
| // - {U+043C (м), U+04CE (ӎ)} => m |
| // - {U+0454 (є), U+04BD (ҽ), U+04BF (ҿ), U+1054 (ၔ)} => e |
| // - U+0491 (ґ) => r |
| // - {U+0493 (ғ), U+04FB (ӻ)} => f |
| // - {U+04AB (ҫ), U+1004 (င)} => c |
| // - {U+04B1 (ұ), U+4E2B (丫)} => y |
| // - {U+03C7 (χ), U+04B3 (ҳ), U+04FD (ӽ), U+04FF (ӿ), U+4E42 (乂)} => x |
| // - {U+0503 (ԃ), U+10EB (ძ)} => d |
| // - {U+050D (ԍ), U+100c (ဌ)} => g |
| // - {U+0D1F (ട), U+0E23 (ร), U+0EA3 (ຣ), U+0EAE (ຮ)} => s |
| // - U+1042 (၂) => j |
| // - {U+0966 (०), U+09E6 (০), U+0A66 (੦), U+0AE6 (૦), U+0B30 (ଠ), |
| // U+0B66 (୦), U+0CE6 (೦), U+1005 (စ)} => o, |
| // - {U+09ED (৭), U+0A67 (੧), U+0AE7 (૧)} => q, |
| // - {U+0E1A (บ), U+0E9A (ບ)} => u, |
| // - {U+03B8 (θ)} => 0, |
| // - {U+0968 (२), U+09E8 (২), U+0A68 (੨), U+0A68 (੨), U+0AE8 (૨), |
| // U+0ce9 (೩), U+0ced (೭), U+0577 (շ)} => 2, |
| // - {U+0437 (з), U+0499 (ҙ), U+04E1 (ӡ), U+0909 (उ), U+0993 (ও), |
| // U+0A24 (ਤ), U+0A69 (੩), U+0AE9 (૩), U+0C69 (౩), |
| // U+1012 (ဒ), U+10D5 (ვ), U+10DE (პ), U+0A5C (ੜ), U+10D9 (კ)} => 3 |
| // - {U+0A6B (੫), U+4E29 (丩), U+3110 (ㄐ)} => 4, |
| // - U+0573 (ճ) => 6 |
| // - {U+09EA (৪), U+0A6A (੪), U+0b6b (୫)} => 8, |
| // - {U+0AED (૭), U+0b68 (୨), U+0C68 (౨)} => 9, |
| // Map a few dashes that ICU doesn't map. These are already blocked by ICU, |
| // but mapping them allows us to detect same skeletons. |
| // - {U+2014 (—), U+4E00 (一), U+2015 (―), U+23EA (⸺), U+2E3B (⸻)} => -, |
| extra_confusable_mapper_ = |
| base::WrapUnique(icu::Transliterator::createFromRules( |
| UNICODE_STRING_SIMPLE("ExtraConf"), |
| icu::UnicodeString::fromUTF8( |
| "[æӕ] > ae; [ϼҏ] > p; [ħнћңҥӈӊԋԧԩ] > h;" |
| "[ĸκкқҝҟҡӄԟ] > k; [ŋпԥกח] > n;" |
| "[ŧтҭԏ七丅丆丁] > t; [ƅьҍвß] > b; [ωшщพฟພຟ] > w;" |
| "[мӎ] > m; [єҽҿၔ] > e; ґ > r; [ғӻ] > f;" |
| "[ҫင] > c; [ұ丫] > y; [χҳӽӿ乂] > x;" |
| "[ԃძ] > d; [ԍဌ] > g; [ടรຣຮ] > s; ၂ > j;" |
| "[०০੦૦ଠ୦೦စ] > o;" |
| "[৭੧૧] > q;" |
| "[บບ] > u;" |
| "[θ] > 0;" |
| "[२২੨੨૨೩೭շ] > 2;" |
| "[зҙӡउওਤ੩૩౩ဒვპੜკ] > 3;" |
| "[੫丩ㄐ] > 4;" |
| "[ճ] > 6;" |
| "[৪੪୫] > 8;" |
| "[૭୨౨] > 9;" |
| "[—一―⸺⸻] > \\-;"), |
| UTRANS_FORWARD, parse_error, status)); |
| DCHECK(U_SUCCESS(status)) |
| << "Skeleton generator initialization failed due to an error: " |
| << u_errorName(status); |
| |
| // Characters that look like multiple characters. |
| character_map_[u'þ'] = {"b", "p"}; |
| character_map_[u'œ'] = {"ce", "oe"}; |
| // https://crbug.com/1250993: |
| character_map_[u'ł'] = {"l", "t"}; |
| |
| // Find the characters with diacritics that have multiple skeletons. |
| for (const auto& it : character_map_) { |
| std::u16string char_str(1, it.first); |
| if (char_str != MaybeRemoveDiacritics(char_str)) { |
| characters_with_multiple_skeletons_with_diacritics_.insert(it.first); |
| } |
| } |
| } |
| |
| SkeletonGenerator::~SkeletonGenerator() = default; |
| |
| void SkeletonGenerator::MaybeRemoveDiacritics(icu::UnicodeString& hostname) { |
| // If input has any characters outside Latin-Greek-Cyrillic and [0-9._-], |
| // there is no point in getting rid of diacritics because combining marks |
| // attached to non-LGC characters are already blocked. |
| if (ShouldRemoveDiacriticsFromLabel(hostname)) |
| diacritic_remover_->transliterate(hostname); |
| } |
| |
| std::u16string SkeletonGenerator::MaybeRemoveDiacritics( |
| std::u16string_view hostname) { |
| size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0); |
| icu::UnicodeString host(false, hostname.data(), hostname_length); |
| MaybeRemoveDiacritics(host); |
| return base::i18n::UnicodeStringToString16(host); |
| } |
| |
| bool SkeletonGenerator::ShouldComputeSupplementalHostnamesWithDiacritics( |
| std::u16string_view input_hostname) const { |
| for (const char16_t c : characters_with_multiple_skeletons_with_diacritics_) { |
| if (input_hostname.find(c) != std::u16string_view::npos) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| Skeletons SkeletonGenerator::GetSkeletons(std::u16string_view input_hostname) { |
| // Generate supplemental hostnames for the input hostname with and without |
| // diacritics. We do this to cover characters whose diacritic versions can |
| // look like completely other characters, such as LATIN SMALL LETTER L WITH |
| // STROKE (ł) looking like t. By doing this, we can generate multiple |
| // skeletons for ł (l and t). |
| // |
| // Ideally, we'd compute a hostname variant for each character with and |
| // without its diacritic. That would result in 2^n hostname variants where n |
| // is the number of characters in the hostname with diacritics, which is too |
| // expensive. Currently, there is only one character with a diacritic that has |
| // multiple skeletons (ł), so this isn't needed. |
| std::u16string hostname_no_diacritics = MaybeRemoveDiacritics(input_hostname); |
| base::flat_set<std::u16string> all_variants = GenerateSupplementalHostnames( |
| hostname_no_diacritics, kMaxSupplementalHostnames, character_map_); |
| if (ShouldComputeSupplementalHostnamesWithDiacritics(input_hostname)) { |
| base::flat_set<std::u16string> diacritic_variants = |
| GenerateSupplementalHostnames(input_hostname, kMaxSupplementalHostnames, |
| character_map_); |
| all_variants.insert(diacritic_variants.begin(), diacritic_variants.end()); |
| } |
| |
| // Extract skeletons of all hostname variants. |
| Skeletons skeletons; |
| for (const std::u16string& hostname : all_variants) { |
| size_t hostname_length = |
| hostname.length() - (hostname.back() == '.' ? 1 : 0); |
| icu::UnicodeString hostname_unicode(false, hostname.data(), |
| hostname_length); |
| extra_confusable_mapper_->transliterate(hostname_unicode); |
| |
| UErrorCode status = U_ZERO_ERROR; |
| icu::UnicodeString ustr_skeleton; |
| |
| // Map U+04CF (ӏ) to lowercase L in addition to what uspoof_getSkeleton does |
| // (mapping it to lowercase I). |
| AddSkeletonMapping(hostname_unicode, 0x4CF /* ӏ */, 0x6C /* lowercase L */, |
| &skeletons); |
| |
| uspoof_getSkeletonUnicodeString(checker_, 0, hostname_unicode, |
| ustr_skeleton, &status); |
| if (U_SUCCESS(status)) { |
| std::string skeleton; |
| ustr_skeleton.toUTF8String(skeleton); |
| skeletons.insert(skeleton); |
| } |
| } |
| return skeletons; |
| } |
| |
| bool SkeletonGenerator::ShouldRemoveDiacriticsFromLabel( |
| const icu::UnicodeString& label) const { |
| return lgc_letters_n_ascii_.containsAll(label); |
| } |
| |
| void SkeletonGenerator::AddSkeletonMapping(const icu::UnicodeString& host, |
| int32_t src_char, |
| int32_t mapped_char, |
| Skeletons* skeletons) { |
| int32_t src_pos = host.indexOf(src_char); |
| if (src_pos == -1) { |
| return; |
| } |
| icu::UnicodeString host_alt(host); |
| size_t length = host_alt.length(); |
| char16_t* buffer = host_alt.getBuffer(-1); |
| for (char16_t* uc = buffer + src_pos; uc < buffer + length; ++uc) { |
| if (*uc == src_char) |
| *uc = mapped_char; |
| } |
| host_alt.releaseBuffer(length); |
| UErrorCode status = U_ZERO_ERROR; |
| icu::UnicodeString ustr_skeleton; |
| uspoof_getSkeletonUnicodeString(checker_, 0, host_alt, ustr_skeleton, |
| &status); |
| if (U_SUCCESS(status)) { |
| std::string skeleton; |
| ustr_skeleton.toUTF8String(skeleton); |
| skeletons->insert(skeleton); |
| } |
| } |
| |
| // static |
| base::flat_set<std::u16string> SkeletonGenerator::GenerateSupplementalHostnames( |
| std::u16string_view input, |
| size_t max_alternatives, |
| const SkeletonMap& mapping) { |
| base::flat_set<std::u16string> output; |
| if (!input.size() || |
| input.size() > kMaxHostnameLengthToComputeSupplementalHostnames || |
| max_alternatives == 0) { |
| return output; |
| } |
| icu::UnicodeString input_unicode = |
| icu::UnicodeString::fromUTF8(base::UTF16ToUTF8(input)); |
| // Read only buffer, doesn't need to be released. |
| const char16_t* input_buffer = input_unicode.getBuffer(); |
| const size_t input_length = static_cast<size_t>(input_unicode.length()); |
| |
| // Count the characters that have multiple skeletons. If this number is high, |
| // bail out early to avoid running the skeleton generation for too long. |
| size_t characters_with_multiple_skeletons = 0; |
| for (size_t i = 0; i < input_length; i++) { |
| char16_t c = input_buffer[i]; |
| const auto it = mapping.find(c); |
| if (it != mapping.end()) { |
| characters_with_multiple_skeletons++; |
| } |
| } |
| if (characters_with_multiple_skeletons > |
| kMaxCharactersWithMultipleSkeletons) { |
| return output; |
| } |
| |
| // This queue contains vectors of skeleton strings. For each character in |
| // the input string, its skeleton string will be appended to the queue item. |
| // Thus, the number of skeleton strings in the queue item will always |
| // correspond to the index of the input string processed so far. |
| std::queue<QueueItem> q; |
| q.push(QueueItem()); |
| |
| while (!q.empty()) { |
| QueueItem current = q.front(); |
| q.pop(); |
| |
| if (current.size() == input_length) { |
| // Reached the end of the original string. We now generated a complete |
| // alternative string. Add the result to output. |
| output.insert(base::JoinString(current, u"")); |
| if (output.size() == max_alternatives) { |
| break; |
| } |
| continue; |
| } |
| |
| // First, add the original character from input. |
| char16_t c = input_buffer[current.size()]; |
| QueueItem new_item1 = current; |
| new_item1.push_back(std::u16string(1, c)); |
| q.push(new_item1); |
| |
| // Then, find all alternative characters for the current input character and |
| // generate new alternative strings by appending each alternative character |
| // to the string generated so far. |
| const auto it = mapping.find(c); |
| if (it != mapping.end()) { |
| DCHECK_LE(it->second.size(), kMaxSupplementalSkeletonsPerCharacter); |
| for (auto alternative : it->second) { |
| QueueItem new_item2 = current; |
| new_item2.push_back(base::UTF8ToUTF16(alternative)); |
| q.push(new_item2); |
| } |
| } |
| } |
| return output; |
| } |