Disallow middle dot (U+00B7) when unsafe in IDN display
This character ("·") can be used to spoof domain names. Only allow it
when it is used to express the Catalan character ela geminada on Catalan
(.cat) domains, i.e. when it appears between two 'l' characters.
According to usage logs, this change affects a single domain name
with a small number of users.
Bug: 1025442
Change-Id: I3b63e638a4590c881df0a6b65c4be90364f7350c
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1922280
Reviewed-by: Mustafa Emre Acer <meacer@chromium.org>
Reviewed-by: Christopher Thompson <cthomp@chromium.org>
Commit-Queue: Mustafa Emre Acer <meacer@chromium.org>
Cr-Commit-Position: refs/heads/master@{#717124}
diff --git a/components/url_formatter/spoof_checks/idn_spoof_checker.cc b/components/url_formatter/spoof_checks/idn_spoof_checker.cc
index 5f979a1..81599b5 100644
--- a/components/url_formatter/spoof_checks/idn_spoof_checker.cc
+++ b/components/url_formatter/spoof_checks/idn_spoof_checker.cc
@@ -79,6 +79,35 @@
return *dangerous_pattern_tls;
}
+// Returns true if |label_string| contains a middle dot (U+00B7) that must not
+// be displayed. The middle dot is tolerated only under the "cat" top level
+// domain, and only when it sits directly between two 'l' characters, which is
+// how the Catalan ela geminada is written.
+// See https://tools.ietf.org/html/rfc5892#appendix-A.3 for details.
+bool HasUnsafeMiddleDot(const icu::UnicodeString& label_string,
+                        base::StringPiece top_level_domain) {
+  int search_from = 0;
+  // Visit every middle dot in the label, left to right. The first offending
+  // occurrence is enough to reject the whole label.
+  for (int dot_pos = label_string.indexOf("·", search_from); dot_pos >= 0;
+       dot_pos = label_string.indexOf("·", search_from)) {
+    DCHECK_LT(dot_pos, label_string.length());
+
+    // Any middle dot outside of a Catalan domain is unsafe.
+    const bool is_catalan_domain = (top_level_domain == "cat");
+    if (!is_catalan_domain) {
+      return true;
+    }
+
+    // A leading or trailing middle dot has no neighbour on one side, so it
+    // cannot be part of an ela geminada.
+    const bool at_label_edge =
+        (dot_pos == 0) || (dot_pos == label_string.length() - 1);
+    if (at_label_edge) {
+      return true;
+    }
+
+    // Both neighbours must be 'l'; anything else is rejected.
+    const bool between_two_ls =
+        label_string[dot_pos - 1] == 'l' && label_string[dot_pos + 1] == 'l';
+    if (!between_two_ls) {
+      return true;
+    }
+
+    // This occurrence is fine; continue the scan just past it.
+    search_from = dot_pos + 1;
+  }
+  return false;
+}
+
#include "components/url_formatter/spoof_checks/top_domains/domains-trie-inc.cc"
// All the domains in the above file have 4 or fewer labels.
@@ -303,6 +332,11 @@
label_string.indexOf("ə") != -1)
return false;
+ // Disallow middle dot (U+00B7) when unsafe.
+ if (HasUnsafeMiddleDot(label_string, top_level_domain)) {
+ return false;
+ }
+
// If there's no script mixing, the input is regarded as safe without any
// extra check unless it falls into one of three categories:
// - contains Kana letter exceptions
diff --git a/components/url_formatter/spoof_checks/idn_spoof_checker_unittest.cc b/components/url_formatter/spoof_checks/idn_spoof_checker_unittest.cc
index 0004c00..d7dc36cb 100644
--- a/components/url_formatter/spoof_checks/idn_spoof_checker_unittest.cc
+++ b/components/url_formatter/spoof_checks/idn_spoof_checker_unittest.cc
@@ -366,9 +366,10 @@
// 3) ѕсоре-рау.com with ѕсоре and рау in Cyrillic.
{"xn----8sbn9akccw8m.com",
L"\x0455\x0441\x043e\x0440\x0435-\x0440\x0430\x0443.com", false},
- // 4) ѕсоре·рау.com with scope and pay in Cyrillic and U+00B7 between them.
- {"xn--uba29ona9akccw8m.com",
- L"\x0455\x0441\x043e\x0440\x0435\u00b7\x0440\x0430\x0443.com", false},
+ // 4) ѕсоре1рау.com with scope and pay in Cyrillic and a non-letter between
+ // them.
+ {"xn--1-8sbn9akccw8m.com",
+ L"\x0455\x0441\x043e\x0440\x0435\x0031\x0440\x0430\x0443.com", false},
// The same as above three, but in IDN TLD (рф).
// 1) ѕсоре.рф with ѕсоре in Cyrillic.
@@ -382,9 +383,10 @@
// 3) ѕсоре-рау.рф with ѕсоре and рау in Cyrillic.
{"xn----8sbn9akccw8m.xn--p1ai",
L"\x0455\x0441\x043e\x0440\x0435-\x0440\x0430\x0443.\x0440\x0444", true},
- // 4) ѕсоре·рау.com with scope and pay in Cyrillic and U+00B7 between them.
- {"xn--uba29ona9akccw8m.xn--p1ai",
- L"\x0455\x0441\x043e\x0440\x0435\u00b7\x0440\x0430\x0443.\x0440\x0444",
+ // 4) ѕсоре1рау.рф with scope and pay in Cyrillic and a non-letter between
+ // them.
+ {"xn--1-8sbn9akccw8m.xn--p1ai",
+ L"\x0455\x0441\x043e\x0440\x0435\x0031\x0440\x0430\x0443.\x0440\x0444",
true},
// Same as above three, but in .ru TLD.
@@ -398,9 +400,10 @@
// 3) ѕсоре-рау.ru with ѕсоре and рау in Cyrillic.
{"xn----8sbn9akccw8m.ru",
L"\x0455\x0441\x043e\x0440\x0435-\x0440\x0430\x0443.ru", true},
- // 4) ѕсоре·рау.ru with scope and pay in Cyrillic and U+00B7 between them.
- {"xn--uba29ona9akccw8m.ru",
- L"\x0455\x0441\x043e\x0440\x0435\u00b7\x0440\x0430\x0443.ru", true},
+ // 4) ѕсоре1рау.ru with scope and pay in Cyrillic and a non-letter between
+ // them.
+ {"xn--1-8sbn9akccw8m.ru",
+ L"\x0455\x0441\x043e\x0440\x0435\x0031\x0440\x0430\x0443.ru", true},
// ѕсоре-рау.한국 with ѕсоре and рау in Cyrillic. The label will remain
// punycode while the TLD will be decoded.
@@ -1131,6 +1134,16 @@
// U+0259 (ə) is only allowed under the .az TLD.
{"xn--xample-vyc.com", L"əxample.com", false},
{"xn--xample-vyc.az", L"əxample.az", true},
+
+ // U+00B7 is only allowed on Catalan domains between two l's.
+ {"xn--googlecom-5pa.com", L"google·com.com", false},
+ {"xn--ll-0ea.com", L"l·l.com", false},
+ {"xn--ll-0ea.cat", L"l·l.cat", true},
+ {"xn--al-0ea.cat", L"a·l.cat", false},
+ {"xn--la-0ea.cat", L"l·a.cat", false},
+ {"xn--l-fda.cat", L"·l.cat", false},
+ {"xn--l-gda.cat", L"l·.cat", false},
+
}; // namespace
namespace test {