// blob: acb4de77cc53c731e41d775c12d85cf93e0f428f [file] [log] [blame]
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/url_formatter/url_formatter.h"
#include <stddef.h>
#include <string.h>
#include <vector>
#include "base/macros.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "url/gurl.h"
namespace url_formatter {
namespace {
using base::WideToUTF16;
using base::ASCIIToUTF16;
// Shorthand for base::string16::npos. It appears in the expected-offset tables
// of the FormatUrlWithOffsets test for input positions that are expected to
// have no corresponding position in the formatted output.
const size_t kNpos = base::string16::npos;
// One row of IDN test data: a host name in its ASCII (Punycode) form, the
// Unicode spelling of the same host, and whether the Unicode spelling is the
// expected display form.
struct IDNTestCase {
  // The IDNA/Punycode version of the domain (plain ASCII).
  const char* const input;
  // The equivalent Unicode version of the domain. Even if we expect the domain
  // to be displayed in Punycode, this should still contain the Unicode
  // equivalent (see |unicode_allowed|). Rows for invalid Punycode use nullptr
  // here together with |unicode_allowed| == false, so it is only read when
  // |unicode_allowed| is true.
  const wchar_t* unicode_output;
  // Whether we expect the domain to be displayed decoded as a Unicode string
  // (true) or in its Punycode form (false).
  const bool unicode_allowed;
};
// Table of IDN test data consumed by the IDNToUnicode test below: each row is
// {input, unicode_output, unicode_allowed}. The test converts |input| and
// expects |unicode_output| when |unicode_allowed| is true, and |input| itself
// (unchanged Punycode) otherwise.
//
// These cases can be generated with a script under tools/security/.
// NOTE(review): the script's file name is missing from this comment
// (presumably idn_test_case_generator.py) -- confirm.
// See documentation there: you can either run it from the command line or call
// the make_case function directly from the Python shell (which may be easier
// for entering Unicode text).
//
// Q: Why not just do this conversion right here in the test, rather than having
// a Python script to generate it?
// A: Because then we would have to rely on complex logic (IDNA encoding) in the
// test itself; the same code we are trying to test. By using Python's IDN
// encoder to generate the test data, we independently verify that our
// algorithm is correct.
//
// NOTE(review): in this copy many rows have an empty |input|, a wide literal
// that ends in a dangling backslash, or no opening brace at all, and the array
// terminator is absent. The rows look truncated -- restore them from the
// upstream file before relying on any individual entry.
// TODO(jshin): Replace L"..." with "..." in UTF-8 when it's easier to read.
const IDNTestCase idn_cases[] = {
// No IDN
{"", L"", true},
{"", L"", true},
{".", L".", true},
{"", L"", true},
// IDN
// Hanzi (Traditional Chinese)
{"", L"\x5317\x4eac\x5927\", true},
// Hanzi ('video' in Simplified Chinese
{"", L"\x89c6\", true},
// Hanzi + '123'
{"", L"www.\x4e00" L"", true},
// Hanzi + Latin : U+56FD is simplified
{"", L"www.hello\x4e2d\", true},
// Kanji + Kana (Japanese)
{"", L"\x671d\x65e5\x3042\x3055\", true},
// Katakana including U+30FC
{"", L"\x30b3\x30de\x30fc\", true},
{"", L"\u30ce\u30f3\", true},
// Katakana + Latin (Japanese)
{"", L"e\x30b3\x30de\x30fc\", true},
{"", L"\x30c8\", true},
// Hangul (Korean)
{"", L"www.\xc804\xc790\xc815\", true},
// b<u-umlaut>cher (German)
{"", L"b\x00fc" L"", true},
// a with diaeresis
{"", L"www.f\", true},
// c-cedilla (French)
L"www.alliancefran\x00e7" L"", true},
// caf'e with acute accent' (French)
{"", L"caf\", true},
// c-cedillla and a with tilde (Portuguese)
{"", L"p\x00e3oema\x00e7\", true},
// s with caron
{"", L"\x0161" L"", true},
{"", L"\x03bf\x03c5\x03c4\x03bf\x03c0\x03af\",
// Eutopia + 123 (Greek)
L"\x03bf\x03c5\x03c4\x03bf\x03c0\x03af\", true},
// Cyrillic (Russian)
{"", L"\x0442\x043e\x0440\x0442\", true},
// Cyrillic + 123 (Russian)
{"", L"\x0442\x043e\x0440\x0442\", true},
// 'president' in Russian. Is a wholescript confusable, but allowed.
// Arabic
{"", L"\x0627\x0641\x0644\x0627\", true},
// Hebrew
{"xn--4dbib.he", L"\x05d5\x05d0\x05d4.he", true},
// Hebrew + Common
{"", L"\x05e2\x05d1\x05e8\x05d9\", true},
// Thai
L"\x0e2a\x0e32\x0e22\x0e01\x0e32\x0e23\x0e1a\x0e34\", true},
// Thai + Common
L"\x0e20\x0e32\x0e29\x0e32\x0e44\x0e17\", true},
// Devangari (Hindi)
{"", L"www.\x0905\x0915\x094b\x0932\", true},
// Devanagari + Common
L"\x0939\x093f\x0928\x094d\x0926\", true},
// 5 Aspirational scripts
// Unifieid Canadian Syllabary
{"", L"\x1456\x14c2\", true},
// Tifinagh
L"\x2d5c\x2d49\x2d3c\x2d49\x2d4f\x2d30\", true},
// Tifinagh with a disallowed character(U+2D6F)
{"", L"\x2d5c\x2d49\x2d3c\x2d6f\x2d49\", false},
// Yi
{"", L"\xa188\xa320\xa071\", true},
// Mongolian - 'ordu' (place, camp)
{"", L"\x1823\x1837\x1833\", true},
// Mongolian with a disallowed character
{"", L"\x1823\x1837\x1804\x1833\", false},
// Miao/Pollad
{"", L"\U00016f04\U00016f62\", true},
// Script mixing tests
// The following script combinations are allowed.
// MODERATELY_RESTRICTIVE with Latin limited to ASCII-Latin.
// ASCII-Latin + Japn (Kana + Han)
// ASCII-Latin + Kore (Hangul + Han)
// ASCII-Latin + Han + Bopomofo
// ASCII-Latin + any allowed script other than Cyrillic, Greek, Cherokee
// and Unified Canadian Syllabary
// "payp<alpha>"
{"", L"payp\", false},
// with Greek omicron and epsilon
{"", L"g\x03bf\x03bfgl\", false},
// with Cyrillic o
{"", L"g\x043e\x043egl\", false},
// h<e with acute>llo<China in Han>.cn
{"", L"h\x00e9llo\x4e2d\", false},
// <Greek rho><Cyrillic a><Cyrillic u>.ru
{"", L"\x03c1\x0430\", false},
// Hangul + Latin
{"", L"\xd55c\", true},
// Hangul + Latin + Han with IDN ccTLD
L"\xd55c\xae00han\x97d3.\xd55c\xad6d", true},
// non-ASCII Latin + Hangul
{"", L"caf\x00e9\xce74\", false},
// Hangul + Hiragana
{"", L"\xd55c\x3072\", false},
// <Hiragana>.<Hangul> is allowed because script mixing check is per label.
{"xn--y9j3b.xn--3e0b707e", L"\x3072\x3089.\xd55c\xad6d", true},
// Traditional Han + Latin
{"", L"\x6f22\", true},
// Simplified Han + Latin
{"", L"\x6c49\", true},
// Simplified Han + Traditonal Han
{"", L"\x6c49\", true},
// Han + Hiragana + Katakana + Latin
L"\x632f\x308a\x4eee\x540d\x30ab\", true},
// Han + Bopomofo
{"", L"\x6ce8\x97f3\x3105\x3106\x3107\", true},
// Han + Latin + Bopomofo
L"\x6ce8\x97f3" L"bopo\x3105\x3106\x3107\", true},
// Latin + Bopomofo
{"", L"bopomofo\x3105\x3106\x3107\", true},
// Bopomofo + Katakana
L"\x3105\x3106\x3107\x3108\x30ab\x30bf\x30ab\", false},
// Bopomofo + Hangul
{"", L"\x3105\x3106\x3107\x3108\xc8fc\", false},
// Devanagari + Latin
{"", L"ab\x0939\x093f\x0928\x094d\x0926\", true},
// Thai + Latin
L"ab\x0e20\x0e32\x0e29\x0e32\x0e44\x0e17\", true},
// <vitamin in Katakana>
{"", L"\x30d3\x30bf\x30df\x30f3" L"", true},
// Devanagari + Han
{"", L"\x0930\x094b\x0932\x0947\x76e7\", false},
// Devanagari + Bengali
{"", L"\x0915\", false},
// Canadian Syllabary + Latin
{"", L"ab\", false},
{"", L"ab1\", false},
{"", L"\x14BF" L"", false},
{"", L"\x14BF" L"", false},
// Tifinagh + Latin
{"", L"li\", false},
{"", L"rol\", false},
{"", L"\", false},
{"", L"\", false},
// Invisibility check
// Thai tone mark malek(U+0E48) repeated
{"", L"\x0e23\x0e35\x0e48\", false},
// Accute accent repeated
{"", L"a\x0301\", false},
// 'a' with acuted accent + another acute accent
{"", L"\x00e1\", false},
// Combining mark at the beginning
{"", L"\x0300" L"", false},
// Mixed script confusable
// google with Armenian Small Letter Oh(U+0585)
{"", L"g\", false},
{"", L"\", false},
{"", L"cucko\", false},
// Latin 'o' in Armenian.
L"o\x0585\x0580\x0574\x0578\x0582\x0566\", false},
// Hiragana HE(U+3078) mixed with Katakana
L"\x30e2\x30d2\x30fc\x30c8\x3078\x30d6\", false},
// U+30FC should be preceded by a Hiragana/Katakana.
// Katakana + U+30FC + Han
{"", L"\x30ab\x30fc\x91ce\", true},
// Hiragana + U+30FC + Han
{"", L"\x304b\x30fc\x91ce\", true},
// U+30FC + Han
{"", L"\x30fc\x52d5\x753b\", false},
// Han + U+30FC + Han
{"", L"\x65e5\x672c\x30fc\x91ce\", false},
// U+30FC at the beginning
{"", L"\x30fc\x65e5\x672c", false},
// Latin + U+30FC + Latin
{"", L"abc\x30fc" L"", false},
// U+30FB (・) is not allowed next to Latin, but allowed otherwise.
// U+30FB + Han
{"", L"\x30fb\", true},
// Han + U+30FB + Han
{"", L"\x65e5\x672c\x30fb\x91ce\", true},
// Latin + U+30FB + Latin
{"", L"abc\x30fb" L"", false},
// U+30FB + Latin
{"", L"\x30fb" L"", false},
// U+30FD (ヽ) is allowed only after Katakana.
// Katakana + U+30FD
{"", L"\x30ab\", true},
// Hiragana + U+30FD
{"", L"\x304b\", false},
// Han + U+30FD
{"", L"\x4e00\", false},
{"", L"a\", false},
{"", L"a1\", false},
// U+30FE (ヾ) is allowed only after Katakana.
// Katakana + U+30FE
{"", L"\x30ab\", true},
// Hiragana + U+30FE
{"", L"\x304b\", false},
// Han + U+30FE
{"", L"\x4e00\", false},
{"", L"a\", false},
{"", L"a1\", false},
// Cyrillic labels made of Latin-look-alike Cyrillic letters.
// ѕсоре.com with ѕсоре in Cyrillic
{"", L"\x0455\x0441\x043e\x0440\", false},
// ѕсоре with ѕсоре in Cyrillic.
L"\x0455\x0441\x043e\x0440\x0435" L"", false},
// ѕсоре-рау.com with ѕсоре and рау in Cyrillic.
L"\x0455\x0441\x043e\x0440\x0435-\x0440\x0430\", false},
// ѕсоре·рау.com with scope and pay in Cyrillic and U+00B7 between them.
L"\x0455\x0441\x043e\x0440\x0435\u00b7\x0440\x0430\", false},
// The same as above three, but in IDN TLD.
L"\x0455\x0441\x043e\x0440\x0435.\x0440\x0444", true},
L"\x0455\x0441\x043e\x0440\x0435" L"123.\x0440\x0444", true},
// ѕсоре-рау.한국 with ѕсоре and рау in Cyrillic.
L"\x0455\x0441\x043e\x0440\x0435-\x0440\x0430\x0443.\xd55c\xad6d", true},
// музей (museum in Russian) has characters without a Latin-look-alike.
{"", L"\x043c\x0443\x0437\x0435\", true},
// Combining Diacritic marks after a script other than Latin-Greek-Cyrillic
{"", L"\xd55c\x0301\", false}, // 한́글.com
{"", L"\x6f22\x0307\", false}, // 漢̇字.com
// नागरी́.com
{"", L"\x0928\x093e\x0917\x0930\x0940\",
// Similarity checks against the list of top domains. "" and
// '" are listed for unittest in the top domain list.
{"", L"\", false}, // đ
{"", L"www.\", false},
{"", L"\", false},
{"", L"\", false},
{"", L"mail.\", false},
{"", L"di\", false}, // di̇
{"", L"dig\", false}, // dig̱
{"", L"dig\", false}, // digĸ
{"", L"digk\", false}, // digkł
{"", L"digkl\", false}, // digklṃ
{"", L"digklm\x00f8" L"", false}, // digklmø
{"", L"digklmo\x0431" L"", false}, // digklmoб
{"", L"digklmo6\", false}, // digklmo6৪.com
// '' is listed for unitest in the top domain list.
// 'іѕӏкрх123' can look like 'islkpx123' in some fonts.
L"\x0456\x0455\x04cf\x043a\x0440\x0445" L"", false},
// Mixed digits: the first two will also fail mixed script test
// Latin + ASCII digit + Deva digit
{"", L"asc1deva\", false},
// Latin + Deva digit + Beng digit
{"", L"deva\x0967" L"beng\", false},
// ASCII digit + Deva digit
{"", L"7\x09ea" L"", false},
// Deva digit + Beng digit
{"", L"\x0967\", false},
// U+4E00 (CJK Ideograph One) is not a digit
{"", L"d12\", true},
// One that's really long that will force a buffer realloc
// Not allowed; characters outside [:Identifier_Status=Allowed:]
// Limited Use Scripts: UTS 31 Table 7.
// Vai
{"", L"\", false},
// 'CARD' look-alike in Cherokee
{"", L"\x13df\x13aa\x13a1\", false},
// Scripts excluded from Identifiers: UTS 31 Table 4
// Coptic
{"", L"\", false},
// Old Italic
{"", L"\U00010300\", false},
// U+115F (Hangul Filler)
{"", L"\xac00\xb098\", false},
{"", L"www.\x2039google\", false},
// Latin small capital w: hardᴡ
{"", L"hard\x1d21" L"", false},
// Minus Sign(U+2212)
{"", L"\x65e5\x2212\", false},
// Latin Small Letter Script G: ɡɡ.com
{"", L"\x0261\", false},
// Hangul Jamo(U+11xx)
{"", L"\x1102\x1103\", false},
// degree sign: 36°
{"", L"36\x00b0" L"", false},
// Pound sign
{"", L"5free\", false},
// Hebrew points (U+05B0, U+05B6)
{"", L"\x05e1\x05b6\x05e7\x05b0\", false},
// Danda(U+0964)
{"", L"\x0924\x093f\x091c\x0964\x0930\", false},
// Small letter script G(U+0261)
{"", L"\", false},
// Small Katakana Extension(U+31F1)
{"", L"\", false},
// Heart symbol: ♥
{"", L"ab\", false},
// Emoji
{"", L"\U0001f355\", false},
// Registered trade mark
{"", L"\x00ae" L"", false},
// Latin Letter Retroflex Click
{"", L"registered\", false},
// ASCII '!' not allowed in IDN
{"xn--!", L"\xc548\xb155!.kr", false},
// 'GOOGLE' in IPA extension: ɢᴏᴏɢʟᴇ
L"\x0262\x1d0f\x1d0f\x0262\x029f\", false},
// Padlock icon spoof.
{"xn--google-hj64e", L"\", false},
// Custom black list
// Combining Long Solidus Overlay
{"google.xn--comabc-k8d", L"\x0338" L"abc", false},
// Hyphenation Point instead of Katakana Middle dot
{"", L"\x30a1\x2027\", false},
// Gershayim with other Hebrew characters is allowed.
{"", L"\x05e9\x05d1\x05f4\", true},
// Hebrew Gershayim with Latin is disallowed.
{"", L"a\x05f4" L"", false},
// Hebrew Gershayim with Arabic is disallowed.
{"", L"\x0628\", false},
#if defined(OS_MACOSX)
// These characters are blocked due to a font issue on Mac.
// Tibetan transliteration characters.
{"", L"\", false},
// Arabic letter KASHMIRI YEH
{"", L"\", false},
// Hyphens (
// Hyphen-Minus (the only hyphen allowed)
// abc-def
{"", L"", true},
// Modifier Letter Minus Sign
{"", L"abc\x02d7" L"", false},
// Hyphen
{"", L"abc\x2010" L"", false},
// Non-Breaking Hyphen
// This is actually an invalid IDNA domain (U+2011 normalizes to U+2010), but
// it is included to ensure that we do not inadvertently allow this character
// to be displayed as Unicode.
{"", L"abc\x2011" L"", false},
// Figure Dash
{"", L"abc\x2012" L"", false},
// En Dash
{"", L"abc\x2013" L"", false},
// Hyphen Bullet
{"", L"abc\x2043" L"", false},
// Minus Sign
{"", L"abc\x2212" L"", false},
// Heavy Minus Sign
{"", L"abc\x2796" L"", false},
// Coptic Capital Letter Dialect-P Ni
{"", L"abc\x2cba" L"", false},
// Small Em Dash
{"", L"abc\xfe58" L"", false},
// Block NV8 (Not valid in IDN 2008) characters.
// U+058A (֊)
{"", L"a\x058a" L"", false},
{"", L"\x0561\x058a\", false},
// U+2019 (’)
{"", L"a\x2019" L"", false},
// U+2027 (‧)
{"", L"a\x2027" L"", false},
// U+30A0 (゠)
{"", L"a\x30a0" L"", false},
{"", L"\xac00\x30a0\", false},
{"", L"\x4e00\x30a0\", false},
{"", L"\x304a\x30a0\", false},
{"", L"\x3082\x30a0\", false},
// Block single/double-quote-like characters.
// U+02BB (ʻ)
{"", L"a\x02bb" L"", false},
// U+02BC (ʼ)
{"", L"a\x02bc" L"", false},
// U+144A: Not allowed to mix with scripts other than Canadian Syllabics.
{"", L"a\x144a" L"", false},
{"", L"\x1401\x144a\", true},
// Custom dangerous patterns
// Two Katakana-Hiragana combining mark in a row
{"", L"\x309a\", false},
// Katakana Letter No not enclosed by {Han,Hiragana,Katakana}.
{"", L"\x30ce" L"", false},
// TODO(jshin): Review the danger of allowing the following two.
// Hiragana 'No' by itself is allowed.
{"", L"\", true},
// Hebrew Gershayim used by itself is allowed.
{"", L"\", true},
// 4 Deviation characters between IDNA 2003 and IDNA 2008
// When entered in Unicode, the first two are mapped to 'ss' and Greek sigma
// and the latter two are mapped away. However, the punycode form should
// remain in punycode.
// U+00DF(sharp-s)
{"", L"fu\", false},
// U+03C2(final-sigma)
{"", L"\x03b1\x03b2\", false},
// U+200C(ZWNJ)
{"", L"\x0924\x094d\x200c\x0930\", false},
// U+200C(ZWJ)
{"", L"\x0915\x094d\", false},
// Math Monospace Small A. When entered in Unicode, it's canonicalized to
// 'a'. The punycode form should remain in punycode.
{"", L"\U0001d68a" L"", false},
// Math Sans Bold Capital Alpha
{"", L"\U0001d756" L"", false},
// U+3000 is canonicalized to a space(U+0020), but the punycode form
// should remain in punycode.
{"", L"\x4e2d\x56fd\x3000", false},
// U+3002 is canonicalized to ASCII fullstop(U+002E), but the punycode form
// should remain in punycode.
{"", L"\x4e2d\x56fd\x3002", false},
// Invalid punycode
// Has a codepoint beyond U+10FFFF.
{"xn--krank-kg706554a", nullptr, false},
// '?' in punycode.
{"xn--hello?", nullptr, false},
// Not allowed in UTS46/IDNA 2008
// Georgian Capital Letter(U+10BD)
{"", L"\", false},
// 3rd and 4th characters are '-'.
{"xn-----8kci4dhsd", L"\x0440\x0443--\x0430\x0432\x0442\x043e", false},
// Leading combining mark
{"", L"\", false},
// BiDi check per IDNA 2008/UTS 46
// Cannot starts with AN(Arabic-Indic Number)
{"", L"\x0662\x0660\", false},
// Cannot start with a RTL character and ends with a LTR
{"", L"\x062c\x0627\", false},
// Can start with a RTL character and ends with EN(European Number)
{"", L"\x062c\x0627\x0631" L"", true},
// Can start with a RTL and end with AN
{"", L"\x062c\x0627\x0631\", true},
// A pair of offsets: a position in an input string and the position it is
// expected to map to in the output.
// NOTE(review): nothing in the visible part of this file references this
// struct -- confirm it is still needed.
struct AdjustOffsetCase {
  size_t input_offset;
  size_t output_offset;
};
// One row of the FormatUrl test table.
struct UrlTestData {
  // Human-readable label, streamed into the failure message of each check.
  const char* const description;
  // URL spec handed to the GURL constructor.
  const char* const input;
  // Formatting flags passed to FormatUrl().
  FormatUrlTypes format_types;
  // Unescaping rules passed to FormatUrl().
  net::UnescapeRule::Type escape_rules;
  // Expected formatted URL.
  const wchar_t* output;  // Use |wchar_t| to handle Unicode constants easily.
  // Expected value of the prefix length reported by FormatUrl().
  size_t prefix_len;
};
// A pair of helpers for the FormatUrlWithOffsets() test.

// Expects |actual| to equal |expected|. On mismatch the failure message names
// the original URL, the input character position being probed and the
// formatted URL, so a failing offset can be located without a debugger.
void VerboseExpect(size_t expected,
                   size_t actual,
                   const std::string& original_url,
                   size_t position,
                   const base::string16& formatted_url) {
  EXPECT_EQ(expected, actual) << "Original URL: " << original_url
      << " (at char " << position << ")\nFormatted URL: " << formatted_url;
}
// Formats |url_string| with FormatUrlWithOffsets() and verifies how every
// input offset is adjusted.
//
// |output_offsets| must hold one expected output offset for each character of
// |url_string| (it is not read when |url_string| is empty). In addition to
// those per-character probes this checks that:
//   * the offset just past the last input character maps to the end of the
//     formatted string, and
//   * offsets beyond the input (length + 1, a very large value, and npos) all
//     map to npos.
void CheckAdjustedOffsets(const std::string& url_string,
                          FormatUrlTypes format_types,
                          net::UnescapeRule::Type unescape_rules,
                          const size_t* output_offsets) {
  GURL url(url_string);
  const size_t url_length = url_string.length();
  const size_t kTooLargeOffset = 500000;  // Larger than any input length.

  // Probe every position 0..url_length + 1, followed by two out-of-range
  // sentinels. Each slot holds the input offset on entry and is overwritten
  // with the adjusted offset by FormatUrlWithOffsets().
  std::vector<size_t> offsets;
  offsets.reserve(url_length + 4);
  for (size_t i = 0; i <= url_length + 1; ++i)
    offsets.push_back(i);
  offsets.push_back(kTooLargeOffset);
  offsets.push_back(std::string::npos);

  base::string16 formatted_url = FormatUrlWithOffsets(url, format_types,
      unescape_rules, nullptr, nullptr, &offsets);

  for (size_t i = 0; i < url_length; ++i)
    VerboseExpect(output_offsets[i], offsets[i], url_string, i, formatted_url);
  VerboseExpect(formatted_url.length(), offsets[url_length], url_string,
                url_length, formatted_url);
  VerboseExpect(base::string16::npos, offsets[url_length + 1], url_string,
                url_length + 1, formatted_url);
  VerboseExpect(base::string16::npos, offsets[url_length + 2], url_string,
                kTooLargeOffset, formatted_url);
  VerboseExpect(base::string16::npos, offsets[url_length + 3], url_string,
                std::string::npos, formatted_url);
}
// Runs IDNToUnicode() over every row of |idn_cases|. A row whose
// |unicode_allowed| is true must decode to |unicode_output|; any other row
// must come back unchanged, i.e. still in its ASCII/Punycode form.
TEST(UrlFormatterTest, IDNToUnicode) {
  for (size_t i = 0; i < arraysize(idn_cases); i++) {
    const IDNTestCase& test_case = idn_cases[i];
    base::string16 output(IDNToUnicode(test_case.input));
    // |unicode_output| may be null for rows that are not allowed to decode, so
    // it is only touched when |unicode_allowed| is set.
    base::string16 expected(test_case.unicode_allowed
                                ? WideToUTF16(test_case.unicode_output)
                                : ASCIIToUTF16(test_case.input));
    EXPECT_EQ(expected, output) << "input # " << i << ": \""
                                << test_case.input << "\"";
  }
}
// Table-driven test of FormatUrl(): each UrlTestData row is turned into a GURL
// and formatted with the row's format types and unescape rules; the formatted
// string and the reported prefix length are then compared with the row's
// expected |output| and |prefix_len|.
// NOTE(review): in this copy most URL literals in the table are empty, several
// rows lack their input or expected-output line, and the closing braces of the
// table, the loop and the test body are absent -- restore from upstream.
TEST(UrlFormatterTest, FormatUrl) {
FormatUrlTypes default_format_type = kFormatUrlOmitUsernamePassword;
const UrlTestData tests[] = {
{"Empty URL", "", default_format_type, net::UnescapeRule::NORMAL, L"",
{"Simple URL", "", default_format_type,
net::UnescapeRule::NORMAL, L"", 7},
{"With a port number and a reference",
"\xE3\x82\xB0", default_format_type,
net::UnescapeRule::NORMAL, L"\x30B0", 7},
// -------- IDN tests --------
{"Japanese IDN with ja", "",
default_format_type, net::UnescapeRule::NORMAL,
L"http://\x671d\x65e5\x3042\x3055\", 7},
{"mailto: with Japanese IDN", "",
default_format_type, net::UnescapeRule::NORMAL,
// GURL doesn't assume an email address's domain part as a host name.
L"", 7},
{"file: with Japanese IDN", "file://",
default_format_type, net::UnescapeRule::NORMAL,
L"file://\x671d\x65e5\x3042\x3055\", 7},
{"ftp: with Japanese IDN", "",
default_format_type, net::UnescapeRule::NORMAL,
L"ftp://\x671d\x65e5\x3042\x3055\", 6},
// -------- omit_username_password flag tests --------
{"With username and password, omit_username_password=false",
"", kFormatUrlOmitNothing,
net::UnescapeRule::NORMAL, L"", 19},
{"With username and password, omit_username_password=true",
"", default_format_type,
net::UnescapeRule::NORMAL, L"", 7},
{"With username and no password", "",
default_format_type, net::UnescapeRule::NORMAL,
L"", 7},
{"Just '@' without username and password", "",
default_format_type, net::UnescapeRule::NORMAL,
L"", 7},
// GURL doesn't think local-part of an email address is username for URL.
{"mailto:, omit_username_password=true", "",
default_format_type, net::UnescapeRule::NORMAL,
L"", 7},
// -------- unescape flag tests --------
{"Do not unescape",
default_format_type, net::UnescapeRule::NONE,
// GURL parses %-encoded hostnames into Punycode.
{"Unescape normally",
default_format_type, net::UnescapeRule::NORMAL,
{"Unescape normally with BiDi control character",
default_format_type, net::UnescapeRule::NORMAL,
L"", 7},
{"Unescape normally including unescape spaces",
default_format_type, net::UnescapeRule::SPACES,
L" World", 7},
{"unescape=true with some special characters",
kFormatUrlOmitNothing, net::UnescapeRule::NORMAL,
L"", 25},
// Disabled: the resultant URL becomes "...user%253A:%2540passwd...".
// -------- omit http: --------
{"omit http with user name", "",
kFormatUrlOmitAll, net::UnescapeRule::NORMAL, L"", 0},
{"omit http", "", kFormatUrlOmitHTTP,
net::UnescapeRule::NORMAL, L"", 0},
{"omit http with https", "",
kFormatUrlOmitHTTP, net::UnescapeRule::NORMAL,
L"", 8},
{"omit http starts with ftp.", "",
kFormatUrlOmitHTTP, net::UnescapeRule::NORMAL, L"",
// -------- omit trailing slash on bare hostname --------
{"omit slash when it's the entire path", "",
kFormatUrlOmitTrailingSlashOnBareHostname, net::UnescapeRule::NORMAL,
L"", 7},
{"omit slash when there's a ref", "",
kFormatUrlOmitTrailingSlashOnBareHostname, net::UnescapeRule::NORMAL,
L"", 7},
{"omit slash when there's a query", "",
kFormatUrlOmitTrailingSlashOnBareHostname, net::UnescapeRule::NORMAL,
L"", 7},
{"omit slash when it's not the entire path", "",
net::UnescapeRule::NORMAL, L"", 7},
{"omit slash for nonstandard URLs", "data:/",
kFormatUrlOmitTrailingSlashOnBareHostname, net::UnescapeRule::NORMAL,
L"data:/", 5},
{"omit slash for file URLs", "file:///",
kFormatUrlOmitTrailingSlashOnBareHostname, net::UnescapeRule::NORMAL,
L"file:///", 7},
// -------- view-source: --------
{"view-source", "view-source:",
default_format_type, net::UnescapeRule::NORMAL,
L"view-source:http://\x30B0\x30FC\x30B0\", 19},
{"view-source of view-source",
default_format_type, net::UnescapeRule::NORMAL,
L"view-source:view-source:", 12},
// view-source should omit http and trailing slash where non-view-source
// would.
{"view-source omit http", "view-source:http://a.b/c",
kFormatUrlOmitAll, net::UnescapeRule::NORMAL, L"view-source:a.b/c", 12},
{"view-source omit http starts with ftp.", "view-source:http://ftp.b/c",
kFormatUrlOmitAll, net::UnescapeRule::NORMAL,
L"view-source:http://ftp.b/c", 19},
{"view-source omit slash when it's the entire path",
"view-source:http://a.b/", kFormatUrlOmitAll,
net::UnescapeRule::NORMAL, L"view-source:a.b", 12},
for (size_t i = 0; i < arraysize(tests); ++i) {
size_t prefix_len;
base::string16 formatted = FormatUrl(
GURL(tests[i].input), tests[i].format_types, tests[i].escape_rules,
nullptr, &prefix_len, nullptr);
EXPECT_EQ(WideToUTF16(tests[i].output), formatted) << tests[i].description;
EXPECT_EQ(tests[i].prefix_len, prefix_len) << tests[i].description;
// Checks the url::Parsed that FormatUrl() fills in: for each scenario the
// formatted string is sliced with a component's |begin| and |len| (username,
// password, port, path, query, ref, and in some scenarios scheme) so the slice
// can be compared with the expected component text.
// NOTE(review): in this copy most FormatUrl( call openings and the leading
// EXPECT_EQ( halves of the component checks are absent, as is the closing
// brace of the test body -- restore from upstream.
TEST(UrlFormatterTest, FormatUrlParsed) {
// No unescape case.
url::Parsed parsed;
base::string16 formatted =
kFormatUrlOmitNothing, net::UnescapeRule::NONE,
&parsed, nullptr, nullptr);
L"/%E3%82%B0/?q=%E3%82%B0#\x30B0"), formatted);
formatted.substr(parsed.username.begin, parsed.username.len));
formatted.substr(parsed.password.begin, parsed.password.len));
formatted.substr(parsed.port.begin, parsed.port.len));
formatted.substr(parsed.path.begin, parsed.path.len));
formatted.substr(parsed.query.begin, parsed.query.len));
formatted.substr(parsed.ref.begin, parsed.ref.len));
// Unescape case.
formatted =
kFormatUrlOmitNothing, net::UnescapeRule::NORMAL, &parsed,
nullptr, nullptr);
L"/\x30B0/?q=\x30B0#\x30B0"), formatted);
formatted.substr(parsed.username.begin, parsed.username.len));
formatted.substr(parsed.password.begin, parsed.password.len));
formatted.substr(parsed.port.begin, parsed.port.len));
formatted.substr(parsed.path.begin, parsed.path.len));
formatted.substr(parsed.query.begin, parsed.query.len));
formatted.substr(parsed.ref.begin, parsed.ref.len));
// Omit_username_password + unescape case.
formatted =
kFormatUrlOmitUsernamePassword, net::UnescapeRule::NORMAL,
&parsed, nullptr, nullptr);
L"/\x30B0/?q=\x30B0#\x30B0"), formatted);
formatted.substr(parsed.port.begin, parsed.port.len));
formatted.substr(parsed.path.begin, parsed.path.len));
formatted.substr(parsed.query.begin, parsed.query.len));
formatted.substr(parsed.ref.begin, parsed.ref.len));
// View-source case.
formatted =
kFormatUrlOmitUsernamePassword, net::UnescapeRule::NORMAL,
&parsed, nullptr, nullptr);
formatted.substr(parsed.scheme.begin, parsed.scheme.len));
formatted.substr(parsed.port.begin, parsed.port.len));
formatted.substr(parsed.path.begin, parsed.path.len));
formatted.substr(parsed.query.begin, parsed.query.len));
formatted.substr(parsed.ref.begin, parsed.ref.len));
// omit http case.
formatted = FormatUrl(GURL("http://host:8000/a?b=c#d"), kFormatUrlOmitHTTP,
net::UnescapeRule::NORMAL, &parsed, nullptr, nullptr);
EXPECT_EQ(WideToUTF16(L"host:8000/a?b=c#d"), formatted);
formatted.substr(parsed.port.begin, parsed.port.len));
formatted.substr(parsed.path.begin, parsed.path.len));
formatted.substr(parsed.query.begin, parsed.query.len));
formatted.substr(parsed.ref.begin, parsed.ref.len));
// omit http starts with ftp case.
formatted = FormatUrl(GURL(""),
kFormatUrlOmitHTTP, net::UnescapeRule::NORMAL, &parsed,
nullptr, nullptr);
EXPECT_EQ(WideToUTF16(L""), formatted);
formatted.substr(parsed.scheme.begin, parsed.scheme.len));
formatted.substr(parsed.port.begin, parsed.port.len));
formatted.substr(parsed.path.begin, parsed.path.len));
formatted.substr(parsed.query.begin, parsed.query.len));
formatted.substr(parsed.ref.begin, parsed.ref.len));
// omit http starts with 'f' case.
formatted = FormatUrl(GURL("http://f/"), kFormatUrlOmitHTTP,
net::UnescapeRule::NORMAL, &parsed, nullptr, nullptr);
EXPECT_EQ(WideToUTF16(L"f/"), formatted);
formatted.substr(parsed.path.begin, parsed.path.len));
// Make sure that calling FormatUrl on a GURL and then converting back to a GURL
// results in the original GURL, for each ASCII character in the path.
// The loop covers the character codes 32 through 127.
// NOTE(review): in this copy the base URL literal is empty, the right-hand
// operand of the string concatenation is absent, and the closing braces of the
// loop and the test body are missing -- restore from upstream.
TEST(UrlFormatterTest, FormatUrlRoundTripPathASCII) {
for (unsigned char test_char = 32; test_char < 128; ++test_char) {
GURL url(std::string("") +
size_t prefix_len;
base::string16 formatted =
FormatUrl(url, kFormatUrlOmitUsernamePassword,
net::UnescapeRule::NORMAL, nullptr, &prefix_len, nullptr);
EXPECT_EQ(url.spec(), GURL(formatted).spec());
// Make sure that calling FormatUrl on a GURL and then converting back to a GURL
// results in the original GURL, for each escaped ASCII character in the path.
// The loop covers the character codes 32 through 127; each one is appended to
// the base URL as two hex digits.
TEST(UrlFormatterTest, FormatUrlRoundTripPathEscaped) {
  for (unsigned char test_char = 32; test_char < 128; ++test_char) {
    // NOTE(review): the base URL literal is empty in this copy. Presumably it
    // should be a URL whose path ends in '%', so that the hex digits appended
    // below form a percent-escape -- confirm against upstream.
    std::string original_url("");
    original_url.append(base::HexEncode(&test_char, 1));
    GURL url(original_url);

    // |prefix_len| is required by the call but not inspected by this test.
    size_t prefix_len;
    base::string16 formatted = FormatUrl(url, kFormatUrlOmitUsernamePassword,
        net::UnescapeRule::NORMAL, nullptr, &prefix_len, nullptr);
    EXPECT_EQ(url.spec(), GURL(formatted).spec());
  }
}
// Make sure that calling FormatUrl on a GURL and then converting back to a GURL
// results in the original GURL, for each ASCII character in the query.
// The loop covers the character codes 32 through 127.
// NOTE(review): in this copy the base URL literal is empty, the right-hand
// operand of the string concatenation is absent, and the closing braces of the
// loop and the test body are missing -- restore from upstream.
TEST(UrlFormatterTest, FormatUrlRoundTripQueryASCII) {
for (unsigned char test_char = 32; test_char < 128; ++test_char) {
GURL url(std::string("") +
size_t prefix_len;
base::string16 formatted =
FormatUrl(url, kFormatUrlOmitUsernamePassword,
net::UnescapeRule::NORMAL, nullptr, &prefix_len, nullptr);
EXPECT_EQ(url.spec(), GURL(formatted).spec());
// Make sure that calling FormatUrl on a GURL and then converting back to a GURL
// only results in a different GURL for certain characters.
// The loop covers the character codes 0 through 127, each appended to the base
// URL as two hex digits. A non-NUL character found in |kUnescapedCharacters|
// is expected NOT to round-trip; every other character is expected to.
// NOTE(review): in this copy the initializer of |kUnescapedCharacters| and the
// base URL literal are absent, and the closing braces of the if/else, the loop
// and the test body are missing -- restore from upstream.
TEST(UrlFormatterTest, FormatUrlRoundTripQueryEscaped) {
// A full list of characters which FormatURL should unescape and GURL should
// not escape again, when they appear in a query string.
const char kUnescapedCharacters[] =
for (unsigned char test_char = 0; test_char < 128; ++test_char) {
std::string original_url("");
original_url.append(base::HexEncode(&test_char, 1));
GURL url(original_url);
size_t prefix_len;
base::string16 formatted =
FormatUrl(url, kFormatUrlOmitUsernamePassword,
net::UnescapeRule::NORMAL, nullptr, &prefix_len, nullptr);
if (test_char &&
strchr(kUnescapedCharacters, static_cast<char>(test_char))) {
EXPECT_NE(url.spec(), GURL(formatted).spec());
} else {
EXPECT_EQ(url.spec(), GURL(formatted).spec());
// Drives CheckAdjustedOffsets() with a series of URLs. Each scenario declares
// a table of expected output offsets, one per input character; kNpos marks an
// input position that is expected to have no counterpart in the formatted
// string. The first call passes an empty URL and therefore no table.
// NOTE(review): in this copy the closing "};" of every table, most
// CheckAdjustedOffsets( call openings with their URL literals, and the closing
// brace of the test body are absent -- restore from upstream.
TEST(UrlFormatterTest, FormatUrlWithOffsets) {
CheckAdjustedOffsets(std::string(), kFormatUrlOmitNothing,
net::UnescapeRule::NORMAL, nullptr);
const size_t basic_offsets[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25
kFormatUrlOmitNothing, net::UnescapeRule::NORMAL,
const size_t omit_auth_offsets_1[] = {
0, 1, 2, 3, 4, 5, 6, 7, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 7,
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
net::UnescapeRule::NORMAL, omit_auth_offsets_1);
const size_t omit_auth_offsets_2[] = {
0, 1, 2, 3, 4, 5, 6, 7, kNpos, kNpos, kNpos, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21
net::UnescapeRule::NORMAL, omit_auth_offsets_2);
const size_t dont_omit_auth_offsets[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos,
kNpos, kNpos, 11, 12, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos,
kNpos, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
30, 31
// Unescape to "http://foo\x30B0:\".
kFormatUrlOmitNothing, net::UnescapeRule::NORMAL,
const size_t view_source_offsets[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, kNpos,
kNpos, kNpos, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33
net::UnescapeRule::NORMAL, view_source_offsets);
const size_t idn_hostname_offsets_1[] = {
0, 1, 2, 3, 4, 5, 6, 7, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos,
kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 12,
13, 14, 15, 16, 17, 18, 19
// Convert punycode to "http://\x671d\x65e5\x3042\x3055\".
kFormatUrlOmitNothing, net::UnescapeRule::NORMAL,
const size_t idn_hostname_offsets_2[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, kNpos, kNpos, kNpos, kNpos, kNpos,
kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 14, 15, kNpos, kNpos, kNpos,
kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos,
kNpos, 19, 20, 21, 22, 23, 24
// Convert punycode to
// "http://test.\x89c6\x9891.\x5317\x4eac\x5927\x5b78.test/".
net::UnescapeRule::NORMAL, idn_hostname_offsets_2);
const size_t unescape_offsets[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, kNpos, kNpos, 26, 27, 28, 29, 30, kNpos, kNpos, kNpos,
kNpos, kNpos, kNpos, kNpos, kNpos, 31, kNpos, kNpos, kNpos, kNpos, kNpos,
kNpos, kNpos, kNpos, 32, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos,
kNpos, 33, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos
// Unescape to " bar/\x30B0\x30FC\x30B0\x30EB".
kFormatUrlOmitNothing, net::UnescapeRule::SPACES, unescape_offsets);
const size_t ref_offsets[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, kNpos, kNpos, 32, kNpos, kNpos,
// Unescape to "\x30B0\x30B0z".
kFormatUrlOmitNothing, net::UnescapeRule::NORMAL, ref_offsets);
const size_t omit_http_offsets[] = {
0, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14
CheckAdjustedOffsets("", kFormatUrlOmitHTTP,
net::UnescapeRule::NORMAL, omit_http_offsets);
const size_t omit_http_start_with_ftp_offsets[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
CheckAdjustedOffsets("", kFormatUrlOmitHTTP,
const size_t omit_all_offsets[] = {
0, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 0, kNpos, kNpos, kNpos, kNpos,
0, 1, 2, 3, 4, 5, 6, 7
CheckAdjustedOffsets("", kFormatUrlOmitAll,
net::UnescapeRule::NORMAL, omit_all_offsets);
} // namespace
} // namespace url_formatter