blob: 9834545f8bd9ebe80778183cdd7cd21970827717 [file] [log] [blame]
// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Functions to canonicalize non-special URLs.
#include "url/url_canon.h"
#include "url/url_canon_internal.h"
namespace url {
namespace {
template <typename CHAR>
bool DoCanonicalizeNonSpecialURL(const URLComponentSource<CHAR>& source,
const Parsed& parsed,
CharsetConverter* query_converter,
CanonOutput& output,
Parsed& new_parsed) {
// The implementation is similar to `DoCanonicalizeStandardURL()`, but there
// are many subtle differences. So we have a different function for
// canonicalizing non-special URLs.
// Since canonicalization is also used from url::ReplaceComponents(),
// we have to handle an invalid URL replacement here, such as:
// > const url = "git:///";
// > url.username = "x";
// > url.href
// "git:///" (this should not be "git://x@").
// Scheme: this will append the colon.
bool success = CanonicalizeScheme(source.scheme, parsed.scheme, &output,
bool have_authority =
(parsed.username.is_valid() || parsed.password.is_valid() || || parsed.port.is_valid());
// Non-special URL examples which should be carefully handled:
// | URL | parsed.user | | have_authority | Valid URL? |
// |----------+---------------+---------------+----------------+------------|
// | git:/a | invalid | invalid | false | valid |
// | git://@/ | valid (empty) | invalid | true | invalid |
// | git:/// | invalid | valid (empty) | true | valid |
if (have_authority) {
// Only write the authority separators when we have a scheme.
if (parsed.scheme.is_valid()) {
// Username and Password
// URL Standard:
// -
// -
// -
if ( {
// User info: the canonicalizer will handle the : and @.
success &= CanonicalizeUserInfo(
source.username, parsed.username, source.password, parsed.password,
&output, &new_parsed.username, &new_parsed.password);
} else {
// Host
if ( {
success &= CanonicalizeNonSpecialHost(,, output,;
} else {;
// URL is invalid if `have_authority` is true, but `` is
// invalid. Example: "git://@/".
success = false;
// Port
// URL Standard:
// -
// -
if ( {
success &= CanonicalizePort(source.port, parsed.port, PORT_UNSPECIFIED,
&output, &new_parsed.port);
} else {
} else {
// No authority, clear the components.;
// Path
if (parsed.path.is_valid()) {
if (! && parsed.path.is_empty()) {
// Handle an edge case: Replacing non-special path-only URL's pathname
// with an empty path.
// Path-only non-special URLs cannot have their paths erased.
// Example:
// > const url = new URL("git:/a");
// > url.pathname = '';
// > url.href
// => The result should be "git:/", instead of "git:".
// > url.pathname
// => The result should be "/", instead of "".
// URL Standard is, however,
// it would take some time to understand why url.pathname ends up as "/"
// in this case. Please read the URL Standard carefully to understand
// that.
new_parsed.path.begin = output.length();
new_parsed.path.len = output.length() - new_parsed.path.begin;
} else {
success &=
CanonicalizePath(source.path, parsed.path, CanonMode::kNonSpecialURL,
&output, &new_parsed.path);
if (! && new_parsed.path.is_valid() &&
.starts_with("//")) {
// To avoid path being treated as the host, prepend "/." to the path".
// Examples:
// > const url = new URL("git:/.//a");
// > url.href
// => The result should be "git:/.//a", instead of "git://a".
// > const url = new URL("git:/");
// > url.pathname = "/.//a"
// > url.href
// => The result should be "git:/.//a", instead of "git://a".
// URL Standard:
// > 3. If url’s host is null, url does not have an opaque path, url’s
// > path’s size is greater than 1, and url’s path[0] is the empty
// > string, then append U+002F (/) followed by U+002E (.) to output.
// Since the path length is unknown in advance, we post-process the new
// path here. This case is likely to be infrequent, so the performance
// impact should be minimal.
size_t prior_output_length = output.length();
output.Insert(new_parsed.path.begin, "/.");
// Adjust path.
new_parsed.path.begin += output.length() - prior_output_length;
} else {
// Query
CanonicalizeQuery(source.query, parsed.query, query_converter, &output,
// Ref: ignore failure for this, since the page can probably still be loaded.
CanonicalizeRef(source.ref, parsed.ref, &output, &new_parsed.ref);
// Carry over the flag for potentially dangling markup:
if (parsed.potentially_dangling_markup) {
new_parsed.potentially_dangling_markup = true;
return success;
} // namespace
// Canonicalizes the 8-bit non-special URL `spec` (of length `spec_len`) whose
// components are located by `parsed`, appending the canonical form to `output`
// and recording the new component locations in `new_parsed`.
//
// URLs flagged as having an opaque path are delegated to
// CanonicalizePathURL(); all others go through DoCanonicalizeNonSpecialURL().
// Returns whatever the chosen canonicalizer returns.
bool CanonicalizeNonSpecialURL(const char* spec,
                               int spec_len,
                               const Parsed& parsed,
                               CharsetConverter* query_converter,
                               CanonOutput& output,
                               Parsed& new_parsed) {
  // Carry over the flag.
  new_parsed.has_opaque_path = parsed.has_opaque_path;

  if (parsed.has_opaque_path) {
    return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed);
  }
  return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed,
                                     query_converter, output, new_parsed);
}
// 16-bit overload: canonicalizes the non-special URL `spec` (of length
// `spec_len`) whose components are located by `parsed`, appending the
// canonical form to `output` and recording the new component locations in
// `new_parsed`.
//
// URLs flagged as having an opaque path are delegated to
// CanonicalizePathURL(); all others go through DoCanonicalizeNonSpecialURL().
// Returns whatever the chosen canonicalizer returns.
bool CanonicalizeNonSpecialURL(const char16_t* spec,
                               int spec_len,
                               const Parsed& parsed,
                               CharsetConverter* query_converter,
                               CanonOutput& output,
                               Parsed& new_parsed) {
  // Carry over the flag.
  new_parsed.has_opaque_path = parsed.has_opaque_path;

  if (parsed.has_opaque_path) {
    return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed);
  }
  return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed,
                                     query_converter, output, new_parsed);
}
bool ReplaceNonSpecialURL(const char* base,
const Parsed& base_parsed,
const Replacements<char>& replacements,
CharsetConverter* query_converter,
CanonOutput& output,
Parsed& new_parsed) {
if (base_parsed.has_opaque_path) {
return ReplacePathURL(base, base_parsed, replacements, &output,
URLComponentSource<char> source(base);
Parsed parsed(base_parsed);
SetupOverrideComponents(base, replacements, &source, &parsed);
return DoCanonicalizeNonSpecialURL(source, parsed, query_converter, output,
// For 16-bit replacements, we turn all the replacements into UTF-8 so the
// regular code path can be used.
bool ReplaceNonSpecialURL(const char* base,
const Parsed& base_parsed,
const Replacements<char16_t>& replacements,
CharsetConverter* query_converter,
CanonOutput& output,
Parsed& new_parsed) {
if (base_parsed.has_opaque_path) {
return ReplacePathURL(base, base_parsed, replacements, &output,
RawCanonOutput<1024> utf8;
URLComponentSource<char> source(base);
Parsed parsed(base_parsed);
SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
return DoCanonicalizeNonSpecialURL(source, parsed, query_converter, output,
} // namespace url