// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Functions to canonicalize non-special URLs.
#include "url/url_canon.h"
#include "url/url_canon_internal.h"
namespace url {
namespace {
// Canonicalizes a non-special URL that does not have an opaque path. The
// canonical text is appended to `output` and the resulting component ranges
// are stored in `new_parsed`.
//
// Returns false when the scheme, user info, host, port or path canonicalizer
// reports failure, or when authority components are present without a valid
// host. Query and ref canonicalization never affect the return value.
//
// This is modelled on `DoCanonicalizeStandardURL()`, but non-special URLs
// differ in enough subtle ways that they get their own function.
//
// Canonicalization is also reached through url::ReplaceComponents(), so
// replacements that would yield an invalid URL have to be tolerated here:
//
// > const url = "git:///";
// > url.username = "x";
// > url.href
// "git:///" (this should not be "git://x@").
template <typename CHAR>
bool DoCanonicalizeNonSpecialURL(const URLComponentSource<CHAR>& source,
                                 const Parsed& parsed,
                                 CharsetConverter* query_converter,
                                 CanonOutput& output,
                                 Parsed& new_parsed) {
  // Opaque-path URLs must not reach this function.
  DCHECK(!parsed.has_opaque_path);

  // Scheme. The scheme canonicalizer is the one that emits the ':'.
  bool ok = CanonicalizeScheme(source.scheme, parsed.scheme, &output,
                               &new_parsed.scheme);

  // An authority exists as soon as any one of its components is valid.
  // Cases that need particular care:
  //
  // | URL      | parsed.user   | parsed.host   | authority? | Valid URL? |
  // |----------+---------------+---------------+------------+------------|
  // | git:/a   | invalid       | invalid       | no         | valid      |
  // | git://@/ | valid (empty) | invalid       | yes        | invalid    |
  // | git:///  | invalid       | valid (empty) | yes        | valid      |
  const bool no_authority =
      !parsed.username.is_valid() && !parsed.password.is_valid() &&
      !parsed.host.is_valid() && !parsed.port.is_valid();

  if (no_authority) {
    // Nothing to emit; make sure every authority component is cleared.
    new_parsed.host.reset();
    new_parsed.username.reset();
    new_parsed.password.reset();
    new_parsed.port.reset();
  } else {
    // The "//" separator is emitted only if there is a scheme in front of it.
    if (parsed.scheme.is_valid()) {
      output.push_back('/');
      output.push_back('/');
    }

    // Username and password are kept only when the host is non-empty.
    //
    // URL Standard:
    // - https://url.spec.whatwg.org/#cannot-have-a-username-password-port
    // - https://url.spec.whatwg.org/#dom-url-username
    // - https://url.spec.whatwg.org/#dom-url-password
    if (!parsed.host.is_nonempty()) {
      new_parsed.username.reset();
      new_parsed.password.reset();
    } else if (!CanonicalizeUserInfo(source.username, parsed.username,
                                     source.password, parsed.password, &output,
                                     &new_parsed.username,
                                     &new_parsed.password)) {
      // The user info canonicalizer takes care of ':' and '@' itself.
      ok = false;
    }

    // Host. Having an authority without a valid host makes the URL invalid,
    // e.g. "git://@/".
    if (!parsed.host.is_valid()) {
      new_parsed.host.reset();
      ok = false;
    } else if (!CanonicalizeNonSpecialHost(source.host, parsed.host, output,
                                           new_parsed.host)) {
      ok = false;
    }

    // Port, again only kept when the host is non-empty.
    //
    // URL Standard:
    // - https://url.spec.whatwg.org/#cannot-have-a-username-password-port
    // - https://url.spec.whatwg.org/#dom-url-port
    if (!parsed.host.is_nonempty()) {
      new_parsed.port.reset();
    } else if (!CanonicalizePort(source.port, parsed.port, PORT_UNSPECIFIED,
                                 &output, &new_parsed.port)) {
      ok = false;
    }
  }

  // Path
  if (!parsed.path.is_valid()) {
    new_parsed.path.reset();
  } else if (!parsed.host.is_valid() && parsed.path.is_empty()) {
    // Edge case: the pathname of a path-only non-special URL is being replaced
    // with an empty string. Such a URL cannot lose its path entirely, so a
    // single "/" is written instead.
    //
    // Example:
    //
    // > const url = new URL("git:/a");
    // > url.pathname = '';
    // > url.href
    // => The result should be "git:/", instead of "git:".
    // > url.pathname
    // => The result should be "/", instead of "".
    //
    // The governing text is https://url.spec.whatwg.org/#dom-url-pathname;
    // why url.pathname ends up as "/" is not obvious, so read the URL Standard
    // carefully before changing this.
    new_parsed.path.begin = output.length();
    output.push_back('/');
    new_parsed.path.len = output.length() - new_parsed.path.begin;
  } else {
    if (!CanonicalizePath(source.path, parsed.path, CanonMode::kNonSpecialURL,
                          &output, &new_parsed.path)) {
      ok = false;
    }

    const bool path_would_read_as_host =
        !parsed.host.is_valid() && new_parsed.path.is_valid() &&
        new_parsed.path.as_string_view_on(output.view().data())
            .starts_with("//");
    if (path_would_read_as_host) {
      // Without a host, a canonical path beginning with "//" would be mistaken
      // for an authority, so "/." is placed in front of the path.
      //
      // Examples:
      //
      // > const url = new URL("git:/.//a");
      // > url.href
      // => The result should be "git:/.//a", instead of "git://a".
      //
      // > const url = new URL("git:/");
      // > url.pathname = "/.//a"
      // > url.href
      // => The result should be "git:/.//a", instead of "git://a".
      //
      // URL Standard: https://url.spec.whatwg.org/#concept-url-serializer
      //
      // > 3. If url's host is null, url does not have an opaque path, url's
      // >    path's size is greater than 1, and url's path[0] is the empty
      // >    string, then append U+002F (/) followed by U+002E (.) to output.
      //
      // The canonical path is not known until it has been written, hence the
      // fix-up after the fact. This should be rare enough that the cost of the
      // insertion does not matter.
      const size_t length_before_insert = output.length();
      output.Insert(new_parsed.path.begin, "/.");
      // Move the start of the path forward by however much `output` grew.
      new_parsed.path.begin += output.length() - length_before_insert;
    }
  }

  // Query. Its outcome is not folded into the return value.
  CanonicalizeQuery(source.query, parsed.query, query_converter, &output,
                    &new_parsed.query);

  // Ref. Failure is deliberately ignored, since the page can probably still be
  // loaded.
  CanonicalizeRef(source.ref, parsed.ref, &output, &new_parsed.ref);

  // Propagate the potentially-dangling-markup flag when the input carries it.
  if (parsed.potentially_dangling_markup) {
    new_parsed.potentially_dangling_markup = true;
  }

  return ok;
}
} // namespace
bool CanonicalizeNonSpecialURL(const char* spec,
int spec_len,
const Parsed& parsed,
CharsetConverter* query_converter,
CanonOutput& output,
Parsed& new_parsed) {
// Carry over the flag.
new_parsed.has_opaque_path = parsed.has_opaque_path;
if (parsed.has_opaque_path) {
return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed);
}
return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed,
query_converter, output, new_parsed);
}
bool CanonicalizeNonSpecialURL(const char16_t* spec,
int spec_len,
const Parsed& parsed,
CharsetConverter* query_converter,
CanonOutput& output,
Parsed& new_parsed) {
// Carry over the flag.
new_parsed.has_opaque_path = parsed.has_opaque_path;
if (parsed.has_opaque_path) {
return CanonicalizePathURL(spec, spec_len, parsed, &output, &new_parsed);
}
return DoCanonicalizeNonSpecialURL(URLComponentSource(spec), parsed,
query_converter, output, new_parsed);
}
bool ReplaceNonSpecialURL(const char* base,
const Parsed& base_parsed,
const Replacements<char>& replacements,
CharsetConverter* query_converter,
CanonOutput& output,
Parsed& new_parsed) {
if (base_parsed.has_opaque_path) {
return ReplacePathURL(base, base_parsed, replacements, &output,
&new_parsed);
}
URLComponentSource<char> source(base);
Parsed parsed(base_parsed);
SetupOverrideComponents(base, replacements, &source, &parsed);
return DoCanonicalizeNonSpecialURL(source, parsed, query_converter, output,
new_parsed);
}
// For 16-bit replacements, we turn all the replacements into UTF-8 so the
// regular code path can be used.
bool ReplaceNonSpecialURL(const char* base,
const Parsed& base_parsed,
const Replacements<char16_t>& replacements,
CharsetConverter* query_converter,
CanonOutput& output,
Parsed& new_parsed) {
if (base_parsed.has_opaque_path) {
return ReplacePathURL(base, base_parsed, replacements, &output,
&new_parsed);
}
RawCanonOutput<1024> utf8;
URLComponentSource<char> source(base);
Parsed parsed(base_parsed);
SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
return DoCanonicalizeNonSpecialURL(source, parsed, query_converter, output,
new_parsed);
}
} // namespace url