blob: 75b0fb96c07680b2de1dc0b5e06d849c7edd5ef2 [file] [log] [blame]
// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "net/base/url_util.h"
#include "build/build_config.h"
#if BUILDFLAG(IS_POSIX)
#include <netinet/in.h>
#elif BUILDFLAG(IS_WIN)
#include <ws2tcpip.h>
#endif
#include <optional>
#include <string_view>
#include "base/check_op.h"
#include "base/containers/fixed_flat_set.h"
#include "base/strings/escape.h"
#include "base/strings/strcat.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
#include "net/base/ip_address.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "url/gurl.h"
#include "url/scheme_host_port.h"
#include "url/url_canon.h"
#include "url/url_canon_internal.h"
#include "url/url_canon_ip.h"
#include "url/url_constants.h"
#include "url/url_util.h"
namespace net {
namespace {
// Returns true if |c| is an ASCII lowercase letter or an ASCII digit.
// Uppercase letters are deliberately not accepted: by the time this is
// called, uppercase characters have already been normalized away.
bool IsHostCharAlphanumeric(char c) {
  const bool is_lowercase_letter = c >= 'a' && c <= 'z';
  const bool is_digit = c >= '0' && c <= '9';
  return is_lowercase_letter || is_digit;
}
// Returns true if |host| ends with ".localhost". The comparison ignores ASCII
// case.
bool IsNormalizedLocalhostTLD(std::string_view host) {
  constexpr std::string_view kLocalhostSuffix = ".localhost";
  return base::EndsWith(host, kLocalhostSuffix,
                        base::CompareCase::INSENSITIVE_ASCII);
}
// Helper for GetIdentityFromURL. Converts |escaped_text| to UTF-16.
//
// If |escaped_text| can be "safely unescaped" and the unescaped bytes are
// valid UTF-8, the unescaped string is returned as UTF-16. In every other case
// the text is converted to UTF-16 as-is, escapes included. "Safely unescaped"
// is defined as having no escaped character between '0x00' and '0x1F',
// inclusive.
std::u16string UnescapeIdentityString(std::string_view escaped_text) {
  std::string decoded_bytes;
  std::u16string decoded_utf16;

  // The UTF-8 conversion is only attempted when unescaping succeeded.
  const bool decoded_ok =
      base::UnescapeBinaryURLComponentSafe(
          escaped_text, false /* fail_on_path_separators */, &decoded_bytes) &&
      base::UTF8ToUTF16(decoded_bytes.data(), decoded_bytes.length(),
                        &decoded_utf16);
  if (decoded_ok) {
    return decoded_utf16;
  }

  // Fall back to the original, still-escaped text.
  return base::UTF8ToUTF16(escaped_text);
}
} // namespace
// Returns a copy of |url| whose query has "<name>=<value>" appended. Both
// |name| and |value| are escaped with base::EscapeQueryParamValue. An "&"
// separator is inserted when |url| already has a non-empty query. Existing
// parameters are left untouched, even if one already uses |name|.
GURL AppendQueryParameter(const GURL& url,
                          std::string_view name,
                          std::string_view value) {
  std::string new_query(url.query());
  const std::string escaped_name = base::EscapeQueryParamValue(name, true);
  const std::string escaped_value = base::EscapeQueryParamValue(value, true);

  if (!new_query.empty()) {
    new_query += "&";
  }
  new_query += escaped_name;
  new_query += "=";
  new_query += escaped_value;

  GURL::Replacements query_replacement;
  query_replacement.SetQueryStr(new_query);
  return url.ReplaceComponents(query_replacement);
}
// Returns a copy of |url| in which the query parameter |name| is set to
// |value|.
//
// * If |value| has a value, the first key/value pair whose key equals the
//   escaped |name| is rewritten as "<name>=<value>". If no such pair exists,
//   the pair is appended at the end.
// * If |value| is std::nullopt, the first pair whose key equals the escaped
//   |name| is dropped and nothing is appended.
//
// Only the first matching pair is affected; later pairs with the same key are
// copied through unchanged. Keys in the existing query are compared in their
// raw (still escaped) form against the escaped |name|.
GURL AppendOrReplaceQueryParameter(const GURL& url,
                                   std::string_view name,
                                   std::optional<std::string_view> value) {
  const std::string escaped_name = base::EscapeQueryParamValue(name, true);
  const bool has_new_value = value.has_value();
  const std::string escaped_value =
      has_new_value ? base::EscapeQueryParamValue(*value, true)
                    : std::string();
  // Only emitted when |has_new_value| is true.
  const std::string replacement_pair = escaped_name + "=" + escaped_value;

  std::string rebuilt_query;
  // Appends |pair| to |rebuilt_query|, adding "&" when something was already
  // emitted.
  const auto append_pair = [&rebuilt_query](std::string_view pair) {
    if (!rebuilt_query.empty()) {
      rebuilt_query += "&";
    }
    rebuilt_query += pair;
  };

  const std::string_view original_query = url.query_piece();
  url::Component remaining(0, original_query.size());
  url::Component key_span;
  url::Component value_span;
  bool already_replaced = false;
  while (url::ExtractQueryKeyValue(original_query, &remaining, &key_span,
                                   &value_span)) {
    const bool is_target =
        !already_replaced &&
        original_query.substr(key_span.begin, key_span.len) == escaped_name;
    if (!is_target) {
      // Copy the pair through verbatim, from the start of its key to the end
      // of its value.
      append_pair(original_query.substr(
          key_span.begin, value_span.end() - key_span.begin));
      continue;
    }

    // First occurrence of the key: either rewrite it or drop it.
    already_replaced = true;
    if (has_new_value) {
      append_pair(replacement_pair);
    }
  }

  // The key was never seen, so append the pair (unless this was a removal).
  if (!already_replaced && has_new_value) {
    append_pair(replacement_pair);
  }

  GURL::Replacements query_replacement;
  query_replacement.SetQueryStr(rebuilt_query);
  return url.ReplaceComponents(query_replacement);
}
// Returns a copy of |url| whose ref (fragment) component is set to |ref|,
// whether or not |url| already had one.
GURL AppendOrReplaceRef(const GURL& url, const std::string_view& ref) {
  GURL::Replacements ref_replacement;
  ref_replacement.SetRefStr(ref);
  const GURL with_new_ref = url.ReplaceComponents(ref_replacement);
  return with_new_ref;
}
// Starts iterating over the query of |url|. An invalid |url| puts the iterator
// straight into the at-end state. Otherwise the query component is recorded
// and the iterator is advanced to the first key/value pair, if there is one.
QueryIterator::QueryIterator(const GURL& url)
    : url_(url), at_end_(!url.is_valid()) {
  if (at_end_) {
    return;
  }
  query_ = url.parsed_for_possibly_invalid_spec().query;
  Advance();
}
// Compiler-generated destructor; this file adds no custom cleanup.
QueryIterator::~QueryIterator() = default;
// Returns the raw key of the current pair as a view into the URL's spec, or an
// empty view when the key is empty. Must not be called once the iterator is at
// the end.
std::string_view QueryIterator::GetKey() const {
  DCHECK(!at_end_);
  if (!key_.is_nonempty()) {
    return std::string_view();
  }
  const std::string_view spec(url_->spec());
  return spec.substr(key_.begin, key_.len);
}
// Returns the raw (still escaped) value of the current pair as a view into the
// URL's spec, or an empty view when the value is empty. Must not be called
// once the iterator is at the end.
std::string_view QueryIterator::GetValue() const {
  DCHECK(!at_end_);
  if (!value_.is_nonempty()) {
    return std::string_view();
  }
  const std::string_view spec(url_->spec());
  return spec.substr(value_.begin, value_.len);
}
// Returns the unescaped value of the current pair. The result is cached in
// |unescaped_value_|, which Advance() clears, so the returned reference is
// only meaningful until the iterator moves on. Must not be called once the
// iterator is at the end.
const std::string& QueryIterator::GetUnescapedValue() {
  DCHECK(!at_end_);

  // Nothing to decode for an empty value, and a non-empty cache means the
  // current value has already been decoded.
  if (!value_.is_nonempty() || !unescaped_value_.empty()) {
    return unescaped_value_;
  }

  const auto unescape_rules =
      base::UnescapeRule::SPACES | base::UnescapeRule::PATH_SEPARATORS |
      base::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS |
      base::UnescapeRule::REPLACE_PLUS_WITH_SPACE;
  unescaped_value_ = base::UnescapeURLComponent(GetValue(), unescape_rules);
  return unescaped_value_;
}
// Returns true once there are no more key/value pairs to visit. This is also
// the initial state when the iterator was constructed from an invalid URL.
bool QueryIterator::IsAtEnd() const {
return at_end_;
}
// Moves to the next key/value pair in the query. The current key and value
// and the cached unescaped value are cleared first. The iterator is marked as
// at-end when no further pair can be extracted. Must not be called once the
// iterator is at the end.
void QueryIterator::Advance() {
  DCHECK(!at_end_);

  unescaped_value_.clear();
  value_.reset();
  key_.reset();

  const bool found_pair =
      url::ExtractQueryKeyValue(url_->spec(), &query_, &key_, &value_);
  at_end_ = !found_pair;
}
bool GetValueForKeyInQuery(const GURL& url,
std::string_view search_key,
std::string* out_value) {
for (QueryIterator it(url); !it.IsAtEnd(); it.Advance()) {
if (it.GetKey() == search_key) {
*out_value = it.GetUnescapedValue();
return true;
}
}
return false;
}
bool ParseHostAndPort(std::string_view input, std::string* host, int* port) {
if (input.empty())
return false;
url::Component auth_component(0, input.size());
url::Component username_component;
url::Component password_component;
url::Component hostname_component;
url::Component port_component;
// `input` is not NUL-terminated, so `input.data()` must be accompanied by a
// length. In these calls, `url::Component` provides an offset and length.
url::ParseAuthority(input.data(), auth_component, &username_component,
&password_component, &hostname_component,
&port_component);
// There shouldn't be a username/password.
if (username_component.is_valid() || password_component.is_valid())
return false;
if (hostname_component.is_empty())
return false; // Failed parsing.
int parsed_port_number = -1;
if (port_component.is_nonempty()) {
parsed_port_number = url::ParsePort(input.data(), port_component);
// If parsing failed, port_number will be either PORT_INVALID or
// PORT_UNSPECIFIED, both of which are negative.
if (parsed_port_number < 0)
return false; // Failed parsing the port number.
}
if (port_component.len == 0)
return false; // Reject inputs like "foo:"
unsigned char tmp_ipv6_addr[16];
// If the hostname starts with a bracket, it is either an IPv6 literal or
// invalid. If it is an IPv6 literal then strip the brackets.
if (hostname_component.len > 0 && input[hostname_component.begin] == '[') {
if (input[hostname_component.end() - 1] == ']' &&
url::IPv6AddressToNumber(input.data(), hostname_component,
tmp_ipv6_addr)) {
// Strip the brackets.
hostname_component.begin++;
hostname_component.len -= 2;
} else {
return false;
}
}
// Pass results back to caller.
*host = std::string(
input.substr(hostname_component.begin, hostname_component.len));
*port = parsed_port_number;
return true; // Success.
}
// Returns "<host>:<port>" for |url|, where the port is the URL's effective
// integer port.
std::string GetHostAndPort(const GURL& url) {
  // GURL::host() already wraps IPv6 literals in brackets, so appending
  // ":<port>" directly is safe.
  const std::string host = url.host();
  const int effective_port = url.EffectiveIntPort();
  return base::StringPrintf("%s:%d", host.c_str(), effective_port);
}
// Returns "<host>:<port>" when |url| has a port, and just "<host>" otherwise.
std::string GetHostAndOptionalPort(const GURL& url) {
  if (!url.has_port()) {
    return url.host();
  }
  // GURL::host() already wraps IPv6 literals in brackets, so appending
  // ":<port>" directly is safe.
  return base::StringPrintf("%s:%s", url.host().c_str(), url.port().c_str());
}
// Returns "<host>:<port>" for |scheme_host_port| when its port differs from
// the default port of its scheme, and just "<host>" when the port is the
// scheme's default.
NET_EXPORT std::string GetHostAndOptionalPort(
    const url::SchemeHostPort& scheme_host_port) {
  const auto& scheme = scheme_host_port.scheme();
  const int scheme_default_port = url::DefaultPortForScheme(
      scheme.data(), static_cast<int>(scheme.length()));

  if (scheme_default_port == scheme_host_port.port()) {
    return scheme_host_port.host();
  }
  return base::StringPrintf("%s:%i", scheme_host_port.host().c_str(),
                            scheme_host_port.port());
}
// Returns |host| with a single trailing '.' removed, if it has one. A host
// consisting of only "." is returned unchanged, and at most one dot is
// stripped.
std::string TrimEndingDot(std::string_view host) {
  const bool has_trailing_dot = host.size() > 1 && host.back() == '.';
  if (has_trailing_dot) {
    return std::string(host.substr(0, host.size() - 1));
  }
  return std::string(host);
}
// Returns the host of |url| with any single trailing dot removed. When |url|
// has no host, returns the full spec instead.
std::string GetHostOrSpecFromURL(const GURL& url) {
  if (url.has_host()) {
    return TrimEndingDot(url.host_piece());
  }
  return url.spec();
}
// Returns everything after the first '.' in |domain|, i.e. |domain| with its
// leftmost label removed. Returns an empty string when |domain| contains no
// dot.
std::string GetSuperdomain(std::string_view domain) {
  const std::string_view::size_type first_dot = domain.find('.');
  if (first_dot == std::string_view::npos) {
    return std::string();
  }
  domain.remove_prefix(first_dot + 1);
  return std::string(domain);
}
// Returns true if |subdomain| is equal to |superdomain|, or if |subdomain|
// ends with |superdomain| and the character just before that suffix is a '.'.
// Comparison is exact (case-sensitive).
bool IsSubdomainOf(std::string_view subdomain, std::string_view superdomain) {
  const size_t sub_length = subdomain.length();
  const size_t super_length = superdomain.length();

  // A subdomain that is not strictly longer can only match by being
  // identical.
  if (sub_length <= super_length) {
    return subdomain == superdomain;
  }

  // |superdomain| must be a suffix of |subdomain|...
  const size_t prefix_length = sub_length - super_length;  // Always >= 1.
  if (subdomain.substr(prefix_length) != superdomain) {
    return false;
  }

  // ...and the last character outside that suffix must be a label separator.
  return subdomain[prefix_length - 1] == '.';
}
// Canonicalizes |host| with url::CanonicalizeHostVerbose and returns the
// canonical form. |*host_info| is filled in by that call. Returns an empty
// string when the canonical host is empty or when |host_info->family| is
// BROKEN; |*host_info| is still populated in that case.
std::string CanonicalizeHost(std::string_view host,
url::CanonHostInfo* host_info) {
// Try to canonicalize the host.
const url::Component raw_host_component(0, static_cast<int>(host.length()));
std::string canon_host;
// |canon_host_output| writes into |canon_host|.
url::StdStringCanonOutput canon_host_output(&canon_host);
// A url::StdStringCanonOutput starts off with a zero length buffer. The
// first time through Grow() immediately resizes it to 32 bytes, incurring
// a malloc. With libcxx a 22 byte or smaller request can be accommodated
// within the std::string itself (i.e. no malloc occurs). Start the buffer
// off at the max size to avoid a malloc on short strings.
// NOTE: To ensure the final size is correctly reflected, it's necessary
// to call Complete() which will adjust the size to the actual bytes written.
// This is handled below for success cases, while failure cases discard all
// the output.
const int kCxxMaxStringBufferSizeWithoutMalloc = 22;
canon_host_output.Resize(kCxxMaxStringBufferSizeWithoutMalloc);
url::CanonicalizeHostVerbose(host.data(), raw_host_component,
&canon_host_output, host_info);
if (host_info->out_host.is_nonempty() &&
host_info->family != url::CanonHostInfo::BROKEN) {
// Success! Assert that there's no extra garbage.
// Complete() must run before |canon_host| is measured, because the buffer
// was pre-sized above.
canon_host_output.Complete();
DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length()));
} else {
// Empty host, or canonicalization failed. We'll return empty.
canon_host.clear();
}
return canon_host;
}
// Checks that the already-canonicalized |host| is well formed:
//   * it is non-empty and at most 253 characters long, or 254 when it ends
//     with a '.';
//   * every label is 1 to 63 characters long and consists only of lowercase
//     letters, digits, '-' and '_';
//   * the most recently started label begins with a letter or digit.
// A single trailing '.' after the last label is accepted.
bool IsCanonicalizedHostCompliant(std::string_view host) {
  if (host.empty()) {
    return false;
  }
  const size_t max_host_size = host.back() == '.' ? 254 : 253;
  if (host.size() > max_host_size) {
    return false;
  }

  const auto is_label_char = [](char c) {
    return IsHostCharAlphanumeric(c) || c == '-' || c == '_';
  };

  bool inside_label = false;
  bool last_label_started_alphanumeric = false;
  // Characters seen since the last separator, including the separator that
  // terminates the label.
  size_t chars_in_label = 0;
  for (const char c : host) {
    ++chars_in_label;

    if (inside_label && c == '.') {
      // Label should not be empty or longer than 63 characters (+1 for the
      // '.' character included in |chars_in_label|).
      if (chars_in_label > 64 || chars_in_label == 1) {
        return false;
      }
      inside_label = false;
      chars_in_label = 0;
      continue;
    }

    // Any other position, including the first character of a label, must
    // hold a valid label character. A '.' at the start of a label fails here.
    if (!is_label_char(c)) {
      return false;
    }

    if (!inside_label) {
      last_label_started_alphanumeric = IsHostCharAlphanumeric(c);
      inside_label = true;
    }
  }

  // Check for a too-long final label when the host has no trailing '.'.
  if (chars_in_label > 63) {
    return false;
  }

  return last_label_started_alphanumeric;
}
// Returns true if |hostname| is considered "non-unique":
//   * an IPv4/IPv6 literal for which IPAddress::IsPubliclyRoutable() is false,
//     or
//   * a name for which no registry controlled domain is found (unknown and
//     private registries excluded).
// Returns false when |hostname| cannot be canonicalized, or when it is an IP
// literal that IPAddress cannot parse.
bool IsHostnameNonUnique(std::string_view hostname) {
// CanonicalizeHost requires surrounding brackets to parse an IPv6 address.
// Any input containing ':' is treated as a candidate IPv6 literal.
const std::string host_or_ip = hostname.find(':') != std::string::npos
? base::StrCat({"[", hostname, "]"})
: std::string(hostname);
url::CanonHostInfo host_info;
std::string canonical_name = CanonicalizeHost(host_or_ip, &host_info);
// If canonicalization fails, then the input is truly malformed. However,
// to avoid mis-reporting bad inputs as "non-unique", treat them as unique.
if (canonical_name.empty())
return false;
// If |hostname| is an IP address, check to see if it's in an IANA-reserved
// range reserved for non-publicly routable networks.
if (host_info.IsIPAddress()) {
IPAddress host_addr;
// NOTE(review): |host_info.out_host| was produced while canonicalizing
// |host_or_ip|, yet it is used here to slice the raw |hostname|. Presumably
// this relies on substr() clamping the length - confirm this is intended for
// inputs whose canonical form differs from the raw text.
if (!host_addr.AssignFromIPLiteral(hostname.substr(
host_info.out_host.begin, host_info.out_host.len))) {
return false;
}
switch (host_info.family) {
case url::CanonHostInfo::IPV4:
case url::CanonHostInfo::IPV6:
return !host_addr.IsPubliclyRoutable();
case url::CanonHostInfo::NEUTRAL:
case url::CanonHostInfo::BROKEN:
return false;
}
}
// Check for a registry controlled portion of |hostname|, ignoring private
// registries, as they already chain to ICANN-administered registries,
// and explicitly ignoring unknown registries.
//
// Note: This means that as new gTLDs are introduced on the Internet, they
// will be treated as non-unique until the registry controlled domain list
// is updated. However, because gTLDs are expected to provide significant
// advance notice to deprecate older versions of this code, this is an
// acceptable tradeoff.
return !registry_controlled_domains::HostHasRegistryControlledDomain(
canonical_name, registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
}
// Returns true if the host of |url|, taken without IPv6 brackets, is a
// localhost name or a loopback IP literal.
bool IsLocalhost(const GURL& url) {
  const std::string_view bare_host = url.HostNoBracketsPiece();
  return HostStringIsLocalhost(bare_host);
}
bool HostStringIsLocalhost(std::string_view host) {
IPAddress ip_address;
if (ip_address.AssignFromIPLiteral(host))
return ip_address.IsLoopback();
return IsLocalHostname(host);
}
// Returns |url| with its username, password and ref removed. |url| must be
// valid.
GURL SimplifyUrlForRequest(const GURL& url) {
  DCHECK(url.is_valid());

  // Fast path: with nothing to strip, return |url| as-is and avoid the
  // re-canonicalization done by ReplaceComponents.
  const bool has_strippable_parts =
      url.has_username() || url.has_password() || url.has_ref();
  if (!has_strippable_parts) {
    return url;
  }

  GURL::Replacements strip_components;
  strip_components.ClearUsername();
  strip_components.ClearPassword();
  strip_components.ClearRef();
  return url.ReplaceComponents(strip_components);
}
// Returns |url| with its WebSocket scheme swapped for the matching HTTP
// scheme: the wss scheme becomes the https scheme, and anything else (ws)
// becomes the http scheme. |url| must have a ws or wss scheme.
GURL ChangeWebSocketSchemeToHttpScheme(const GURL& url) {
  DCHECK(url.SchemeIsWSOrWSS());

  const bool is_secure = url.SchemeIs(url::kWssScheme);
  const auto http_scheme = is_secure ? url::kHttpsScheme : url::kHttpScheme;

  GURL::Replacements scheme_replacement;
  scheme_replacement.SetSchemeStr(http_scheme);
  return url.ReplaceComponents(scheme_replacement);
}
bool IsStandardSchemeWithNetworkHost(std::string_view scheme) {
// file scheme is special. Windows file share origins can have network hosts.
if (scheme == url::kFileScheme)
return true;
url::SchemeType scheme_type;
if (!url::GetStandardSchemeType(
scheme.data(), url::Component(0, scheme.length()), &scheme_type)) {
return false;
}
return scheme_type == url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION ||
scheme_type == url::SCHEME_WITH_HOST_AND_PORT;
}
// Extracts the username and password embedded in |url| and stores them, as
// UTF-16, in |*username| and |*password|. Each value is passed through
// UnescapeIdentityString.
void GetIdentityFromURL(const GURL& url,
                        std::u16string* username,
                        std::u16string* password) {
  const auto escaped_username = url.username();
  *username = UnescapeIdentityString(escaped_username);

  const auto escaped_password = url.password();
  *password = UnescapeIdentityString(escaped_password);
}
// Returns true if the host of |url| is considered a Google host by
// IsGoogleHost.
bool HasGoogleHost(const GURL& url) {
  const std::string_view host = url.host_piece();
  return IsGoogleHost(host);
}
// Returns true if |host| ends with one of a fixed set of Google-owned domain
// suffixes. Every suffix begins with '.', so the bare domains themselves
// (e.g. "google.com") do not match. The comparison is case-sensitive.
bool IsGoogleHost(std::string_view host) {
  static constexpr std::string_view kGoogleHostSuffixes[] = {
      ".google.com",
      ".youtube.com",
      ".gmail.com",
      ".doubleclick.net",
      ".gstatic.com",
      ".googlevideo.com",
      ".googleusercontent.com",
      ".googlesyndication.com",
      ".google-analytics.com",
      ".googleadservices.com",
      ".googleapis.com",
      ".ytimg.com",
  };
  // A case-sensitive comparison is sufficient and faster: the list above is
  // all lowercase, and a GURL's host name is always canonicalized to
  // lowercase as well.
  for (const std::string_view suffix : kGoogleHostSuffixes) {
    if (host.size() < suffix.size()) {
      continue;
    }
    const size_t tail_start = host.size() - suffix.size();
    if (host.compare(tail_start, suffix.size(), suffix) == 0) {
      return true;
    }
  }
  return false;
}
// Returns true if |host| is exactly "google.com" or "www.google.com", ignoring
// ASCII case.
bool IsGoogleHostWithAlpnH3(std::string_view host) {
  if (base::EqualsCaseInsensitiveASCII(host, "google.com")) {
    return true;
  }
  return base::EqualsCaseInsensitiveASCII(host, "www.google.com");
}
// Returns true if |host| is "localhost" or ends with ".localhost", ignoring
// ASCII case. A single trailing '.' on |host| is ignored.
bool IsLocalHostname(std::string_view host) {
  // Drop one trailing '.', if present.
  const bool has_trailing_dot = !host.empty() && host.back() == '.';
  if (has_trailing_dot) {
    host.remove_suffix(1);
  }

  if (base::EqualsCaseInsensitiveASCII(host, "localhost")) {
    return true;
  }
  return IsNormalizedLocalhostTLD(host);
}
std::string UnescapePercentEncodedUrl(std::string_view input) {
std::string result(input);
// Replace any 0x2B (+) with 0x20 (SP).
for (char& c : result) {
if (c == '+') {
c = ' ';
}
}
// Run UTF-8 decoding without BOM on the percent-decoding.
url::RawCanonOutputT<char16_t> canon_output;
url::DecodeURLEscapeSequences(result, url::DecodeURLMode::kUTF8,
&canon_output);
return base::UTF16ToUTF8(canon_output.view());
}
} // namespace net