blob: 52d2323d1d5d25d02517b0685c09eee688532748 [file] [log] [blame]
// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/40285824): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif
#include "components/url_pattern/url_pattern_util.h"
#include <string_view>
#include "base/numerics/safe_conversions.h"
#include "base/ranges/ranges.h"
#include "base/strings/strcat.h"
#include "base/strings/string_util.h"
#include "url/url_util.h"
namespace url_pattern {
namespace {
// Copies the bytes of `output` designated by `component` (offset
// `component.begin`, length `component.len`) into a freshly allocated
// std::string.
std::string StdStringFromCanonOutput(const url::CanonOutput& output,
                                     const url::Component& component) {
  const char* const first_byte = output.data() + component.begin;
  return std::string(first_byte, component.len);
}
// Returns true if `input` contains at least one of the forbidden host code
// points that this file explicitly rejects; returns false otherwise (including
// for empty input).
bool ContainsForbiddenHostnameCodePoint(std::string_view input) {
  // The full list of forbidden code points is defined at:
  //
  //  https://url.spec.whatwg.org/#forbidden-host-code-point
  //
  // We only check the code points the chromium URL parser incorrectly permits.
  // See: crbug.com/1065667#c18
  static constexpr std::string_view kForbiddenCodePoints = " #:<>@[]|";

  // A single standard-library search replaces a per-character comparison
  // chain; the set above does not contain NUL, so embedded NULs never match.
  return input.find_first_of(kForbiddenCodePoints) != std::string_view::npos;
}
} // namespace
// Encodes a protocol pattern fragment by running it through
// url::CanonicalizeScheme(). Empty input yields an empty string; input the
// canonicalizer rejects yields an InvalidArgument error.
absl::StatusOr<std::string> ProtocolEncodeCallback(std::string_view input) {
  if (input.empty()) {
    return std::string();
  }

  // The whole of `input` is treated as the scheme.
  const url::Component source_range(0,
                                    base::checked_cast<int>(input.size()));
  url::RawCanonOutputT<char> buffer;
  url::Component canon_scheme;
  if (url::CanonicalizeScheme(input.data(), source_range, &buffer,
                              &canon_scheme)) {
    return StdStringFromCanonOutput(buffer, canon_scheme);
  }

  return absl::InvalidArgumentError(
      base::StrCat({"Invalid protocol '", input, "'."}));
}
// Encodes a username pattern fragment via url::CanonicalizeUserInfo(),
// supplying an empty password. Empty input yields an empty string; a
// canonicalization failure yields an InvalidArgument error.
absl::StatusOr<std::string> UsernameEncodeCallback(std::string_view input) {
  if (input.empty()) {
    return std::string();
  }

  const url::Component username_range(0,
                                      base::checked_cast<int>(input.size()));
  const url::Component empty_password_range(0, 0);
  url::RawCanonOutputT<char> buffer;
  url::Component canon_username;
  url::Component canon_password;
  const bool ok = url::CanonicalizeUserInfo(
      input.data(), username_range, "", empty_password_range, &buffer,
      &canon_username, &canon_password);
  if (ok) {
    // Only the username portion of the canonical output is returned.
    return StdStringFromCanonOutput(buffer, canon_username);
  }

  return absl::InvalidArgumentError(
      base::StrCat({"Invalid username pattern '", input, "'."}));
}
// Encodes a password pattern fragment via url::CanonicalizeUserInfo(),
// supplying an empty username. Empty input yields an empty string; a
// canonicalization failure yields an InvalidArgument error.
absl::StatusOr<std::string> PasswordEncodeCallback(std::string_view input) {
  if (input.empty()) {
    return std::string();
  }

  const url::Component empty_username_range(0, 0);
  const url::Component password_range(0,
                                      base::checked_cast<int>(input.size()));
  url::RawCanonOutputT<char> buffer;
  url::Component canon_username;
  url::Component canon_password;
  const bool ok = url::CanonicalizeUserInfo(
      "", empty_username_range, input.data(), password_range, &buffer,
      &canon_username, &canon_password);
  if (ok) {
    // Only the password portion of the canonical output is returned.
    return StdStringFromCanonOutput(buffer, canon_password);
  }

  return absl::InvalidArgumentError(
      base::StrCat({"Invalid password pattern '", input, "'."}));
}
// Encodes a fragment of an IPv6 hostname pattern. Every character must be a
// hex digit, `[`, `]` or `:`; anything else produces an InvalidArgument error
// naming the offending character. Accepted characters are ASCII-lowercased.
absl::StatusOr<std::string> IPv6HostnameEncodeCallback(std::string_view input) {
  // The URL parser's hostname canonicalizer is deliberately not used here: it
  // is too strict for an encoding callback, which may be handed only pieces of
  // the hostname pattern. For `[:address]`, for example, the `[` and `]`
  // strings arrive in separate calls, and a full IPv6 hostname parse of either
  // piece would always fail. To allow pattern syntax inside the brackets we
  // only check for valid characters and lowercase any hex digits.
  std::string lowered;
  lowered.reserve(input.size());

  for (const char ch : input) {
    const bool is_allowed =
        base::IsHexDigit(ch) || ch == '[' || ch == ']' || ch == ':';
    if (!is_allowed) {
      return absl::InvalidArgumentError(
          base::StrCat({"Invalid IPv6 hostname character '",
                        std::string_view(&ch, 1), "' in '", input, "'."}));
    }
    lowered.push_back(base::ToLowerASCII(ch));
  }

  return lowered;
}
// Encodes a hostname pattern fragment via url::CanonicalizeHost(). Empty input
// yields an empty string. Input that contains a forbidden code point, or that
// the canonicalizer rejects, yields an InvalidArgument error.
absl::StatusOr<std::string> HostnameEncodeCallback(std::string_view input) {
  if (input.empty()) {
    return std::string();
  }

  // Both failure paths below report the same message.
  const auto invalid_hostname = [&input] {
    return absl::InvalidArgumentError(
        base::StrCat({"Invalid hostname pattern '", input, "'."}));
  };

  // Because of crbug.com/1065667, url::CanonicalizeHost() permits (and may
  // encode) some illegal code points. That is meant to be fixed eventually, so
  // URLPattern should not encourage further use of those characters; reject
  // them up front with a stricter check.
  //
  // TODO(crbug.com/40124263): Remove this check after the URL parser is fixed.
  if (ContainsForbiddenHostnameCodePoint(input)) {
    return invalid_hostname();
  }

  const url::Component source_range(0,
                                    base::checked_cast<int>(input.size()));
  url::RawCanonOutputT<char> buffer;
  url::Component canon_host;
  if (!url::CanonicalizeHost(input.data(), source_range, &buffer,
                             &canon_host)) {
    return invalid_hostname();
  }

  return StdStringFromCanonOutput(buffer, canon_host);
}
// Encodes a port pattern fragment via url::CanonicalizePort(), passing
// url::PORT_UNSPECIFIED as the scheme's default port. Empty input yields an
// empty string; a canonicalization failure yields an InvalidArgument error.
absl::StatusOr<std::string> PortEncodeCallback(std::string_view input) {
  if (input.empty()) {
    return std::string();
  }

  const url::Component source_range(0,
                                    base::checked_cast<int>(input.size()));
  url::RawCanonOutputT<char> buffer;
  url::Component canon_port;
  if (url::CanonicalizePort(input.data(), source_range, url::PORT_UNSPECIFIED,
                            &buffer, &canon_port)) {
    return StdStringFromCanonOutput(buffer, canon_port);
  }

  return absl::InvalidArgumentError(
      base::StrCat({"Invalid port pattern '", input, "'."}));
}
// Encodes a pathname pattern fragment via url::CanonicalizePartialPath().
// Empty input yields an empty string; a canonicalization failure yields an
// InvalidArgument error.
absl::StatusOr<std::string> StandardURLPathnameEncodeCallback(
    std::string_view input) {
  if (input.empty()) {
    return std::string();
  }

  const url::Component source_range(0,
                                    base::checked_cast<int>(input.size()));
  url::RawCanonOutputT<char> buffer;
  url::Component canon_path;
  if (url::CanonicalizePartialPath(input.data(), source_range, &buffer,
                                   &canon_path)) {
    return StdStringFromCanonOutput(buffer, canon_path);
  }

  return absl::InvalidArgumentError(
      base::StrCat({"Invalid pathname pattern '", input, "'."}));
}
// Encodes a pathname pattern fragment via url::CanonicalizePathURLPath().
// Empty input yields an empty string. This callback has no error path: the
// canonicalized output is always returned.
absl::StatusOr<std::string> PathURLPathnameEncodeCallback(
    std::string_view input) {
  if (input.empty()) {
    return std::string();
  }

  const url::Component source_range(0,
                                    base::checked_cast<int>(input.size()));
  url::RawCanonOutputT<char> buffer;
  url::Component canon_path;
  url::CanonicalizePathURLPath(input.data(), source_range, &buffer,
                               &canon_path);
  return StdStringFromCanonOutput(buffer, canon_path);
}
// Encodes a search (query) pattern fragment via url::CanonicalizeQuery() with
// no charset converter. Empty input yields an empty string. This callback has
// no error path: the canonicalized output is always returned.
absl::StatusOr<std::string> SearchEncodeCallback(std::string_view input) {
  if (input.empty()) {
    return std::string();
  }

  const url::Component source_range(0,
                                    base::checked_cast<int>(input.size()));
  url::RawCanonOutputT<char> buffer;
  url::Component canon_query;
  url::CanonicalizeQuery(input.data(), source_range, /*converter=*/nullptr,
                         &buffer, &canon_query);
  return StdStringFromCanonOutput(buffer, canon_query);
}
// Encodes a hash (fragment/ref) pattern fragment via url::CanonicalizeRef().
// Empty input yields an empty string. This callback has no error path: the
// canonicalized output is always returned.
absl::StatusOr<std::string> HashEncodeCallback(std::string_view input) {
  if (input.empty()) {
    return std::string();
  }

  const url::Component source_range(0,
                                    base::checked_cast<int>(input.size()));
  url::RawCanonOutputT<char> buffer;
  url::Component canon_ref;
  url::CanonicalizeRef(input.data(), source_range, &buffer, &canon_ref);
  return StdStringFromCanonOutput(buffer, canon_ref);
}
// Decides whether a hostname pattern should be handled as an IPv6 hostname.
// This is a cheap heuristic, not a parse: it only inspects the first two
// characters, looking for an opening `[` that is either the first character
// or immediately follows a leading `\` (escape) or `{` (grouping).
bool TreatAsIPv6Hostname(std::string_view pattern_utf8) {
  // A lone `[` cannot be a valid IPv6 hostname; at least two characters are
  // required to represent something like `[*`.
  if (pattern_utf8.size() < 2) {
    return false;
  }

  const char first = pattern_utf8[0];
  const char second = pattern_utf8[1];
  switch (first) {
    case '[':
      return true;
    case '\\':
    case '{':
      // Look one character further to find a bracket hidden behind an escape
      // or at the start of a grouping.
      return second == '[';
    default:
      return false;
  }
}
} // namespace url_pattern