components/url_pattern/url_pattern_util.cc - chromium/src - Git at Google

 // Copyright 2024 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifdef UNSAFE_BUFFERS_BUILD
 // TODO(crbug.com/40285824): Remove this and convert code to safer constructs.
 #pragma allow_unsafe_buffers
 #endif

 #include "components/url_pattern/url_pattern_util.h"

 #include <string_view>

 #include "base/numerics/safe_conversions.h"
 #include "base/ranges/ranges.h"
 #include "base/strings/strcat.h"
 #include "base/strings/string_util.h"
 #include "url/url_util.h"

 namespace url_pattern {
 namespace {

 // Copies the bytes of `output` that `component` refers to (starting at its
 // `begin` offset and running for `len` characters) into a new std::string.
 std::string StdStringFromCanonOutput(const url::CanonOutput& output,
                                      const url::Component& component) {
   const auto* const first = output.data() + component.begin;
   return std::string(first, component.len);
 }

 // Returns true when `input` contains at least one of the forbidden host code
 // points enumerated in the predicate below.
 bool ContainsForbiddenHostnameCodePoint(std::string_view input) {
   // The complete set of forbidden code points is specified at:
   //
   //  https://url.spec.whatwg.org/#forbidden-host-code-point
   //
   // Only the code points the chromium URL parser incorrectly permits are
   // tested here.  See: crbug.com/1065667#c18
   const auto is_forbidden = [](char c) {
     switch (c) {
       case ' ':
       case '#':
       case ':':
       case '<':
       case '>':
       case '@':
       case '[':
       case ']':
       case '|':
         return true;
       default:
         return false;
     }
   };
   return base::ranges::any_of(input, is_forbidden);
 }

 }  // namespace

 // Encoding callback for the protocol component.  An empty `input` yields an
 // empty string.  Otherwise `input` is passed to url::CanonicalizeScheme() and
 // the part of the output buffer it reports is returned; if the call reports
 // failure an InvalidArgumentError is returned instead.
 absl::StatusOr<std::string> ProtocolEncodeCallback(std::string_view input) {
   if (input.empty()) {
     return std::string();
   }

   // The whole of `input` is treated as the scheme.
   const url::Component input_range(0,
                                    base::checked_cast<int>(input.size()));

   url::RawCanonOutputT<char> buffer;
   url::Component canonical_range;
   if (!url::CanonicalizeScheme(input.data(), input_range, &buffer,
                                &canonical_range)) {
     return absl::InvalidArgumentError(
         base::StrCat({"Invalid protocol '", input, "'."}));
   }

   return StdStringFromCanonOutput(buffer, canonical_range);
 }

 // Encoding callback for the username component.  An empty `input` yields an
 // empty string.  Otherwise `input` is passed to url::CanonicalizeUserInfo()
 // as the username (with an empty password) and the username part of the
 // output is returned; if the call reports failure an InvalidArgumentError is
 // returned instead.
 absl::StatusOr<std::string> UsernameEncodeCallback(std::string_view input) {
   if (input.empty()) {
     return std::string();
   }

   // Only the username half is supplied; the password half is an empty string.
   const url::Component username_range(0,
                                       base::checked_cast<int>(input.size()));
   const url::Component empty_password_range(0, 0);

   url::RawCanonOutputT<char> buffer;
   url::Component canonical_username;
   url::Component canonical_password;
   if (!url::CanonicalizeUserInfo(input.data(), username_range, "",
                                  empty_password_range, &buffer,
                                  &canonical_username, &canonical_password)) {
     return absl::InvalidArgumentError(
         base::StrCat({"Invalid username pattern '", input, "'."}));
   }

   return StdStringFromCanonOutput(buffer, canonical_username);
 }

 // Encoding callback for the password component.  An empty `input` yields an
 // empty string.  Otherwise `input` is passed to url::CanonicalizeUserInfo()
 // as the password (with an empty username) and the password part of the
 // output is returned; if the call reports failure an InvalidArgumentError is
 // returned instead.
 absl::StatusOr<std::string> PasswordEncodeCallback(std::string_view input) {
   if (input.empty()) {
     return std::string();
   }

   // Only the password half is supplied; the username half is an empty string.
   const url::Component empty_username_range(0, 0);
   const url::Component password_range(0,
                                       base::checked_cast<int>(input.size()));

   url::RawCanonOutputT<char> buffer;
   url::Component canonical_username;
   url::Component canonical_password;
   if (!url::CanonicalizeUserInfo("", empty_username_range, input.data(),
                                  password_range, &buffer, &canonical_username,
                                  &canonical_password)) {
     return absl::InvalidArgumentError(
         base::StrCat({"Invalid password pattern '", input, "'."}));
   }

   return StdStringFromCanonOutput(buffer, canonical_password);
 }

 // Encoding callback for hostname content that is being treated as IPv6.
 // Returns `input` with every character passed through base::ToLowerASCII(),
 // or an InvalidArgumentError naming the first character that is neither a hex
 // digit nor one of `[`, `]`, `:`.
 absl::StatusOr<std::string> IPv6HostnameEncodeCallback(std::string_view input) {
   // Only a light validation and canonicalization is performed.  The URL
   // parser's hostname canonicalizer would be the natural choice, but it is too
   // strict for an encoding callback: the callback may be handed only fragments
   // of the hostname pattern (e.g. for `[:address]` the `[` and `]` strings
   // arrive in separate calls), and the full parser insists on parsing a
   // complete IPv6 hostname, so it would always fail.  To allow pattern syntax
   // inside IPv6 brackets we therefore just check that each character is
   // permitted and lowercase any hex digits.
   std::string lowered;
   lowered.reserve(input.size());
   for (const char ch : input) {
     const bool permitted =
         base::IsHexDigit(ch) || ch == '[' || ch == ']' || ch == ':';
     if (!permitted) {
       return absl::InvalidArgumentError(
           base::StrCat({"Invalid IPv6 hostname character '",
                         std::string_view(&ch, 1), "' in '", input, "'."}));
     }
     lowered.push_back(base::ToLowerASCII(ch));
   }
   return lowered;
 }

 // Encoding callback for the hostname component.  An empty `input` yields an
 // empty string.  Input containing a forbidden hostname code point is rejected
 // up front; everything else is passed to url::CanonicalizeHost() and the part
 // of the output buffer it reports is returned.  Both rejection paths produce
 // the same InvalidArgumentError.
 absl::StatusOr<std::string> HostnameEncodeCallback(std::string_view input) {
   if (input.empty()) {
     return std::string();
   }

   // Shared by the two failure paths below.
   const auto invalid_hostname = [input] {
     return absl::InvalidArgumentError(
         base::StrCat({"Invalid hostname pattern '", input, "'."}));
   };

   // Because of crbug.com/1065667, url::CanonicalizeHost() below permits (and
   // may encode) some illegal code points.  That is meant to be fixed
   // eventually, so URLPattern should not encourage further use of those
   // characters; an extra, more restrictive check is applied first.
   //
   // TODO(crbug.com/40124263): Remove this check after the URL parser is fixed.
   if (ContainsForbiddenHostnameCodePoint(input)) {
     return invalid_hostname();
   }

   const url::Component input_range(0,
                                    base::checked_cast<int>(input.size()));

   url::RawCanonOutputT<char> buffer;
   url::Component canonical_range;
   if (!url::CanonicalizeHost(input.data(), input_range, &buffer,
                              &canonical_range)) {
     return invalid_hostname();
   }

   return StdStringFromCanonOutput(buffer, canonical_range);
 }

 // Encoding callback for the port component.  An empty `input` yields an empty
 // string.  Otherwise `input` is passed to url::CanonicalizePort() with
 // url::PORT_UNSPECIFIED as the scheme's default port, and the part of the
 // output buffer it reports is returned; if the call reports failure an
 // InvalidArgumentError is returned instead.
 absl::StatusOr<std::string> PortEncodeCallback(std::string_view input) {
   if (input.empty()) {
     return std::string();
   }

   const url::Component input_range(0,
                                    base::checked_cast<int>(input.size()));

   url::RawCanonOutputT<char> buffer;
   url::Component canonical_range;
   if (!url::CanonicalizePort(input.data(), input_range, url::PORT_UNSPECIFIED,
                              &buffer, &canonical_range)) {
     return absl::InvalidArgumentError(
         base::StrCat({"Invalid port pattern '", input, "'."}));
   }

   return StdStringFromCanonOutput(buffer, canonical_range);
 }

 // Encoding callback for the pathname component of standard URLs.  An empty
 // `input` yields an empty string.  Otherwise `input` is passed to
 // url::CanonicalizePartialPath() and the part of the output buffer it reports
 // is returned; if the call reports failure an InvalidArgumentError is returned
 // instead.
 absl::StatusOr<std::string> StandardURLPathnameEncodeCallback(
     std::string_view input) {
   if (input.empty()) {
     return std::string();
   }

   const url::Component input_range(0,
                                    base::checked_cast<int>(input.size()));

   url::RawCanonOutputT<char> buffer;
   url::Component canonical_range;
   if (!url::CanonicalizePartialPath(input.data(), input_range, &buffer,
                                     &canonical_range)) {
     return absl::InvalidArgumentError(
         base::StrCat({"Invalid pathname pattern '", input, "'."}));
   }

   return StdStringFromCanonOutput(buffer, canonical_range);
 }

 // Encoding callback for the pathname component of path URLs.  An empty `input`
 // yields an empty string.  Otherwise `input` is passed to
 // url::CanonicalizePathURLPath() and the part of the output buffer it reports
 // is returned.  This callback has no error path of its own.
 absl::StatusOr<std::string> PathURLPathnameEncodeCallback(
     std::string_view input) {
   if (input.empty()) {
     return std::string();
   }

   const url::Component input_range(0,
                                    base::checked_cast<int>(input.size()));

   url::RawCanonOutputT<char> buffer;
   url::Component canonical_range;
   url::CanonicalizePathURLPath(input.data(), input_range, &buffer,
                                &canonical_range);

   return StdStringFromCanonOutput(buffer, canonical_range);
 }

 // Encoding callback for the search (query) component.  An empty `input`
 // yields an empty string.  Otherwise `input` is passed to
 // url::CanonicalizeQuery() with no charset converter and the part of the
 // output buffer it reports is returned.  This callback has no error path of
 // its own.
 absl::StatusOr<std::string> SearchEncodeCallback(std::string_view input) {
   if (input.empty()) {
     return std::string();
   }

   const url::Component input_range(0,
                                    base::checked_cast<int>(input.size()));

   url::RawCanonOutputT<char> buffer;
   url::Component canonical_range;
   url::CanonicalizeQuery(input.data(), input_range, /*converter=*/nullptr,
                          &buffer, &canonical_range);

   return StdStringFromCanonOutput(buffer, canonical_range);
 }

 // Encoding callback for the hash (fragment) component.  An empty `input`
 // yields an empty string.  Otherwise `input` is passed to
 // url::CanonicalizeRef() and the part of the output buffer it reports is
 // returned.  This callback has no error path of its own.
 absl::StatusOr<std::string> HashEncodeCallback(std::string_view input) {
   if (input.empty()) {
     return std::string();
   }

   const url::Component input_range(0,
                                    base::checked_cast<int>(input.size()));

   url::RawCanonOutputT<char> buffer;
   url::Component canonical_range;
   url::CanonicalizeRef(input.data(), input_range, &buffer, &canonical_range);

   return StdStringFromCanonOutput(buffer, canonical_range);
 }

 // Utility method to determine if a particular hostname pattern should be
 // treated as an IPv6 hostname.  This implements a simple and fast heuristic
 // looking for a leading `[`.  It is intended to catch the most common cases
 // with minimum overhead.
 bool TreatAsIPv6Hostname(std::string_view pattern_utf8) {
   // The `[` string cannot be a valid IPv6 hostname.  We need at least two
   // characters to represent `[*`.
   if (pattern_utf8.size() < 2) {
     return false;
   }

   if (pattern_utf8[0] == '[') {
     return true;
   }

   // We do a bit of extra work to detect brackets behind an escape and
   // within a grouping.
   if ((pattern_utf8[0] == '\\' || pattern_utf8[0] == '{') &&
       pattern_utf8[1] == '[') {
     return true;
   }

   return false;
 }

 }  // namespace url_pattern
	// Copyright 2024 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#ifdef UNSAFE_BUFFERS_BUILD
	// TODO(crbug.com/40285824): Remove this and convert code to safer constructs.
	#pragma allow_unsafe_buffers
	#endif

	#include "components/url_pattern/url_pattern_util.h"

	#include <string_view>

	#include "base/numerics/safe_conversions.h"
	#include "base/ranges/ranges.h"
	#include "base/strings/strcat.h"
	#include "base/strings/string_util.h"
	#include "url/url_util.h"

	namespace url_pattern {
	namespace {

	// Copies the bytes of `output` that `component` refers to (starting at its
	// `begin` offset and running for `len` characters) into a new std::string.
	std::string StdStringFromCanonOutput(const url::CanonOutput& output,
	                                     const url::Component& component) {
	  const auto* const first = output.data() + component.begin;
	  return std::string(first, component.len);
	}

	// Returns true when `input` contains at least one of the forbidden host code
	// points enumerated in the predicate below.
	bool ContainsForbiddenHostnameCodePoint(std::string_view input) {
	  // The complete set of forbidden code points is specified at:
	  //
	  //  https://url.spec.whatwg.org/#forbidden-host-code-point
	  //
	  // Only the code points the chromium URL parser incorrectly permits are
	  // tested here.  See: crbug.com/1065667#c18
	  const auto is_forbidden = [](char c) {
	    switch (c) {
	      case ' ':
	      case '#':
	      case ':':
	      case '<':
	      case '>':
	      case '@':
	      case '[':
	      case ']':
	      case '|':
	        return true;
	      default:
	        return false;
	    }
	  };
	  return base::ranges::any_of(input, is_forbidden);
	}

	} // namespace

	// Encoding callback for the protocol component.  An empty `input` yields an
	// empty string.  Otherwise `input` is passed to url::CanonicalizeScheme() and
	// the part of the output buffer it reports is returned; if the call reports
	// failure an InvalidArgumentError is returned instead.
	absl::StatusOr<std::string> ProtocolEncodeCallback(std::string_view input) {
	  if (input.empty()) {
	    return std::string();
	  }

	  // The whole of `input` is treated as the scheme.
	  const url::Component input_range(0,
	                                   base::checked_cast<int>(input.size()));

	  url::RawCanonOutputT<char> buffer;
	  url::Component canonical_range;
	  if (!url::CanonicalizeScheme(input.data(), input_range, &buffer,
	                               &canonical_range)) {
	    return absl::InvalidArgumentError(
	        base::StrCat({"Invalid protocol '", input, "'."}));
	  }

	  return StdStringFromCanonOutput(buffer, canonical_range);
	}

	// Encoding callback for the username component.  An empty `input` yields an
	// empty string.  Otherwise `input` is passed to url::CanonicalizeUserInfo()
	// as the username (with an empty password) and the username part of the
	// output is returned; if the call reports failure an InvalidArgumentError is
	// returned instead.
	absl::StatusOr<std::string> UsernameEncodeCallback(std::string_view input) {
	  if (input.empty()) {
	    return std::string();
	  }

	  // Only the username half is supplied; the password half is an empty string.
	  const url::Component username_range(0,
	                                      base::checked_cast<int>(input.size()));
	  const url::Component empty_password_range(0, 0);

	  url::RawCanonOutputT<char> buffer;
	  url::Component canonical_username;
	  url::Component canonical_password;
	  if (!url::CanonicalizeUserInfo(input.data(), username_range, "",
	                                 empty_password_range, &buffer,
	                                 &canonical_username, &canonical_password)) {
	    return absl::InvalidArgumentError(
	        base::StrCat({"Invalid username pattern '", input, "'."}));
	  }

	  return StdStringFromCanonOutput(buffer, canonical_username);
	}

	// Encoding callback for the password component.  An empty `input` yields an
	// empty string.  Otherwise `input` is passed to url::CanonicalizeUserInfo()
	// as the password (with an empty username) and the password part of the
	// output is returned; if the call reports failure an InvalidArgumentError is
	// returned instead.
	absl::StatusOr<std::string> PasswordEncodeCallback(std::string_view input) {
	  if (input.empty()) {
	    return std::string();
	  }

	  // Only the password half is supplied; the username half is an empty string.
	  const url::Component empty_username_range(0, 0);
	  const url::Component password_range(0,
	                                      base::checked_cast<int>(input.size()));

	  url::RawCanonOutputT<char> buffer;
	  url::Component canonical_username;
	  url::Component canonical_password;
	  if (!url::CanonicalizeUserInfo("", empty_username_range, input.data(),
	                                 password_range, &buffer, &canonical_username,
	                                 &canonical_password)) {
	    return absl::InvalidArgumentError(
	        base::StrCat({"Invalid password pattern '", input, "'."}));
	  }

	  return StdStringFromCanonOutput(buffer, canonical_password);
	}

	// Encoding callback for hostname content that is being treated as IPv6.
	// Returns `input` with every character passed through base::ToLowerASCII(),
	// or an InvalidArgumentError naming the first character that is neither a hex
	// digit nor one of `[`, `]`, `:`.
	absl::StatusOr<std::string> IPv6HostnameEncodeCallback(std::string_view input) {
	  // Only a light validation and canonicalization is performed.  The URL
	  // parser's hostname canonicalizer would be the natural choice, but it is too
	  // strict for an encoding callback: the callback may be handed only fragments
	  // of the hostname pattern (e.g. for `[:address]` the `[` and `]` strings
	  // arrive in separate calls), and the full parser insists on parsing a
	  // complete IPv6 hostname, so it would always fail.  To allow pattern syntax
	  // inside IPv6 brackets we therefore just check that each character is
	  // permitted and lowercase any hex digits.
	  std::string lowered;
	  lowered.reserve(input.size());
	  for (const char ch : input) {
	    const bool permitted =
	        base::IsHexDigit(ch) || ch == '[' || ch == ']' || ch == ':';
	    if (!permitted) {
	      return absl::InvalidArgumentError(
	          base::StrCat({"Invalid IPv6 hostname character '",
	                        std::string_view(&ch, 1), "' in '", input, "'."}));
	    }
	    lowered.push_back(base::ToLowerASCII(ch));
	  }
	  return lowered;
	}

	// Encoding callback for the hostname component.  An empty `input` yields an
	// empty string.  Input containing a forbidden hostname code point is rejected
	// up front; everything else is passed to url::CanonicalizeHost() and the part
	// of the output buffer it reports is returned.  Both rejection paths produce
	// the same InvalidArgumentError.
	absl::StatusOr<std::string> HostnameEncodeCallback(std::string_view input) {
	  if (input.empty()) {
	    return std::string();
	  }

	  // Shared by the two failure paths below.
	  const auto invalid_hostname = [input] {
	    return absl::InvalidArgumentError(
	        base::StrCat({"Invalid hostname pattern '", input, "'."}));
	  };

	  // Because of crbug.com/1065667, url::CanonicalizeHost() below permits (and
	  // may encode) some illegal code points.  That is meant to be fixed
	  // eventually, so URLPattern should not encourage further use of those
	  // characters; an extra, more restrictive check is applied first.
	  //
	  // TODO(crbug.com/40124263): Remove this check after the URL parser is fixed.
	  if (ContainsForbiddenHostnameCodePoint(input)) {
	    return invalid_hostname();
	  }

	  const url::Component input_range(0,
	                                   base::checked_cast<int>(input.size()));

	  url::RawCanonOutputT<char> buffer;
	  url::Component canonical_range;
	  if (!url::CanonicalizeHost(input.data(), input_range, &buffer,
	                             &canonical_range)) {
	    return invalid_hostname();
	  }

	  return StdStringFromCanonOutput(buffer, canonical_range);
	}

	// Encoding callback for the port component.  An empty `input` yields an empty
	// string.  Otherwise `input` is passed to url::CanonicalizePort() with
	// url::PORT_UNSPECIFIED as the scheme's default port, and the part of the
	// output buffer it reports is returned; if the call reports failure an
	// InvalidArgumentError is returned instead.
	absl::StatusOr<std::string> PortEncodeCallback(std::string_view input) {
	  if (input.empty()) {
	    return std::string();
	  }

	  const url::Component input_range(0,
	                                   base::checked_cast<int>(input.size()));

	  url::RawCanonOutputT<char> buffer;
	  url::Component canonical_range;
	  if (!url::CanonicalizePort(input.data(), input_range, url::PORT_UNSPECIFIED,
	                             &buffer, &canonical_range)) {
	    return absl::InvalidArgumentError(
	        base::StrCat({"Invalid port pattern '", input, "'."}));
	  }

	  return StdStringFromCanonOutput(buffer, canonical_range);
	}

	// Encoding callback for the pathname component of standard URLs.  An empty
	// `input` yields an empty string.  Otherwise `input` is passed to
	// url::CanonicalizePartialPath() and the part of the output buffer it reports
	// is returned; if the call reports failure an InvalidArgumentError is returned
	// instead.
	absl::StatusOr<std::string> StandardURLPathnameEncodeCallback(
	    std::string_view input) {
	  if (input.empty()) {
	    return std::string();
	  }

	  const url::Component input_range(0,
	                                   base::checked_cast<int>(input.size()));

	  url::RawCanonOutputT<char> buffer;
	  url::Component canonical_range;
	  if (!url::CanonicalizePartialPath(input.data(), input_range, &buffer,
	                                    &canonical_range)) {
	    return absl::InvalidArgumentError(
	        base::StrCat({"Invalid pathname pattern '", input, "'."}));
	  }

	  return StdStringFromCanonOutput(buffer, canonical_range);
	}

	// Encoding callback for the pathname component of path URLs.  An empty `input`
	// yields an empty string.  Otherwise `input` is passed to
	// url::CanonicalizePathURLPath() and the part of the output buffer it reports
	// is returned.  This callback has no error path of its own.
	absl::StatusOr<std::string> PathURLPathnameEncodeCallback(
	    std::string_view input) {
	  if (input.empty()) {
	    return std::string();
	  }

	  const url::Component input_range(0,
	                                   base::checked_cast<int>(input.size()));

	  url::RawCanonOutputT<char> buffer;
	  url::Component canonical_range;
	  url::CanonicalizePathURLPath(input.data(), input_range, &buffer,
	                               &canonical_range);

	  return StdStringFromCanonOutput(buffer, canonical_range);
	}

	// Encoding callback for the search (query) component.  An empty `input`
	// yields an empty string.  Otherwise `input` is passed to
	// url::CanonicalizeQuery() with no charset converter and the part of the
	// output buffer it reports is returned.  This callback has no error path of
	// its own.
	absl::StatusOr<std::string> SearchEncodeCallback(std::string_view input) {
	  if (input.empty()) {
	    return std::string();
	  }

	  url::RawCanonOutputT<char> canon_output;
	  url::Component component;

	  // The third argument is the charset converter; none is supplied.
	  url::CanonicalizeQuery(
	      input.data(), url::Component(0, base::checked_cast<int>(input.size())),
	      /*converter=*/nullptr, &canon_output, &component);

	  return StdStringFromCanonOutput(canon_output, component);
	}

	// Encoding callback for the hash (fragment) component.  An empty `input`
	// yields an empty string.  Otherwise `input` is passed to
	// url::CanonicalizeRef() and the part of the output buffer it reports is
	// returned.  This callback has no error path of its own.
	absl::StatusOr<std::string> HashEncodeCallback(std::string_view input) {
	  if (input.empty()) {
	    return std::string();
	  }

	  const url::Component input_range(0,
	                                   base::checked_cast<int>(input.size()));

	  url::RawCanonOutputT<char> buffer;
	  url::Component canonical_range;
	  url::CanonicalizeRef(input.data(), input_range, &buffer, &canonical_range);

	  return StdStringFromCanonOutput(buffer, canonical_range);
	}

	// Utility method to determine if a particular hostname pattern should be
	// treated as an IPv6 hostname.  This implements a simple and fast heuristic
	// looking for a leading `[`.  It is intended to catch the most common cases
	// with minimum overhead.
	//
	// Returns true when `pattern_utf8` has at least two characters and either
	// starts with `[` or starts with `\` or `{` immediately followed by `[`.
	bool TreatAsIPv6Hostname(std::string_view pattern_utf8) {
	  // The `[` string cannot be a valid IPv6 hostname.  We need at least two
	  // characters to represent `[*`.
	  if (pattern_utf8.size() < 2) {
	    return false;
	  }

	  if (pattern_utf8[0] == '[') {
	    return true;
	  }

	  // We do a bit of extra work to detect brackets behind an escape and
	  // within a grouping.
	  if ((pattern_utf8[0] == '\\' || pattern_utf8[0] == '{') &&
	      pattern_utf8[1] == '[') {
	    return true;
	  }

	  return false;
	}

	} // namespace url_pattern