blob: 1521417a3904b0b11d9c303280ce2d3956fd1d34 [file] [log] [blame]
// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "net/base/url_unescape_iterator.h"
#include <iterator>
#include <limits>
#include <ranges>
#include <string_view>
#include <utility>
#include "base/containers/span.h"
#include "base/containers/to_vector.h"
#include "base/strings/escape.h"
#include "base/strings/strcat.h"
#include "base/strings/stringprintf.h"
#include "net/base/url_util.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "third_party/fuzztest/src/fuzztest/fuzztest.h"
namespace net {
namespace {
// Compile-time check that UrlUnescapeIterator models the
// std::forward_iterator concept.
static_assert(std::forward_iterator<UrlUnescapeIterator>);
// A single test case. Does not own the referenced strings, so whatever they
// point at must outlive the Case. Kept as a plain aggregate: the tests
// brace-initialize it and decompose it with structured bindings, both of which
// depend on the member order below.
struct Case {
// The string fed to the code under test.
std::string_view input;
// The bytes the unescaped result is compared against.
std::string_view expected_output;
// Label streamed into the assertion message to identify the case.
std::string_view description;
};
// A test case whose strings are produced at runtime and are therefore stored
// by value. Can be explicitly converted to a (non-owning) Case.
struct OwningCase {
  // Copies each of the three views into the matching owned string.
  OwningCase(std::string_view in,
             std::string_view expected,
             std::string_view desc)
      : input(in), expected_output(expected), description(desc) {}

  // Returns a Case whose views refer to this object's strings. The result is
  // only usable while this object is alive and unmodified.
  explicit operator Case() const {
    return Case{input, expected_output, description};
  }

  std::string input;
  std::string expected_output;
  std::string description;
};
// Helper shared by most tests: runs the unescape range over `in` and collects
// everything it yields into a std::string.
std::string UnescapeToString(std::string_view in) {
  auto unescaped_range = MakeUrlUnescapeRange(in);
  static_assert(std::ranges::forward_range<decltype(unescaped_range)>);
  const auto first = std::ranges::begin(unescaped_range);
  const auto last = std::ranges::end(unescaped_range);
  return std::string(first, last);
}
// Runs every case in `cases`: unescapes the input and expects the result to
// equal the expected output, tagging any failure with the case description.
void TestCases(base::span<const Case> cases) {
  for (const Case& test_case : cases) {
    EXPECT_EQ(UnescapeToString(test_case.input), test_case.expected_output)
        << test_case.description;
  }
}
// Overload for OwningCase: builds a non-owning Case view of each element and
// forwards to the base::span<const Case> overload. `cases` outlives the views,
// which are only used within this call.
void TestCases(base::span<const OwningCase> cases) {
  std::vector<Case> views;
  views.reserve(cases.size());
  for (const OwningCase& owned : cases) {
    views.push_back(static_cast<Case>(owned));
  }
  TestCases(views);
}
// Runs `cases` with each input first passed through base::EscapeNonASCII(), so
// that non-ASCII bytes reach the code under test as %xx escapes. The expected
// output and description of each case are left untouched.
void EncodeThenTestCases(base::span<const Case> cases) {
  std::vector<OwningCase> encoded;
  encoded.reserve(cases.size());
  for (const Case& original : cases) {
    encoded.emplace_back(base::EscapeNonASCII(original.input),
                         original.expected_output, original.description);
  }
  TestCases(encoded);
}
// Two default-constructed iterators compare equal, both in a constant
// expression and at runtime.
TEST(UrlUnescapeIteratorTest, DefaultConstructor) {
  constexpr UrlUnescapeIterator first;
  constexpr UrlUnescapeIterator second;
  static_assert(first == second);
  EXPECT_EQ(first, second);
}
// Exercises copy construction (from const and non-const sources), copy
// assignment, and the equality operators.
TEST(UrlUnescapeIteratorTest, CopyAndAssignAndEquality) {
  auto [source, target] = MakeUrlUnescapeRange("walk");
  // The two iterators of a non-empty range start out different.
  EXPECT_NE(source, target);

  // Assigning from a non-const iterator.
  target = source;
  EXPECT_EQ(source, target);

  // Copy-constructing from a non-const iterator.
  const UrlUnescapeIterator copy_of_mutable = source;
  EXPECT_EQ(source, copy_of_mutable);

  // Copy-constructing from a const iterator.
  const UrlUnescapeIterator copy_of_const = copy_of_mutable;
  EXPECT_EQ(copy_of_mutable, copy_of_const);

  // Assigning from a const iterator.
  target = copy_of_const;
  EXPECT_EQ(target, copy_of_const);
}
// Post-increment returns the pre-increment position and advances the iterator;
// for a one-character input that advance reaches the end.
TEST(UrlUnescapeIteratorTest, PostIncrement) {
  auto [cursor, sentinel] = MakeUrlUnescapeRange("a");
  const UrlUnescapeIterator before = cursor;
  EXPECT_EQ(before, cursor++);
  EXPECT_NE(before, cursor);
  EXPECT_EQ(cursor, sentinel);
}
// Checks unescaping of pure-ASCII inputs: literal characters pass through,
// '+' becomes a space, and %xx escapes (either hex case) become the byte they
// name, including NUL and other control characters.
TEST(UrlUnescapeIteratorTest, GoodAscii) {
  // Constructed with an explicit length so the embedded NUL byte is kept.
  static constexpr std::string_view kNul("\0", 1u);
  static constexpr Case cases[] = {
      {"", "", "empty"},
      {"a", "a", "one letter"},
      {"word", "word", "multiple letters"},
      {"two words", "two words", "space"},
      {"two+words", "two words", "plus"},
      {"two%20words", "two words", "escaped space"},
      {"%2b", "+", "escaped plus"},
      {"%2B", "+", "escaped plus, uppercase hex"},
      // Each '+' and each "%20" decodes to its own space, so the expected
      // outputs below contain exactly two and three spaces respectively.
      {"++", "  ", "double plus"},
      {"+%20+", "   ", "plus, escaped space, plus"},
      {"%61b", "ab", "escaped start"},
      {"a%62", "ab", "escaped end"},
      {"%00", kNul, "escaped nul byte"},
      {"line%0a", "line\x0a", "escaped newline"},
      // DEL is 0x7F (0x7D would be '}').
      {"l%7F", "l\x7f", "escaped del control code"},
  };
  TestCases(cases);
}
// A '%' that is not followed by two hex digits is passed through literally,
// while the characters after it are still processed on their own ('+' still
// becomes a space and a later valid escape is still decoded).
TEST(UrlUnescapeIteratorTest, BadPercentEncoding) {
  static constexpr Case kCases[] = {
      // Too few characters after the '%'.
      {"%", "%", "percent at end of string"},
      {"%2", "%2", "not followed by two characters"},
      // One of the two following characters is not a hex digit.
      {"%g1", "%g1", "first character not hex"},
      {"%1 ", "%1 ", "second character not hex"},
      {"%+20", "% 20", "first character is plus"},
      {"% 20", "% 20", "first character is space"},
      {"%1%20", "%1 ", "second character is percent"},
      // Output of one unescape step is not unescaped again.
      {"%%34%31", "%41", "no double expansion"},
  };
  TestCases(kCases);
}
// Well-formed UTF-8 sequences of each multi-byte length. Every entry expects
// the output to be identical to the input. Shared by several tests below,
// which also derive encoded, truncated and corrupted variants from it.
static constexpr Case kGoodUtf8[] = {
// U+00A5.
{"\xc2\xa5", "\xc2\xa5", "two bytes"},
// U+FFE5.
{"\xef\xbf\xa5", "\xef\xbf\xa5", "three bytes"},
// U+1F191.
{"\xf0\x9f\x86\x91", "\xf0\x9f\x86\x91", "four bytes"},
// U+FDD0, which is well-formed UTF-8 even though it is a non-character.
{"\xef\xb7\x90", "\xef\xb7\x90", "non-character"},
};
// Valid UTF-8 supplied as raw (unescaped) bytes comes out unchanged.
TEST(UrlUnescapeIteratorTest, GoodUtf8) {
TestCases(kGoodUtf8);
}
// Same inputs as GoodUtf8, but passed through base::EscapeNonASCII() first;
// the expected output is still the original UTF-8.
TEST(UrlUnescapeIteratorTest, GoodUtf8Encoded) {
EncodeThenTestCases(kGoodUtf8);
}
// Verifies that mixing encoded and unencoded bytes in a single character
// works. For every entry of kGoodUtf8 and every byte position in its input, a
// variant is generated in which only that one byte is %-escaped; the expected
// output is unchanged.
TEST(UrlUnescapeIteratorTest, GoodUtf8MixedEncoded) {
  std::vector<OwningCase> encoded;
  // Not the correct size, just an estimate to reduce resizes.
  encoded.reserve(std::size(kGoodUtf8) * 2);
  for (const auto [input, expected, description] : kGoodUtf8) {
    // `size_t` matches both `input.size()` and the `%zu` conversion below.
    for (size_t byte_to_encode = 0; byte_to_encode < input.size();
         ++byte_to_encode) {
      const std::string encoded_byte =
          base::EscapeNonASCII(input.substr(byte_to_encode, 1));
      const std::string encoded_input =
          base::StrCat({input.substr(0, byte_to_encode), encoded_byte,
                        input.substr(byte_to_encode + 1)});
      encoded.emplace_back(encoded_input, expected,
                           base::StringPrintf("%s, encoded byte %zu",
                                              description, byte_to_encode));
    }
  }
  TestCases(encoded);
}
// The bytes EF BF BD, i.e. the UTF-8 encoding of U+FFFD. A macro rather than a
// constant so that it can be pasted next to other string literals and
// concatenated by the compiler; it is #undef'd at the end of the file.
#define REPLACEMENT_CHAR "\xef\xbf\xbd"
// One through six consecutive replacement characters, used as expected output
// in the malformed-UTF-8 tables and tests below.
constexpr char kReplacementChar[] = REPLACEMENT_CHAR;
constexpr char kReplacementCharx2[] = REPLACEMENT_CHAR REPLACEMENT_CHAR;
constexpr char kReplacementCharx3[] =
REPLACEMENT_CHAR REPLACEMENT_CHAR REPLACEMENT_CHAR;
constexpr char kReplacementCharx4[] =
REPLACEMENT_CHAR REPLACEMENT_CHAR REPLACEMENT_CHAR REPLACEMENT_CHAR;
constexpr char kReplacementCharx5[] = REPLACEMENT_CHAR REPLACEMENT_CHAR
REPLACEMENT_CHAR REPLACEMENT_CHAR REPLACEMENT_CHAR;
constexpr char kReplacementCharx6[] = REPLACEMENT_CHAR REPLACEMENT_CHAR
REPLACEMENT_CHAR REPLACEMENT_CHAR REPLACEMENT_CHAR REPLACEMENT_CHAR;
std::string ReplacementCharNTimes(size_t n) {
const std::vector<std::string_view> to_concat(n, kReplacementChar);
return base::StrCat(to_concat);
}
// Cuts each kGoodUtf8 input short at every interior position (keeping at least
// the start byte and dropping at least the last byte) and expects a single
// replacement character, both for the raw bytes and for their %-escaped form.
TEST(UrlUnescapeIteratorTest, TruncatedUtf8) {
  std::vector<OwningCase> truncated;
  // An estimate only, to reduce reallocations.
  truncated.reserve(std::size(kGoodUtf8) * 4);
  for (const auto [input, expected, description] : kGoodUtf8) {
    // `size_t` matches both `input.size()` and the `%zu` conversions below.
    for (size_t truncate_pos = 1; truncate_pos < input.size();
         ++truncate_pos) {
      const std::string truncated_input(input.substr(0, truncate_pos));
      // We expect one replacement character per UTF-8 start byte, regardless
      // of length.
      truncated.emplace_back(truncated_input, kReplacementChar,
                             base::StringPrintf("%s, truncated to length %zu",
                                                description, truncate_pos));
      truncated.emplace_back(
          base::EscapeNonASCII(truncated_input), kReplacementChar,
          base::StringPrintf("%s, truncated to length %zu, encoded",
                             description, truncate_pos));
    }
  }
  TestCases(truncated);
}
// Replaces each byte of each kGoodUtf8 input in turn with an ASCII 'X', wraps
// the result in '-' markers, and checks the output for both the raw bytes and
// their %-escaped form.
TEST(UrlUnescapeIteratorTest, CorruptedUtf8) {
  std::vector<OwningCase> corrupted;
  // An estimate only, to reduce reallocations.
  corrupted.reserve(std::size(kGoodUtf8) * 4);
  for (const auto [input, expected, description] : kGoodUtf8) {
    // `size_t` matches `input.size()`, the unsigned arithmetic in the
    // expected-output computation, and the `%zu` conversions below.
    for (size_t corrupt_byte = 0; corrupt_byte < input.size();
         ++corrupt_byte) {
      const std::string corrupted_input =
          base::StrCat({"-", input.substr(0, corrupt_byte), "X",
                        input.substr(corrupt_byte + 1), "-"});
      // A valid initial sequence will be replaced with a single replacement
      // character. Unexpected continuation bytes will be replaced with one
      // replacement character each.
      const std::string expected_output = base::StrCat(
          {"-", corrupt_byte > 0 ? kReplacementChar : "", "X",
           ReplacementCharNTimes(input.size() - corrupt_byte - 1), "-"});
      corrupted.emplace_back(corrupted_input, expected_output,
                             base::StringPrintf("%s, with byte %zu corrupted",
                                                description, corrupt_byte));
      corrupted.emplace_back(
          base::EscapeNonASCII(corrupted_input), expected_output,
          base::StringPrintf("%s, with byte %zu corrupted, encoded",
                             description, corrupt_byte));
    }
  }
  TestCases(corrupted);
}
// Malformed UTF-8 inputs paired with the output expected after unescaping. The
// description of each entry states why its input is invalid. Apart from the
// "valid followed by invalid followed by valid" entry, every expected output
// consists solely of replacement characters.
constexpr Case kBadUtf8[] = {
{"\xC0\x80", kReplacementCharx2,
"Overlong encoding of U+0000 (null). 0xC0 is never a valid start."},
{"\xC1\xBF", kReplacementCharx2,
"Overlong encoding of U+007F. 0xC1 is never a valid start."},
{"\xE0\x80\x80", kReplacementCharx3,
"Overlong encoding of U+0000 (null) as 3 bytes."},
{"\xE0\x9F\xBF", kReplacementCharx3,
"Overlong encoding of U+07FF as 3 bytes (should be 2)."},
{"\xF0\x80\x80\x80", kReplacementCharx4,
"Overlong encoding of U+0000 (null) as 4 bytes."},
{"\xF0\x8F\xBF\xBF", kReplacementCharx4,
"Overlong encoding of U+FFFF as 4 bytes (should be 3)."},
{"\xED\xA0\x80", kReplacementCharx3,
"Invalid surrogate half U+D800 (start of surrogate range)"},
{"\xED\xBF\xBF", kReplacementCharx3,
"Invalid surrogate half U+DFFF (end of surrogate range)"},
{"\xED\xA0\x81\xED\xB0\x80", kReplacementCharx6,
"Incorrectly encoded surrogate pair"},
{"\xF4\x90\x80\x80", kReplacementCharx4,
"Invalid code point U+110000 (beyond Unicode max U+10FFFF)"},
{"\xF5\x80\x80\x80", kReplacementCharx4,
"Invalid start byte 0xF5 (would encode > U+10FFFF)"},
{"\xF8\x80\x80\x80\x80", kReplacementCharx5,
"Invalid start byte 0xF8 (formerly 5-byte sequence)"},
{"\xFC\x80\x80\x80\x80\x80", kReplacementCharx6,
"Invalid start byte 0xFC (formerly 6-byte sequence)"},
{"\xFE", kReplacementChar, "Invalid byte 0xFE (never used)"},
{"\xFF", kReplacementChar, "Invalid byte 0xFF (never used)"},
{"\xc2\xa5\xc1\xc2\xa5", "\xc2\xa5" REPLACEMENT_CHAR "\xc2\xa5",
"Valid followed by invalid followed by valid"},
{"\xE2\xE2", kReplacementCharx2, "Overshort with error"},
};
// Runs the kBadUtf8 table with the malformed bytes supplied raw.
TEST(UrlUnescapeIteratorTest, OtherBadUtf8) {
TestCases(kBadUtf8);
}
// Runs the kBadUtf8 table with the malformed bytes passed through
// base::EscapeNonASCII() first; the expected output is the same.
TEST(UrlUnescapeIteratorTest, OtherBadUtf8Encoded) {
EncodeThenTestCases(kBadUtf8);
}
// Property check: for any `input`, collecting the iterator's output yields the
// same value as UnescapePercentEncodedUrl(). Called by the exhaustive
// single-byte tests and registered as a fuzz test.
void SameOutputAsUnescapePercentEncodedUrl(std::string_view input) {
  const auto from_iterator = UnescapeToString(input);
  const auto from_reference = UnescapePercentEncodedUrl(input);
  EXPECT_EQ(from_iterator, from_reference);
}
// Checks every possible one-byte input against UnescapePercentEncodedUrl().
TEST(UrlUnescapeIteratorTest, OneByteSameAsUnescapePercentEncodedUrl) {
  constexpr int kLowest = std::numeric_limits<char>::min();
  constexpr int kHighest = std::numeric_limits<char>::max();
  // The counter is an int, not a char: a char counter would overflow instead
  // of ever exceeding `kHighest`.
  for (int value = kLowest; value <= kHighest; ++value) {
    const char byte = static_cast<char>(value);
    SameOutputAsUnescapePercentEncodedUrl(std::string_view(&byte, 1u));
  }
}
// As above, but each byte value is presented as a three-character "%xx"
// escape (lowercase hex) instead of as a raw byte.
TEST(UrlUnescapeIteratorTest, OneByteSameAsUnescapePercentEncodedUrlEncoded) {
  constexpr int kByteValues = 0x100;
  for (int byte_value = 0; byte_value < kByteValues; ++byte_value) {
    const std::string escaped = base::StringPrintf("%%%02x", byte_value);
    SameOutputAsUnescapePercentEncodedUrl(escaped);
  }
}
// Registers SameOutputAsUnescapePercentEncodedUrl() as a fuzz test so the same
// property is also checked on fuzzer-generated inputs.
FUZZ_TEST(UrlUnescapeIteratorTest, SameOutputAsUnescapePercentEncodedUrl);
// EqualsAfterUrlDecoding() reports any string as equal to itself, including
// strings with incomplete escapes and malformed UTF-8.
TEST(UrlUnescapeIteratorTest, TrivialSelfEquals) {
  for (const char* literal : {"", "a", "word", " ", "+", "%", "%2", "%20"}) {
    EXPECT_TRUE(EqualsAfterUrlDecoding(literal, literal)) << literal;
  }

  // Applies the same check to the input of every case in a table; the
  // expected outputs are not used here.
  const auto check_table = [](base::span<const Case> table) {
    for (const Case& entry : table) {
      EXPECT_TRUE(EqualsAfterUrlDecoding(entry.input, entry.input))
          << entry.description;
    }
  };
  check_table(kGoodUtf8);
  check_table(kBadUtf8);
}
// A string compares equal to its base::EscapeAllExceptUnreserved() form, in
// either argument order, for both well-formed and malformed UTF-8 inputs.
TEST(UrlUnescapeIteratorTest, EqualsAfterEscaping) {
  const auto check_table = [](base::span<const Case> table) {
    for (const Case& entry : table) {
      const std::string escaped = base::EscapeAllExceptUnreserved(entry.input);
      EXPECT_TRUE(EqualsAfterUrlDecoding(entry.input, escaped))
          << entry.description;
      EXPECT_TRUE(EqualsAfterUrlDecoding(escaped, entry.input))
          << entry.description << ", backwards";
    }
  };
  check_table(kGoodUtf8);
  check_table(kBadUtf8);
}
// Two non-owning strings to be compared with EqualsAfterUrlDecoding() in the
// tests below. Kept as a plain aggregate: the tests brace-initialize it and
// decompose it with structured bindings.
struct StringPair {
std::string_view a;
std::string_view b;
};
// Pairs of different strings that EqualsAfterUrlDecoding() must nevertheless
// report as equal.
TEST(UrlUnescapeIteratorTest, InterestinglyEqual) {
  static constexpr StringPair kPairs[] = {
      {" ", "+"},
      {"+", "%20"},
      {"%", "%25"},
      {"%2a", "%2A"},
      {"%c2%A5", "%C2%a5"},
      {"%c2\xa5", "\xc2%a5"},
      {"%c0", "%c1"},     // both become replacement character
      {"%c2", "%ef%bf"},  // both are truncated UTF-8 codepoints
  };
  for (const StringPair& pair : kPairs) {
    EXPECT_TRUE(EqualsAfterUrlDecoding(pair.a, pair.b))
        << "(\"" << pair.a << "\", \"" << pair.b << "\")";
  }
}
// Pairs of strings that EqualsAfterUrlDecoding() must report as different:
// letter case outside an escape's hex digits is significant, and incomplete
// escapes are compared literally.
TEST(UrlUnescapeIteratorTest, Unequal) {
  static constexpr StringPair kPairs[] = {
      {"", "%00"},
      {"abc", "ABC"},
      {"\xc2\xa5", "\xc2\xa6"},
      {"%c2%a5", "%c2%a6"},
      {"%a", "%A"},
      {"%2g", "%2G"},
      {"%00a", "%00A"},
  };
  for (const StringPair& pair : kPairs) {
    EXPECT_FALSE(EqualsAfterUrlDecoding(pair.a, pair.b))
        << "(\"" << pair.a << "\", \"" << pair.b << "\")";
  }
}
#undef REPLACEMENT_CHAR
} // namespace
} // namespace net