net/base/url_unescape_iterator.cc - chromium/src - Git at Google

 // Copyright 2025 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "net/base/url_unescape_iterator.h"

 #include <algorithm>

 #include "base/containers/span.h"
 #include "base/strings/string_util.h"
 #include "base/strings/utf_string_conversion_utils.h"
 #include "base/third_party/icu/icu_utf.h"

 namespace net {

 namespace {

 // Reports whether `s` holds at least one character whose interpretation may be
 // changed by UrlUnescapeIterator. Only three kinds of byte qualify: '+', '%',
 // and any byte with the high bit set (i.e. non-ASCII). Every other ASCII
 // character is passed through unchanged, so a string without any of these
 // needs no unescaping.
 bool ContainsCharactersChangedByUnescaping(std::string_view s) {
   for (const char c : s) {
     // Compare as unsigned so the check is independent of the signedness of
     // plain `char`.
     const bool is_non_ascii = static_cast<unsigned char>(c) >= 0x80;
     if (c == '+' || c == '%' || is_non_ascii) {
       return true;
     }
   }
   return false;
 }

 }  // namespace

 // Stores the pending byte of `kReplacementCharacterInUTF8` (the one selected
 // by `replacement_character_byte_`) in `value_`, then advances the index.
 // Once the index reaches the size of `kReplacementCharacterInUTF8` it wraps
 // back to 0.
 // NOTE(review): 0 presumably means "no replacement character in progress" to
 // the rest of the iterator - confirm against the header.
 void UrlUnescapeIterator::IncrementReplacementChar() {
   value_ = kReplacementCharacterInUTF8[replacement_character_byte_];
   const bool emitted_last_byte =
       ++replacement_character_byte_ == std::size(kReplacementCharacterInUTF8);
   if (emitted_last_byte) {
     replacement_character_byte_ = 0;
   }
 }

 // Tries to decode the two hex digits of a "%XX" escape starting at `next`.
 //
 // Returns the decoded byte paired with an iterator just past the second hex
 // digit. If `next` is at `end_`, if only one character remains, or if either
 // of the two characters is not a hex digit, nothing is consumed and the result
 // is a literal '%' paired with `next` unchanged.
 std::pair<char, UrlUnescapeIterator::WrappedIterator>
 UrlUnescapeIterator::DecodePercent(WrappedIterator next) {
   if (next == end_) {
     return {'%', next};
   }

   const char high_digit = *next;
   const auto low_digit_pos = std::next(next);
   if (!base::IsHexDigit(high_digit) || low_digit_pos == end_) {
     return {'%', next};
   }

   const char low_digit = *low_digit_pos;
   if (!base::IsHexDigit(low_digit)) {
     return {'%', next};
   }

   // Combine the two nibbles, most significant first.
   const int high_nibble = base::HexDigitToInt(high_digit);
   const int low_nibble = base::HexDigitToInt(low_digit);
   return {static_cast<char>((high_nibble << 4) | low_nibble),
           std::next(low_digit_pos)};
 }

 void UrlUnescapeIterator::CheckNonAscii() {
   static constexpr size_t kMaxUtf8CharacterLength = 4u;
   // It would be ideal to use base::StreamingUtf8Validator here, but
   // unfortunately it is not compiled into Cronet builds. Instead, we determine
   // the length of the UTF-8 character based on the first byte and then decode
   // it into a temporary buffer so that we can use base::ReadUnicodeCharacter()
   // to check it for validity.
   std::array<char, kMaxUtf8CharacterLength> current_codepoint = {};
   size_t current_codepoint_size = 1u;
   if ((value_ & 0xE0) == 0xC0) {
     current_codepoint_size = 2u;
   } else if ((value_ & 0xF0) == 0xE0) {
     current_codepoint_size = 3u;
   } else if ((value_ & 0xF8) == 0xF0) {
     current_codepoint_size = 4u;
   } else {
     EmitReplacementCharacter();
     return;
   }
   current_codepoint[0] = value_;
   // Since a byte in `current_codepoint` corresponds to 1 or 3 bytes in the
   // input string, we need to keep track of where each byte was found. We don't
   // keep track of the first byte, as we will never set `next_` to point to
   // that, so iterators[0] points to current_codepoint[1] and so on.
   std::array<WrappedIterator, kMaxUtf8CharacterLength> iterators = {next_};
   for (size_t i = 1u; i < current_codepoint_size; ++i) {
     const auto current = iterators[i - 1];
     if (current == end_) {
       // There may have been a bad byte already, so we still need to call
       // ReadUnicodeCharacter() to ensure we emit the correct number of
       // replacement characters.
       break;
     }
     const auto [value, next] = DecodeAt(current);
     current_codepoint[i] = value;
     iterators[i] = next;
   }
   size_t char_index = 0;
   base_icu::UChar32 code_point_out = 0;
   const bool ok = base::ReadUnicodeCharacter(current_codepoint.data(),
                                              current_codepoint_size,
                                              &char_index, &code_point_out);
   if (!ok) {
     next_ = iterators[char_index];
     EmitReplacementCharacter();
     return;
   }

   remaining_checked_output_bytes_ = current_codepoint_size - 1;
 }

 // Begins emitting a replacement character: the first byte of
 // `kReplacementCharacterInUTF8` becomes the current `value_`, and
 // `replacement_character_byte_` is set to 1, the index that
 // IncrementReplacementChar() reads from next.
 void UrlUnescapeIterator::EmitReplacementCharacter() {
   replacement_character_byte_ = 1;
   value_ = kReplacementCharacterInUTF8[0];
 }

 // Returns true if `a` and `b` produce the same byte sequence when each is run
 // through UrlUnescapeIterator. Two shortcuts avoid the element-by-element
 // comparison where the answer is already known.
 bool EqualsAfterUrlDecoding(std::string_view a, std::string_view b) {
   // Shortcut 1: UrlUnescapeIterator is deterministic, so identical inputs
   // always decode to identical outputs.
   const bool identical_before_decoding = (a == b);
   if (identical_before_decoding) {
     return true;
   }

   // Shortcut 2: the inputs differ, and if neither contains a character that
   // unescaping can change, they decode to themselves and therefore still
   // differ afterwards.
   const bool decoding_may_change_input =
       ContainsCharactersChangedByUnescaping(a) ||
       ContainsCharactersChangedByUnescaping(b);
   if (!decoding_may_change_input) {
     return false;
   }

   // General case: compare the decoded sequences element by element.
   return std::ranges::equal(MakeUrlUnescapeRange(a), MakeUrlUnescapeRange(b));
 }

 }  // namespace net
	// Copyright 2025 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "net/base/url_unescape_iterator.h"

	#include <algorithm>

	#include "base/containers/span.h"
	#include "base/strings/string_util.h"
	#include "base/strings/utf_string_conversion_utils.h"
	#include "base/third_party/icu/icu_utf.h"

	namespace net {

	namespace {

	// Returns true if `s` contains any characters whose interpretation may be
	// changed by UrlUnescapeIterator. ASCII characters are passed through
	// unchanged, except for '+' and '%'; every byte with the high bit set
	// (non-ASCII) also counts as potentially changed.
	bool ContainsCharactersChangedByUnescaping(std::string_view s) {
	  // The predicate flags '+', '%' and any non-ASCII byte. The operators here
	  // must be plain `||`; a backslash-escaped pipe is not valid C++.
	  const auto may_be_changed = [](char c) {
	    return c == '+' || c == '%' || (c & 0x80) != 0;
	  };
	  return std::any_of(s.begin(), s.end(), may_be_changed);
	}

	} // namespace

	// Stores the pending byte of `kReplacementCharacterInUTF8` (the one selected
	// by `replacement_character_byte_`) in `value_`, then advances the index.
	// Once the index reaches the size of `kReplacementCharacterInUTF8` it wraps
	// back to 0.
	// NOTE(review): 0 presumably means "no replacement character in progress" to
	// the rest of the iterator - confirm against the header.
	void UrlUnescapeIterator::IncrementReplacementChar() {
	  value_ = kReplacementCharacterInUTF8[replacement_character_byte_];
	  const bool emitted_last_byte =
	      ++replacement_character_byte_ == std::size(kReplacementCharacterInUTF8);
	  if (emitted_last_byte) {
	    replacement_character_byte_ = 0;
	  }
	}

	// Tries to decode the two hex digits of a "%XX" escape starting at `next`.
	//
	// Returns the decoded byte paired with an iterator just past the second hex
	// digit. If `next` is at `end_`, if only one character remains, or if either
	// of the two characters is not a hex digit, nothing is consumed and the
	// result is a literal '%' paired with `next` unchanged.
	std::pair<char, UrlUnescapeIterator::WrappedIterator>
	UrlUnescapeIterator::DecodePercent(WrappedIterator next) {
	  if (next == end_) {
	    return {'%', next};
	  }
	  const char most_sig_digit = *next;
	  if (!base::IsHexDigit(most_sig_digit)) {
	    return {'%', next};
	  }
	  const auto next2 = std::next(next);
	  if (next2 == end_) {
	    return {'%', next};
	  }
	  const char least_sig_digit = *next2;
	  if (!base::IsHexDigit(least_sig_digit)) {
	    return {'%', next};
	  }

	  // Combine the two nibbles with a plain bitwise OR; a backslash-escaped
	  // pipe is not valid C++. The parentheses make the intended precedence
	  // explicit.
	  const char value =
	      static_cast<char>((base::HexDigitToInt(most_sig_digit) << 4) |
	                        base::HexDigitToInt(least_sig_digit));
	  return {value, std::next(next2)};
	}

	void UrlUnescapeIterator::CheckNonAscii() {
	static constexpr size_t kMaxUtf8CharacterLength = 4u;
	// It would be ideal to use base::StreamingUtf8Validator here, but
	// unfortunately it is not compiled into Cronet builds. Instead, we determine
	// the length of the UTF-8 character based on the first byte and then decode
	// it into a temporary buffer so that we can use base::ReadUnicodeCharacter()
	// to check it for validity.
	std::array<char, kMaxUtf8CharacterLength> current_codepoint = {};
	size_t current_codepoint_size = 1u;
	if ((value_ & 0xE0) == 0xC0) {
	current_codepoint_size = 2u;
	} else if ((value_ & 0xF0) == 0xE0) {
	current_codepoint_size = 3u;
	} else if ((value_ & 0xF8) == 0xF0) {
	current_codepoint_size = 4u;
	} else {
	EmitReplacementCharacter();
	return;
	}
	current_codepoint[0] = value_;
	// Since a byte in `current_codepoint` corresponds to 1 or 3 bytes in the
	// input string, we need to keep track of where each byte was found. We don't
	// keep track of the first byte, as we will never set `next_` to point to
	// that, so iterators[0] points to current_codepoint[1] and so on.
	std::array<WrappedIterator, kMaxUtf8CharacterLength> iterators = {next_};
	for (size_t i = 1u; i < current_codepoint_size; ++i) {
	const auto current = iterators[i - 1];
	if (current == end_) {
	// There may have been a bad byte already, so we still need to call
	// ReadUnicodeCharacter() to ensure we emit the correct number of
	// replacement characters.
	break;
	}
	const auto [value, next] = DecodeAt(current);
	current_codepoint[i] = value;
	iterators[i] = next;
	}
	size_t char_index = 0;
	base_icu::UChar32 code_point_out = 0;
	const bool ok = base::ReadUnicodeCharacter(current_codepoint.data(),
	current_codepoint_size,
	&char_index, &code_point_out);
	if (!ok) {
	next_ = iterators[char_index];
	EmitReplacementCharacter();
	return;
	}

	remaining_checked_output_bytes_ = current_codepoint_size - 1;
	}

	// Begins emitting a replacement character: the first byte of
	// `kReplacementCharacterInUTF8` becomes the current `value_`, and
	// `replacement_character_byte_` is set to 1, the index that
	// IncrementReplacementChar() reads from next.
	void UrlUnescapeIterator::EmitReplacementCharacter() {
	  replacement_character_byte_ = 1;
	  value_ = kReplacementCharacterInUTF8[0];
	}

	// Returns true if `a` and `b` produce the same byte sequence when each is
	// run through UrlUnescapeIterator. Two shortcuts avoid the
	// element-by-element comparison where the answer is already known.
	bool EqualsAfterUrlDecoding(std::string_view a, std::string_view b) {
	  // Shortcut 1: UrlUnescapeIterator is deterministic, so identical inputs
	  // always decode to identical outputs.
	  const bool identical_before_decoding = (a == b);
	  if (identical_before_decoding) {
	    return true;
	  }

	  // Shortcut 2: the inputs differ, and if neither contains a character that
	  // unescaping can change, they decode to themselves and therefore still
	  // differ afterwards.
	  const bool decoding_may_change_input =
	      ContainsCharactersChangedByUnescaping(a) ||
	      ContainsCharactersChangedByUnescaping(b);
	  if (!decoding_may_change_input) {
	    return false;
	  }

	  // General case: compare the decoded sequences element by element.
	  return std::ranges::equal(MakeUrlUnescapeRange(a),
	                            MakeUrlUnescapeRange(b));
	}

	} // namespace net