Reland "Prevent unsafe narrowing: base/strings" This reverts commit 224425dc19627040de412f578767d3e53e076e7e. Bug: 1292951 Change-Id: I82336d78a30a164c629e7d9018c23cc60b1833ab Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3708189 Reviewed-by: danakj <danakj@chromium.org> Owners-Override: danakj <danakj@chromium.org> Commit-Queue: danakj <danakj@chromium.org> Cr-Commit-Position: refs/heads/main@{#1015046} NOKEYCHECK=True GitOrigin-RevId: 8bb45c228957681d85d0c445ef6bcb0fca3f9bcc

commit: 6254d3de5842d5c629ad9686ea11182c1e4775d6 [log] [tgz]
author: Peter Kasting <pkasting@chromium.org> Thu Jun 16 19:39:27 2022
committer: Copybara-Service <copybara-worker@google.com> Thu Jun 16 19:56:06 2022
tree: b5b8e4844943fbb5fed5a58860a23172308a0e80
parent: ba371975bdb23514d9d8dcccc55f05d78ad87af9 [diff]
diff --git a/url_canon_etc.cc b/url_canon_etc.cc
index 851926d..53b71ea 100644
--- a/url_canon_etc.cc
+++ b/url_canon_etc.cc

@@ -101,7 +101,7 @@
               const Component& scheme,
               CanonOutput* output,
               Component* out_scheme) {
-  if (scheme.len <= 0) {
+  if (!scheme.is_nonempty()) {
     // Scheme is unspecified or empty, convert to empty by appending a colon.
     *out_scheme = Component(output->length(), 0);
     output->push_back(':');
@@ -117,12 +117,13 @@
   // FindAndCompareScheme, which could cause some security checks on
   // schemes to be incorrect.
   bool success = true;
-  int end = scheme.end();
-  for (int i = scheme.begin; i < end; i++) {
+  size_t begin = static_cast<size_t>(scheme.begin);
+  size_t end = static_cast<size_t>(scheme.end());
+  for (size_t i = begin; i < end; i++) {
     UCHAR ch = static_cast<UCHAR>(spec[i]);
     char replacement = 0;
     if (ch < 0x80) {
-      if (i == scheme.begin) {
+      if (i == begin) {
         // Need to do a special check for the first letter of the scheme.
         if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
           replacement = kSchemeCanonical[ch];
@@ -179,8 +180,9 @@
   out_username->begin = output->length();
   if (username.len > 0) {
     // This will escape characters not valid for the username.
-    AppendStringOfType(&username_spec[username.begin], username.len,
-                       CHAR_USERINFO, output);
+    AppendStringOfType(&username_spec[username.begin],
+                       static_cast<size_t>(username.len), CHAR_USERINFO,
+                       output);
   }
   out_username->len = output->length() - out_username->begin;
 
@@ -189,8 +191,9 @@
   if (password.len > 0) {
     output->push_back(':');
     out_password->begin = output->length();
-    AppendStringOfType(&password_spec[password.begin], password.len,
-                       CHAR_USERINFO, output);
+    AppendStringOfType(&password_spec[password.begin],
+                       static_cast<size_t>(password.len), CHAR_USERINFO,
+                       output);
     out_password->len = output->length() - out_password->begin;
   } else {
     *out_password = Component();
@@ -223,7 +226,8 @@
     // what the error was, and mark the URL as invalid by returning false.
     output->push_back(':');
     out_port->begin = output->length();
-    AppendInvalidNarrowString(spec, port.begin, port.end(), output);
+    AppendInvalidNarrowString(spec, static_cast<size_t>(port.begin),
+                              static_cast<size_t>(port.end()), output);
     out_port->len = output->length() - out_port->begin;
     return false;
   }
@@ -285,7 +289,7 @@
                        const Component& ref,
                        CanonOutput* output,
                        Component* out_ref) {
-  if (ref.len < 0) {
+  if (!ref.is_valid()) {
     // Common case of no ref.
     *out_ref = Component();
     return;
@@ -297,8 +301,8 @@
   out_ref->begin = output->length();
 
   // Now iterate through all the characters, converting to UTF-8 and validating.
-  int end = ref.end();
-  for (int i = ref.begin; i < end; i++) {
+  size_t end = static_cast<size_t>(ref.end());
+  for (size_t i = static_cast<size_t>(ref.begin); i < end; i++) {
     UCHAR current_char = static_cast<UCHAR>(spec[i]);
     if (current_char < 0x80) {
       if (kShouldEscapeCharInFragment[current_char])

diff --git a/url_canon_host.cc b/url_canon_host.cc
index c2cd9d1..5f7bf71 100644
--- a/url_canon_host.cc
+++ b/url_canon_host.cc

@@ -123,15 +123,15 @@
 //    |*has_non_ascii| flag.
 //
 // The return value indicates if the output is a potentially valid host name.
-template<typename INCHAR, typename OUTCHAR>
+template <typename INCHAR, typename OUTCHAR>
 bool DoSimpleHost(const INCHAR* host,
-                  int host_len,
+                  size_t host_len,
                   CanonOutputT<OUTCHAR>* output,
                   bool* has_non_ascii) {
   *has_non_ascii = false;
 
   bool success = true;
-  for (int i = 0; i < host_len; ++i) {
+  for (size_t i = 0; i < host_len; ++i) {
     unsigned int source = host[i];
     if (source == '%') {
       // Unescape first, if possible.
@@ -175,7 +175,7 @@
 }
 
 // Canonicalizes a host that requires IDN conversion. Returns true on success
-bool DoIDNHost(const char16_t* src, int src_len, CanonOutput* output) {
+bool DoIDNHost(const char16_t* src, size_t src_len, CanonOutput* output) {
   int original_output_len = output->length();  // So we can rewind below.
 
   // We need to escape URL before doing IDN conversion, since punicode strings
@@ -202,8 +202,8 @@
   // unescaping. Although we unescaped everything before this function call, if
   // somebody does %00 as fullwidth, ICU will convert this to ASCII.
   bool success = DoSimpleHost(wide_output.data(),
-                              wide_output.length(),
-                              output, &has_non_ascii);
+                              static_cast<size_t>(wide_output.length()), output,
+                              &has_non_ascii);
   if (has_non_ascii) {
     // ICU generated something that DoSimpleHost didn't think looked like
     // ASCII. This is quite rare, but ICU might convert some characters to
@@ -220,7 +220,8 @@
     // ASCII isn't strictly necessary, but DoSimpleHost handles this case
     // anyway so we handle it/
     output->set_length(original_output_len);
-    AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(),
+    AppendInvalidNarrowString(wide_output.data(), 0,
+                              static_cast<size_t>(wide_output.length()),
                               output);
     return false;
   }
@@ -230,8 +231,11 @@
 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to
 // UTF-16. The has_escaped flag should be set if the input string requires
 // unescaping.
-bool DoComplexHost(const char* host, int host_len,
-                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
+bool DoComplexHost(const char* host,
+                   size_t host_len,
+                   bool has_non_ascii,
+                   bool has_escaped,
+                   CanonOutput* output) {
   // Save the current position in the output. We may write stuff and rewind it
   // below, so we need to know where to rewind to.
   int begin_length = output->length();
@@ -239,7 +243,7 @@
   // Points to the UTF-8 data we want to convert. This will either be the
   // input or the unescaped version written to |*output| if necessary.
   const char* utf8_source;
-  int utf8_source_len;
+  size_t utf8_source_len;
   bool are_all_escaped_valid = true;
   if (has_escaped) {
     // Unescape before converting to UTF-16 for IDN. We write this into the
@@ -264,7 +268,7 @@
     // Save the pointer into the data was just converted (it may be appended to
     // other data in the output buffer).
     utf8_source = &output->data()[begin_length];
-    utf8_source_len = output->length() - begin_length;
+    utf8_source_len = static_cast<size_t>(output->length() - begin_length);
   } else {
     // We don't need to unescape, use input for IDNization later. (We know the
     // input has non-ASCII, or the simple version would have been called
@@ -280,17 +284,18 @@
   if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
     // In this error case, the input may or may not be the output.
     StackBuffer utf8;
-    for (int i = 0; i < utf8_source_len; i++)
+    for (size_t i = 0; i < utf8_source_len; i++)
       utf8.push_back(utf8_source[i]);
     output->set_length(begin_length);
-    AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
+    AppendInvalidNarrowString(utf8.data(), 0,
+                              static_cast<size_t>(utf8.length()), output);
     return false;
   }
   output->set_length(begin_length);
 
   // This will call DoSimpleHost which will do normal ASCII canonicalization
   // and also check for IP addresses in the outpt.
-  return DoIDNHost(utf16.data(), utf16.length(), output) &&
+  return DoIDNHost(utf16.data(), static_cast<size_t>(utf16.length()), output) &&
          are_all_escaped_valid;
 }
 
@@ -298,7 +303,7 @@
 // the backend, so we just pass through. The has_escaped flag should be set if
 // the input string requires unescaping.
 bool DoComplexHost(const char16_t* host,
-                   int host_len,
+                   size_t host_len,
                    bool has_non_ascii,
                    bool has_escaped,
                    CanonOutput* output) {
@@ -319,8 +324,8 @@
 
     // Once we convert to UTF-8, we can use the 8-bit version of the complex
     // host handling code above.
-    return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
-                         has_escaped, output);
+    return DoComplexHost(utf8.data(), static_cast<size_t>(utf8.length()),
+                         has_non_ascii, has_escaped, output);
   }
 
   // No unescaping necessary, we can safely pass the input to ICU. This
@@ -334,16 +339,18 @@
 bool DoHostSubstring(const CHAR* spec,
                      const Component& host,
                      CanonOutput* output) {
+  DCHECK(host.is_valid());
+
   bool has_non_ascii, has_escaped;
   ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
 
   if (has_non_ascii || has_escaped) {
-    return DoComplexHost(&spec[host.begin], host.len, has_non_ascii,
-                         has_escaped, output);
+    return DoComplexHost(&spec[host.begin], static_cast<size_t>(host.len),
+                         has_non_ascii, has_escaped, output);
   }
 
-  const bool success =
-      DoSimpleHost(&spec[host.begin], host.len, output, &has_non_ascii);
+  const bool success = DoSimpleHost(
+      &spec[host.begin], static_cast<size_t>(host.len), output, &has_non_ascii);
   DCHECK(!has_non_ascii);
   return success;
 }
@@ -353,7 +360,7 @@
             const Component& host,
             CanonOutput* output,
             CanonHostInfo* host_info) {
-  if (host.len <= 0) {
+  if (!host.is_nonempty()) {
     // Empty hosts don't need anything.
     host_info->family = CanonHostInfo::NEUTRAL;
     host_info->out_host = Component();

diff --git a/url_canon_internal.cc b/url_canon_internal.cc
index 48a1e74..192feb9 100644
--- a/url_canon_internal.cc
+++ b/url_canon_internal.cc

@@ -11,17 +11,19 @@
 #include <cstdio>
 #include <string>
 
+#include "base/numerics/safe_conversions.h"
 #include "base/strings/utf_string_conversion_utils.h"
 
 namespace url {
 
 namespace {
 
-template<typename CHAR, typename UCHAR>
-void DoAppendStringOfType(const CHAR* source, int length,
+template <typename CHAR, typename UCHAR>
+void DoAppendStringOfType(const CHAR* source,
+                          size_t length,
                           SharedCharTypes type,
                           CanonOutput* output) {
-  for (int i = 0; i < length; i++) {
+  for (size_t i = 0; i < length; i++) {
     if (static_cast<UCHAR>(source[i]) >= 0x80) {
       // ReadChar will fill the code point with kUnicodeReplacementCharacter
       // when the input is invalid, which is what we want.
@@ -41,10 +43,12 @@
 
 // This function assumes the input values are all contained in 8-bit,
 // although it allows any type. Returns true if input is valid, false if not.
-template<typename CHAR, typename UCHAR>
-void DoAppendInvalidNarrowString(const CHAR* spec, int begin, int end,
+template <typename CHAR, typename UCHAR>
+void DoAppendInvalidNarrowString(const CHAR* spec,
+                                 size_t begin,
+                                 size_t end,
                                  CanonOutput* output) {
-  for (int i = begin; i < end; i++) {
+  for (size_t i = begin; i < end; i++) {
     UCHAR uch = static_cast<UCHAR>(spec[i]);
     if (uch >= 0x80) {
       // Handle UTF-8/16 encodings. This call will correctly handle the error
@@ -98,7 +102,8 @@
       // Convert to UTF-8.
       dest_component->begin = utf8_buffer->length();
       success = ConvertUTF16ToUTF8(&override_source[override_component.begin],
-                                   override_component.len, utf8_buffer);
+                                   static_cast<size_t>(override_component.len),
+                                   utf8_buffer);
       dest_component->len = utf8_buffer->length() - dest_component->begin;
     }
   }
@@ -235,26 +240,24 @@
 
 const base_icu::UChar32 kUnicodeReplacementCharacter = 0xfffd;
 
-void AppendStringOfType(const char* source, int length,
+void AppendStringOfType(const char* source,
+                        size_t length,
                         SharedCharTypes type,
                         CanonOutput* output) {
   DoAppendStringOfType<char, unsigned char>(source, length, type, output);
 }
 
 void AppendStringOfType(const char16_t* source,
-                        int length,
+                        size_t length,
                         SharedCharTypes type,
                         CanonOutput* output) {
   DoAppendStringOfType<char16_t, char16_t>(source, length, type, output);
 }
 
 bool ReadUTFChar(const char* str,
-                 int* begin,
-                 int length,
+                 size_t* begin,
+                 size_t length,
                  base_icu::UChar32* code_point_out) {
-  // This depends on ints and int32s being the same thing. If they're not, it
-  // will fail to compile.
-  // TODO(mmenke): This should probably be fixed.
   if (!base::ReadUnicodeCharacter(str, length, begin, code_point_out) ||
       !base::IsValidCharacter(*code_point_out)) {
     *code_point_out = kUnicodeReplacementCharacter;
@@ -264,12 +267,9 @@
 }
 
 bool ReadUTFChar(const char16_t* str,
-                 int* begin,
-                 int length,
+                 size_t* begin,
+                 size_t length,
                  base_icu::UChar32* code_point_out) {
-  // This depends on ints and int32s being the same thing. If they're not, it
-  // will fail to compile.
-  // TODO(mmenke): This should probably be fixed.
   if (!base::ReadUnicodeCharacter(str, length, begin, code_point_out) ||
       !base::IsValidCharacter(*code_point_out)) {
     *code_point_out = kUnicodeReplacementCharacter;
@@ -278,23 +278,25 @@
   return true;
 }
 
-void AppendInvalidNarrowString(const char* spec, int begin, int end,
+void AppendInvalidNarrowString(const char* spec,
+                               size_t begin,
+                               size_t end,
                                CanonOutput* output) {
   DoAppendInvalidNarrowString<char, unsigned char>(spec, begin, end, output);
 }
 
 void AppendInvalidNarrowString(const char16_t* spec,
-                               int begin,
-                               int end,
+                               size_t begin,
+                               size_t end,
                                CanonOutput* output) {
   DoAppendInvalidNarrowString<char16_t, char16_t>(spec, begin, end, output);
 }
 
 bool ConvertUTF16ToUTF8(const char16_t* input,
-                        int input_len,
+                        size_t input_len,
                         CanonOutput* output) {
   bool success = true;
-  for (int i = 0; i < input_len; i++) {
+  for (size_t i = 0; i < input_len; i++) {
     base_icu::UChar32 code_point;
     success &= ReadUTFChar(input, &i, input_len, &code_point);
     AppendUTF8Value(code_point, output);
@@ -303,10 +305,10 @@
 }
 
 bool ConvertUTF8ToUTF16(const char* input,
-                        int input_len,
+                        size_t input_len,
                         CanonOutputT<char16_t>* output) {
   bool success = true;
-  for (int i = 0; i < input_len; i++) {
+  for (size_t i = 0; i < input_len; i++) {
     base_icu::UChar32 code_point;
     success &= ReadUTFChar(input, &i, input_len, &code_point);
     AppendUTF16Value(code_point, output);

diff --git a/url_canon_internal.h b/url_canon_internal.h
index def4636..ec5b61c 100644
--- a/url_canon_internal.h
+++ b/url_canon_internal.h

@@ -77,11 +77,12 @@
 
 // Appends the given string to the output, escaping characters that do not
 // match the given |type| in SharedCharTypes.
-void AppendStringOfType(const char* source, int length,
+void AppendStringOfType(const char* source,
+                        size_t length,
                         SharedCharTypes type,
                         CanonOutput* output);
 void AppendStringOfType(const char16_t* source,
-                        int length,
+                        size_t length,
                         SharedCharTypes type,
                         CanonOutput* output);
 
@@ -107,8 +108,8 @@
 // Indicates if the given character is a dot or dot equivalent, returning the
 // number of characters taken by it. This will be one for a literal dot, 3 for
 // an escaped dot. If the character is not a dot, this will return 0.
-template<typename CHAR>
-inline int IsDot(const CHAR* spec, int offset, int end) {
+template <typename CHAR>
+inline size_t IsDot(const CHAR* spec, size_t offset, size_t end) {
   if (spec[offset] == '.') {
     return 1;
   } else if (spec[offset] == '%' && offset + 3 <= end &&
@@ -154,8 +155,8 @@
 // (for a single-byte ASCII character, it will not be changed).
 COMPONENT_EXPORT(URL)
 bool ReadUTFChar(const char* str,
-                 int* begin,
-                 int length,
+                 size_t* begin,
+                 size_t length,
                  base_icu::UChar32* code_point_out);
 
 // Generic To-UTF-8 converter. This will call the given append method for each
@@ -231,8 +232,8 @@
 // (for a single-16-bit-word character, it will not be changed).
 COMPONENT_EXPORT(URL)
 bool ReadUTFChar(const char16_t* str,
-                 int* begin,
-                 int length,
+                 size_t* begin,
+                 size_t length,
                  base_icu::UChar32* code_point_out);
 
 // Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method.
@@ -268,8 +269,8 @@
 // Assumes that ch[begin] is within range in the array, but does not assume
 // that any following characters are.
 inline bool AppendUTF8EscapedChar(const char16_t* str,
-                                  int* begin,
-                                  int length,
+                                  size_t* begin,
+                                  size_t length,
                                   CanonOutput* output) {
   // UTF-16 input. ReadUTFChar will handle invalid characters for us and give
   // us the kUnicodeReplacementCharacter, so we don't have to do special
@@ -281,7 +282,9 @@
 }
 
 // Handles UTF-8 input. See the wide version above for usage.
-inline bool AppendUTF8EscapedChar(const char* str, int* begin, int length,
+inline bool AppendUTF8EscapedChar(const char* str,
+                                  size_t* begin,
+                                  size_t length,
                                   CanonOutput* output) {
   // ReadUTF8Char will handle invalid characters for us and give us the
   // kUnicodeReplacementCharacter, so we don't have to do special checking
@@ -308,8 +311,10 @@
   return c <= 255;
 }
 
-template<typename CHAR>
-inline bool DecodeEscaped(const CHAR* spec, int* begin, int end,
+template <typename CHAR>
+inline bool DecodeEscaped(const CHAR* spec,
+                          size_t* begin,
+                          size_t end,
                           unsigned char* unescaped_value) {
   if (*begin + 3 > end ||
       !Is8BitChar(spec[*begin + 1]) || !Is8BitChar(spec[*begin + 2])) {
@@ -338,11 +343,13 @@
 // This is used in error cases to append invalid output so that it looks
 // approximately correct. Non-error cases should not call this function since
 // the escaping rules are not guaranteed!
-void AppendInvalidNarrowString(const char* spec, int begin, int end,
+void AppendInvalidNarrowString(const char* spec,
+                               size_t begin,
+                               size_t end,
                                CanonOutput* output);
 void AppendInvalidNarrowString(const char16_t* spec,
-                               int begin,
-                               int end,
+                               size_t begin,
+                               size_t end,
                                CanonOutput* output);
 
 // Misc canonicalization helpers ----------------------------------------------
@@ -357,11 +364,11 @@
 // normal.
 COMPONENT_EXPORT(URL)
 bool ConvertUTF16ToUTF8(const char16_t* input,
-                        int input_len,
+                        size_t input_len,
                         CanonOutput* output);
 COMPONENT_EXPORT(URL)
 bool ConvertUTF8ToUTF16(const char* input,
-                        int input_len,
+                        size_t input_len,
                         CanonOutputT<char16_t>* output);
 
 // Converts from UTF-16 to 8-bit using the character set converter. If the

diff --git a/url_canon_mailtourl.cc b/url_canon_mailtourl.cc
index f4fe2b4..ff62bea 100644
--- a/url_canon_mailtourl.cc
+++ b/url_canon_mailtourl.cc

@@ -57,8 +57,8 @@
     // Copy the path using path URL's more lax escaping rules.
     // We convert to UTF-8 and escape non-ASCII, but leave most
     // ASCII characters alone.
-    int end = parsed.path.end();
-    for (int i = parsed.path.begin; i < end; ++i) {
+    size_t end = static_cast<size_t>(parsed.path.end());
+    for (size_t i = static_cast<size_t>(parsed.path.begin); i < end; ++i) {
       UCHAR uch = static_cast<UCHAR>(source.path[i]);
       if (ShouldEncodeMailboxCharacter<UCHAR>(uch))
         success &= AppendUTF8EscapedChar(source.path, &i, end, output);

diff --git a/url_canon_path.cc b/url_canon_path.cc
index d6fb64b..f50e107 100644
--- a/url_canon_path.cc
+++ b/url_canon_path.cc

@@ -101,9 +101,11 @@
 // If the input is "../foo", |after_dot| = 1, |end| = 6, and
 // at the end, |*consumed_len| = 2 for the "./" this function consumed. The
 // original dot length should be handled by the caller.
-template<typename CHAR>
-DotDisposition ClassifyAfterDot(const CHAR* spec, int after_dot,
-                                int end, int* consumed_len) {
+template <typename CHAR>
+DotDisposition ClassifyAfterDot(const CHAR* spec,
+                                size_t after_dot,
+                                size_t end,
+                                size_t* consumed_len) {
   if (after_dot == end) {
     // Single dot at the end.
     *consumed_len = 0;
@@ -115,9 +117,9 @@
     return DIRECTORY_CUR;
   }
 
-  int second_dot_len = IsDot(spec, after_dot, end);
+  size_t second_dot_len = IsDot(spec, after_dot, end);
   if (second_dot_len) {
-    int after_second_dot = after_dot + second_dot_len;
+    size_t after_second_dot = after_dot + second_dot_len;
     if (after_second_dot == end) {
       // Double dot at the end.
       *consumed_len = second_dot_len;
@@ -193,10 +195,10 @@
 // ends with a '%' followed by one or two characters, and the '%' is the one
 // pointed to by |last_invalid_percent_index|.  The last character in the string
 // was just unescaped.
-template<typename CHAR>
+template <typename CHAR>
 void CheckForNestedEscapes(const CHAR* spec,
-                           int next_input_index,
-                           int input_len,
+                           size_t next_input_index,
+                           size_t input_len,
                            int last_invalid_percent_index,
                            CanonOutput* output) {
   const int length = output->length();
@@ -218,9 +220,10 @@
   }
 
   // Now output ends like "%cc".  Try to unescape this.
-  int begin = last_invalid_percent_index;
+  size_t begin = static_cast<size_t>(last_invalid_percent_index);
   unsigned char temp;
-  if (DecodeEscaped(output->data(), &begin, output->length(), &temp)) {
+  if (DecodeEscaped(output->data(), &begin,
+                    static_cast<size_t>(output->length()), &temp)) {
     // New escape sequence found.  Overwrite the characters following the '%'
     // with "25", and push_back() the one or two characters that were following
     // the '%' when we were called.
@@ -252,7 +255,10 @@
                            const Component& path,
                            int path_begin_in_output,
                            CanonOutput* output) {
-  int end = path.end();
+  if (!path.is_nonempty())
+    return true;
+
+  size_t end = static_cast<size_t>(path.end());
 
   // We use this variable to minimize the amount of work done when unescaping --
   // we'll only call CheckForNestedEscapes() when this points at one of the last
@@ -260,7 +266,7 @@
   int last_invalid_percent_index = INT_MIN;
 
   bool success = true;
-  for (int i = path.begin; i < end; i++) {
+  for (size_t i = static_cast<size_t>(path.begin); i < end; i++) {
     DCHECK_LT(last_invalid_percent_index, output->length());
     UCHAR uch = static_cast<UCHAR>(spec[i]);
     if (sizeof(CHAR) > 1 && uch >= 0x80) {
@@ -276,7 +282,7 @@
       unsigned char flags = kPathCharLookup[out_ch];
       if (flags & SPECIAL) {
         // Needs special handling of some sort.
-        int dotlen;
+        size_t dotlen;
         if ((dotlen = IsDot(spec, i, end)) > 0) {
           // See if this dot was preceded by a slash in the output.
           //
@@ -287,7 +293,7 @@
           if (output->length() > path_begin_in_output &&
               output->at(output->length() - 1) == '/') {
             // Slash followed by a dot, check to see if this is means relative
-            int consumed_len;
+            size_t consumed_len;
             switch (ClassifyAfterDot<CHAR>(spec, i + dotlen, end,
                                            &consumed_len)) {
               case NOT_A_DIRECTORY:

diff --git a/url_canon_pathurl.cc b/url_canon_pathurl.cc
index e726cfb..d8d65f3 100644
--- a/url_canon_pathurl.cc
+++ b/url_canon_pathurl.cc

@@ -32,8 +32,8 @@
     // https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state
     // https://url.spec.whatwg.org/#c0-control-percent-encode-set
     new_component->begin = output->length();
-    int end = component.end();
-    for (int i = component.begin; i < end; i++) {
+    size_t end = static_cast<size_t>(component.end());
+    for (size_t i = static_cast<size_t>(component.begin); i < end; i++) {
       UCHAR uch = static_cast<UCHAR>(source[i]);
       if (uch < 0x20 || uch > 0x7E)
         AppendUTF8EscapedChar(source, &i, end, output);

diff --git a/url_canon_query.cc b/url_canon_query.cc
index b3a1118..d23b45f 100644
--- a/url_canon_query.cc
+++ b/url_canon_query.cc

@@ -72,10 +72,12 @@
                   const Component& query,
                   CharsetConverter* converter,
                   CanonOutput* output) {
+  DCHECK(query.is_valid());
   // This function will replace any misencoded values with the invalid
   // character. This is what we want so we don't have to check for error.
   RawCanonOutputW<1024> utf16;
-  ConvertUTF8ToUTF16(&spec[query.begin], query.len, &utf16);
+  ConvertUTF8ToUTF16(&spec[query.begin], static_cast<size_t>(query.len),
+                     &utf16);
   converter->ConvertFromUTF16(utf16.data(), utf16.length(), output);
 }
 
@@ -86,7 +88,9 @@
                   const Component& query,
                   CharsetConverter* converter,
                   CanonOutput* output) {
-  converter->ConvertFromUTF16(&spec[query.begin], query.len, output);
+  DCHECK(query.is_valid());
+  converter->ConvertFromUTF16(&spec[query.begin],
+                              static_cast<size_t>(query.len), output);
 }
 
 template<typename CHAR, typename UCHAR>
@@ -109,7 +113,8 @@
 
     } else {
       // No converter, do our own UTF-8 conversion.
-      AppendStringOfType(&spec[query.begin], query.len, CHAR_QUERY, output);
+      AppendStringOfType(&spec[query.begin], static_cast<size_t>(query.len),
+                         CHAR_QUERY, output);
     }
   }
 }

diff --git a/url_canon_unittest.cc b/url_canon_unittest.cc
index f6ac9d4..96aa55a 100644
--- a/url_canon_unittest.cc
+++ b/url_canon_unittest.cc

@@ -173,9 +173,9 @@
       out_str.clear();
       StdStringCanonOutput output(&out_str);
 
-      int input_len = static_cast<int>(strlen(utf_cases[i].input8));
+      size_t input_len = strlen(utf_cases[i].input8);
       bool success = true;
-      for (int ch = 0; ch < input_len; ch++) {
+      for (size_t ch = 0; ch < input_len; ch++) {
         success &= AppendUTF8EscapedChar(utf_cases[i].input8, &ch, input_len,
                                          &output);
       }
@@ -189,9 +189,9 @@
 
       std::u16string input_str(
           test_utils::TruncateWStringToUTF16(utf_cases[i].input16));
-      int input_len = static_cast<int>(input_str.length());
+      size_t input_len = input_str.length();
       bool success = true;
-      for (int ch = 0; ch < input_len; ch++) {
+      for (size_t ch = 0; ch < input_len; ch++) {
         success &= AppendUTF8EscapedChar(input_str.c_str(), &ch, input_len,
                                          &output);
       }

diff --git a/url_util.cc b/url_util.cc
index 1f0de84..cd8e2e1 100644
--- a/url_util.cc
+++ b/url_util.cc

@@ -811,11 +811,15 @@
                               int length,
                               DecodeURLMode mode,
                               CanonOutputW* output) {
+  if (length <= 0)
+    return;
+
   STACK_UNINITIALIZED RawCanonOutputT<char> unescaped_chars;
-  for (int i = 0; i < length; i++) {
+  size_t length_size_t = static_cast<size_t>(length);
+  for (size_t i = 0; i < length_size_t; i++) {
     if (input[i] == '%') {
       unsigned char ch;
-      if (DecodeEscaped(input, &i, length, &ch)) {
+      if (DecodeEscaped(input, &i, length_size_t, &ch)) {
         unescaped_chars.push_back(ch);
       } else {
         // Invalid escape sequence, copy the percent literal.
@@ -830,18 +834,20 @@
   int output_initial_length = output->length();
   // Convert that 8-bit to UTF-16. It's not clear IE does this at all to
   // JavaScript URLs, but Firefox and Safari do.
-  for (int i = 0; i < unescaped_chars.length(); i++) {
-    unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i));
+  size_t unescaped_length = static_cast<size_t>(unescaped_chars.length());
+  for (size_t i = 0; i < unescaped_length; i++) {
+    unsigned char uch =
+        static_cast<unsigned char>(unescaped_chars.at(static_cast<int>(i)));
     if (uch < 0x80) {
       // Non-UTF-8, just append directly
       output->push_back(uch);
     } else {
       // next_ch will point to the last character of the decoded
       // character.
-      int next_character = i;
+      size_t next_character = i;
       base_icu::UChar32 code_point;
-      if (ReadUTFChar(unescaped_chars.data(), &next_character,
-                      unescaped_chars.length(), &code_point)) {
+      if (ReadUTFChar(unescaped_chars.data(), &next_character, unescaped_length,
+                      &code_point)) {
         // Valid UTF-8 character, convert to UTF-16.
         AppendUTF16Value(code_point, output);
         i = next_character;
commit	6254d3de5842d5c629ad9686ea11182c1e4775d6	[log] [tgz]
author	Peter Kasting <pkasting@chromium.org>	Thu Jun 16 19:39:27 2022
committer	Copybara-Service <copybara-worker@google.com>	Thu Jun 16 19:56:06 2022
tree	b5b8e4844943fbb5fed5a58860a23172308a0e80
parent	ba371975bdb23514d9d8dcccc55f05d78ad87af9 [diff]