third_party/blink/renderer/platform/wtf/text/utf8.cc - chromium/src.git - Git at Google

 /*
  * Copyright (C) 2007 Apple Inc.  All rights reserved.
  * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "third_party/blink/renderer/platform/wtf/text/utf8.h"

 #include <unicode/utf16.h>

 #include <array>

 #include "base/check.h"
 #include "base/notreached.h"
 #include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
 #include "third_party/blink/renderer/platform/wtf/text/character_names.h"
 #include "third_party/blink/renderer/platform/wtf/text/string_hasher.h"

 namespace blink::unicode {

 namespace {

 // Maps a lead byte to the total length of the UTF-8 sequence it introduces.
 // Returns 2, 3 or 4 for lead bytes of the form 110xxxxx, 1110xxxx and
 // 11110xxx respectively, and 0 for every other byte (ASCII, continuation
 // bytes 10xxxxxx, and 11111xxx). Only the bit pattern is examined here.
 inline size_t InlineUtf8SequenceLengthNonAscii(uint8_t b0) {
   // The top nibble alone selects the length class.
   switch (b0 >> 4) {
     case 0xC:
     case 0xD:
       return 2;
     case 0xE:
       return 3;
     case 0xF:
       // 11110xxx starts a 4-byte sequence; 11111xxx is never a lead byte.
       return (b0 & 0x08) ? 0 : 4;
     default:
       return 0;
   }
 }

 // Returns the byte length of the UTF-8 sequence that starts with |b0|:
 // 1 for ASCII, otherwise whatever InlineUtf8SequenceLengthNonAscii() reports
 // (0 when |b0| cannot start a sequence).
 inline size_t InlineUtf8SequenceLength(uint8_t b0) {
   if (IsASCII(b0)) {
     return 1;
   }
   return InlineUtf8SequenceLengthNonAscii(b0);
 }

 // Lead-byte marker bits, indexed by the total number of bytes in the encoded
 // UTF-8 sequence. Once a code point's bits have been split into bytes, the
 // encoders in this file OR kFirstByteMark[n] into the first byte of an
 // n-byte sequence.
 //
 // Index 0 is a placeholder and index 1 is 0x00 because a single-byte
 // sequence carries no marker. The table also has entries for 5- and 6-byte
 // sequences, but *legal* UTF-8 is 4 or fewer bytes total and the encoders
 // in this file only ever index 1 through 4.
 static constexpr std::array<uint8_t, 7> kFirstByteMark = {
     0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};

 // Encodes the Latin-1 characters in |source| as UTF-8 into |target|.
 //
 // Characters below 0x80 take one byte; all others take two. On return both
 // spans have been advanced past what was consumed and written. Returns
 // kTargetExhausted when |target| cannot hold the next character (that
 // character is left unconsumed), otherwise kConversionOK.
 ConversionStatus ConvertLatin1ToUtf8Internal(base::span<const LChar>& source,
                                              base::span<uint8_t>& target) {
   ConversionStatus status = kConversionOK;
   size_t consumed = 0;
   size_t written = 0;

   for (; consumed < source.size(); ++consumed) {
     const UChar32 code_point = static_cast<UChar32>(source[consumed]);
     const size_t needed = (code_point < static_cast<UChar32>(0x80)) ? 1 : 2;

     // Stop before a character that does not fit, without consuming it.
     if (target.size() - written < needed) {
       status = kTargetExhausted;
       break;
     }

     if (needed == 1) {
       target[written] = static_cast<uint8_t>(code_point | kFirstByteMark[1]);
     } else {
       // Lead byte carries the top bits, continuation byte the low six.
       target[written] =
           static_cast<uint8_t>((code_point >> 6) | kFirstByteMark[2]);
       target[written + 1] = static_cast<uint8_t>((code_point | 0x80) & 0xBF);
     }
     written += needed;
   }

   source = source.subspan(consumed);
   target = target.subspan(written);
   return status;
 }

 // Encodes the UTF-16 code units in |source| as UTF-8 into |target|,
 // advancing both spans past what was consumed and written.
 //
 // Returns:
 //   kConversionOK     - all of |source| was converted.
 //   kSourceExhausted  - |source| ends with a high surrogate (strict or not).
 //   kSourceIllegal    - |strict| is set and an unpaired surrogate was found.
 //   kTargetExhausted  - |target| cannot hold the next character.
 // On any non-OK status |source| is left pointing at the offending code unit.
 // When |strict| is false, unpaired surrogates are encoded as ordinary
 // 3-byte sequences.
 ConversionStatus ConvertUtf16ToUtf8Internal(base::span<const UChar>& source,
                                             base::span<uint8_t>& target,
                                             bool strict) {
   const auto is_high_surrogate = [](UChar32 c) {
     return c >= 0xD800 && c <= 0xDBFF;
   };
   const auto is_low_surrogate = [](UChar32 c) {
     return c >= 0xDC00 && c <= 0xDFFF;
   };

   ConversionStatus status = kConversionOK;
   size_t read_pos = 0;
   size_t write_pos = 0;

   while (read_pos < source.size()) {
     UChar32 code_point = static_cast<UChar32>(source[read_pos]);
     size_t units_used = 1;

     if (is_high_surrogate(code_point)) {
       // A high surrogate needs the following code unit to be decoded.
       if (read_pos + 1 >= source.size()) {
         status = kSourceExhausted;
         break;
       }
       const UChar32 next = static_cast<UChar32>(source[read_pos + 1]);
       if (is_low_surrogate(next)) {
         code_point =
             ((code_point - 0xD800) << 10) + (next - 0xDC00) + 0x0010000;
         units_used = 2;
       } else if (strict) {
         // Unpaired high surrogate.
         status = kSourceIllegal;
         break;
       }
     } else if (strict && is_low_surrogate(code_point)) {
       // Unpaired low surrogate.
       status = kSourceIllegal;
       break;
     }

     // Work out how many bytes the encoded form needs.
     size_t encoded_length = 0;
     if (code_point < static_cast<UChar32>(0x80)) {
       encoded_length = 1;
     } else if (code_point < static_cast<UChar32>(0x800)) {
       encoded_length = 2;
     } else if (code_point < static_cast<UChar32>(0x10000)) {
       encoded_length = 3;
     } else if (code_point < static_cast<UChar32>(0x110000)) {
       encoded_length = 4;
     } else {
       // Surrogate pairs cannot represent codepoints higher than 0x10FFFF, so
       // this should not be reachable.
       NOTREACHED();
     }

     // Stop before a character that does not fit, without consuming it.
     if (target.size() - write_pos < encoded_length) {
       status = kTargetExhausted;
       break;
     }

     // Fill continuation bytes from last to first, six bits at a time; what
     // remains goes into the lead byte together with its length marker.
     for (size_t i = encoded_length; i > 1; --i) {
       target[write_pos + i - 1] =
           static_cast<uint8_t>((code_point | 0x80) & 0xBF);
       code_point >>= 6;
     }
     target[write_pos] =
         static_cast<uint8_t>(code_point | kFirstByteMark[encoded_length]);

     write_pos += encoded_length;
     read_pos += units_used;
   }

   source = source.subspan(read_pos);
   target = target.subspan(write_pos);
   return status;
 }

 // Reports whether |source| holds exactly one well-formed UTF-8 sequence.
 // The caller is expected to size |source| from the lead byte; sizes outside
 // 1..4 are rejected. Overlong forms, encoded surrogates (lead 0xED with a
 // second byte above 0x9F) and values past 0xF4 0x8F are all rejected.
 bool IsLegalUtf8(const base::span<const uint8_t> source) {
   const size_t length = source.size();
   if (length == 0 || length > 4) {
     return false;
   }

   const auto is_continuation = [](uint8_t byte) {
     return byte >= 0x80 && byte <= 0xBF;
   };

   // Third and fourth bytes are plain continuation bytes.
   if (length == 4 && !is_continuation(source[3])) {
     return false;
   }
   if (length >= 3 && !is_continuation(source[2])) {
     return false;
   }

   const uint8_t lead = source[0];
   if (length >= 2) {
     // The allowed range of the second byte depends on the lead byte.
     uint8_t lowest = 0x80;
     uint8_t highest = 0xBF;
     switch (lead) {
       case 0xE0:
         lowest = 0xA0;
         break;
       case 0xED:
         highest = 0x9F;
         break;
       case 0xF0:
         lowest = 0x90;
         break;
       case 0xF4:
         highest = 0x8F;
         break;
       default:
         break;
     }
     const uint8_t second = source[1];
     if (second < lowest || second > highest) {
       return false;
     }
   }

   // 0x80..0xC1 can never be a lead byte.
   if (lead >= 0x80 && lead < 0xC2) {
     return false;
   }
   return lead <= 0xF4;
 }

 // Decodes the UTF-8 sequence of |length| bytes at the start of |source| into
 // a code point. |length| must be 1..4 (DCHECKed). No validation is done
 // here; the payload bits are simply masked out and combined.
 inline UChar32 ReadUtf8Sequence(base::span<const uint8_t> source,
                                 size_t length) {
   DCHECK_LT(0u, length);
   DCHECK_GT(5u, length);

   switch (length) {
     case 1:
       return source[0];
     case 2:
       // 110xxxxx 10xxxxxx
       return ((source[0] & 0x1F) << 6) | (source[1] & 0x3F);
     case 3:
       // 1110xxxx 10xxxxxx 10xxxxxx
       return ((source[0] & 0x0F) << 12) | ((source[1] & 0x3F) << 6) |
              (source[2] & 0x3F);
     default:
       // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
       return ((source[0] & 0x07) << 18) | ((source[1] & 0x3F) << 12) |
              ((source[2] & 0x3F) << 6) | (source[3] & 0x3F);
   }
 }

 // Decodes UTF-8 |source| into UTF-16 |target|, advancing both spans past the
 // bytes consumed and code units produced.
 //
 // Returns:
 //   kConversionOK     - all of |source| was converted.
 //   kSourceExhausted  - |source| ends in the middle of a sequence.
 //   kSourceIllegal    - the next sequence fails IsLegalUtf8(), or decodes to
 //                       a surrogate while |strict| is set.
 //   kTargetExhausted  - |target| cannot hold the next character.
 // On any non-OK status the spans stop at the offending sequence. When
 // |strict| is false a decoded surrogate is written as kReplacementCharacter.
 ConversionStatus ConvertUtf8ToUtf16Internal(base::span<const uint8_t>& source,
                                             base::span<UChar>& target,
                                             bool strict) {
   using Word = uintptr_t;
   constexpr size_t kBytesPerWord = sizeof(Word);
   // One high bit per byte: any set bit means a non-ASCII byte in the word.
   constexpr Word kHighBits =
       (kBytesPerWord == 8) ? 0x8080808080808080ULL : 0x80808080UL;
   constexpr uintptr_t kMisalignmentBits = kBytesPerWord - 1;

   ConversionStatus status = kConversionOK;

   while (!source.empty()) {
     // Fast path: a whole aligned word of ASCII is widened in one go.
     const bool has_full_word =
         source.size() >= kBytesPerWord && target.size() >= kBytesPerWord;
     if (has_full_word &&
         (reinterpret_cast<uintptr_t>(source.data()) & kMisalignmentBits) ==
             0) {
       const Word chunk = *reinterpret_cast<const Word*>(source.data());
       if ((chunk & kHighBits) == 0) {
         for (size_t i = 0; i < kBytesPerWord; ++i) {
           target[i] = source[i];
         }
         source = source.subspan(kBytesPerWord);
         target = target.subspan(kBytesPerWord);
         continue;
       }
     }

     // Scalar path: decode exactly one character.
     const size_t sequence_length = InlineUtf8SequenceLength(source[0]);
     if (sequence_length > source.size()) {
       status = kSourceExhausted;
       break;
     }

     // Well-formedness is checked in both lenient and strict mode.
     if (!IsLegalUtf8(source.first(sequence_length))) {
       status = kSourceIllegal;
       break;
     }

     const UChar32 code_point = ReadUtf8Sequence(source, sequence_length);

     // Advances both spans once a character has been written.
     const auto commit = [&](size_t units_written) {
       source = source.subspan(sequence_length);
       target = target.subspan(units_written);
     };

     if (U_IS_BMP(code_point)) {
       if (target.empty()) {
         status = kTargetExhausted;
         break;
       }
       if (!U_IS_SURROGATE(code_point)) {
         target[0] = static_cast<UChar>(code_point);
       } else if (strict) {
         // Surrogate values are not valid scalar values.
         status = kSourceIllegal;
         break;
       } else {
         target[0] = blink::uchar::kReplacementCharacter;
       }
       commit(1u);
     } else if (U_IS_SUPPLEMENTARY(code_point)) {
       // Needs a surrogate pair in the output.
       if (target.size() < 2u) {
         status = kTargetExhausted;
         break;
       }
       target[0] = U16_LEAD(code_point);
       target[1] = U16_TRAIL(code_point);
       commit(2u);
     } else {
       // This should never happen; InlineUTF8SequenceLength() can never return
       // a value higher than 4, and a 4-byte UTF-8 sequence can never encode
       // anything higher than 0x10FFFF.
       NOTREACHED();
     }
   }

   return status;
 }

 }  // namespace

 // Public entry point for Latin-1 -> UTF-8 conversion. Delegates to
 // ConvertLatin1ToUtf8Internal(), which advances the local span copies, and
 // reports the written prefix of |target|, the number of source characters
 // consumed, and the conversion status.
 ConversionResult<uint8_t> ConvertLatin1ToUtf8(base::span<const LChar> source,
                                               base::span<uint8_t> target) {
   const size_t source_size_before = source.size();
   const base::span<uint8_t> whole_target = target;
   const ConversionStatus status = ConvertLatin1ToUtf8Internal(source, target);
   const size_t written = whole_target.size() - target.size();
   const size_t consumed = source_size_before - source.size();
   return {whole_target.first(written), consumed, status};
 }

 // Public entry point for UTF-16 -> UTF-8 conversion. Delegates to
 // ConvertUtf16ToUtf8Internal(), which advances the local span copies, and
 // reports the written prefix of |target|, the number of source code units
 // consumed, and the conversion status.
 ConversionResult<uint8_t> ConvertUtf16ToUtf8(base::span<const UChar> source,
                                              base::span<uint8_t> target,
                                              bool strict) {
   const size_t source_size_before = source.size();
   const base::span<uint8_t> whole_target = target;
   const ConversionStatus status =
       ConvertUtf16ToUtf8Internal(source, target, strict);
   const size_t written = whole_target.size() - target.size();
   const size_t consumed = source_size_before - source.size();
   return {whole_target.first(written), consumed, status};
 }

 // Public entry point for UTF-8 -> UTF-16 conversion. Delegates to
 // ConvertUtf8ToUtf16Internal(), which advances the local span copies, and
 // reports the written prefix of |target|, the number of source bytes
 // consumed, and the conversion status.
 ConversionResult<UChar> ConvertUtf8ToUtf16(base::span<const uint8_t> source,
                                            base::span<UChar> target,
                                            bool strict) {
   const size_t source_size_before = source.size();
   const base::span<UChar> whole_target = target;
   const ConversionStatus status =
       ConvertUtf8ToUtf16Internal(source, target, strict);
   const size_t written = whole_target.size() - target.size();
   const size_t consumed = source_size_before - source.size();
   return {whole_target.first(written), consumed, status};
 }

 // Computes how many UTF-16 code units the UTF-8 bytes in |data| decode to.
 //
 // |seen_non_ascii| is set when any non-ASCII byte is encountered, and
 // |seen_non_latin1| when any decoded character exceeds 0xFF; both are reset
 // to false on entry.
 //
 // Returns 0 if |data| is empty, ends in a truncated sequence, contains a
 // sequence rejected by IsLegalUtf8(), or decodes to a surrogate. The flags
 // may already have been set when 0 is returned for an error.
 unsigned CalculateStringLengthFromUtf8(base::span<const uint8_t> data,
                                        bool& seen_non_ascii,
                                        bool& seen_non_latin1) {
   seen_non_ascii = false;
   seen_non_latin1 = false;

   unsigned utf16_length = 0;
   size_t cursor = 0;
   const size_t end = data.size();

   while (cursor < end) {
     const uint8_t lead = data[cursor];
     if (IsASCII(lead)) {
       ++cursor;
       ++utf16_length;
       continue;
     }

     seen_non_ascii = true;
     const size_t sequence_length = InlineUtf8SequenceLengthNonAscii(lead);

     // Truncated sequence at the end of the input.
     if (end - cursor < sequence_length) {
       return 0;
     }

     const base::span<const uint8_t> sequence =
         data.subspan(cursor, sequence_length);
     if (!IsLegalUtf8(sequence)) {
       return 0;
     }

     const UChar32 character = ReadUtf8Sequence(sequence, sequence_length);
     DCHECK(!IsASCII(character));
     cursor += sequence_length;

     if (character > 0xff) {
       seen_non_latin1 = true;
     }

     if (U_IS_BMP(character)) {
       // Surrogate values are not valid scalar values.
       if (U_IS_SURROGATE(character)) {
         return 0;
       }
       ++utf16_length;
     } else if (U_IS_SUPPLEMENTARY(character)) {
       // Supplementary characters need a surrogate pair.
       utf16_length += 2;
     } else {
       return 0;
     }
   }

   return utf16_length;
 }

 }  // namespace blink::unicode
	/*
	* Copyright (C) 2007 Apple Inc. All rights reserved.
	* Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#include "third_party/blink/renderer/platform/wtf/text/utf8.h"

	#include <unicode/utf16.h>

	#include <array>

	#include "base/check.h"
	#include "base/notreached.h"
	#include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
	#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
	#include "third_party/blink/renderer/platform/wtf/text/string_hasher.h"

	namespace blink::unicode {

	namespace {

	// Maps a lead byte to the total length of the UTF-8 sequence it introduces.
	// Returns 2, 3 or 4 for lead bytes of the form 110xxxxx, 1110xxxx and
	// 11110xxx respectively, and 0 for every other byte (ASCII, continuation
	// bytes 10xxxxxx, and 11111xxx). Only the bit pattern is examined here.
	inline size_t InlineUtf8SequenceLengthNonAscii(uint8_t b0) {
	  // The top nibble alone selects the length class.
	  switch (b0 >> 4) {
	    case 0xC:
	    case 0xD:
	      return 2;
	    case 0xE:
	      return 3;
	    case 0xF:
	      // 11110xxx starts a 4-byte sequence; 11111xxx is never a lead byte.
	      return (b0 & 0x08) ? 0 : 4;
	    default:
	      return 0;
	  }
	}

	// Returns the byte length of the UTF-8 sequence that starts with |b0|:
	// 1 for ASCII, otherwise whatever InlineUtf8SequenceLengthNonAscii() reports
	// (0 when |b0| cannot start a sequence).
	inline size_t InlineUtf8SequenceLength(uint8_t b0) {
	  if (IsASCII(b0)) {
	    return 1;
	  }
	  return InlineUtf8SequenceLengthNonAscii(b0);
	}

	// Lead-byte marker bits, indexed by the total number of bytes in the encoded
	// UTF-8 sequence. Once a code point's bits have been split into bytes, the
	// encoders in this file OR kFirstByteMark[n] into the first byte of an
	// n-byte sequence.
	//
	// Index 0 is a placeholder and index 1 is 0x00 because a single-byte
	// sequence carries no marker. The table also has entries for 5- and 6-byte
	// sequences, but legal UTF-8 is 4 or fewer bytes total and the encoders
	// in this file only ever index 1 through 4.
	static constexpr std::array<uint8_t, 7> kFirstByteMark = {
	0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};

	// Encodes the Latin-1 characters in |source| as UTF-8 into |target|.
	//
	// Characters below 0x80 take one byte; all others take two. On return both
	// spans have been advanced past what was consumed and written. Returns
	// kTargetExhausted when |target| cannot hold the next character (that
	// character is left unconsumed), otherwise kConversionOK.
	ConversionStatus ConvertLatin1ToUtf8Internal(base::span<const LChar>& source,
	                                             base::span<uint8_t>& target) {
	  ConversionStatus status = kConversionOK;
	  size_t consumed = 0;
	  size_t written = 0;

	  for (; consumed < source.size(); ++consumed) {
	    const UChar32 code_point = static_cast<UChar32>(source[consumed]);
	    const size_t needed = (code_point < static_cast<UChar32>(0x80)) ? 1 : 2;

	    // Stop before a character that does not fit, without consuming it.
	    if (target.size() - written < needed) {
	      status = kTargetExhausted;
	      break;
	    }

	    if (needed == 1) {
	      target[written] = static_cast<uint8_t>(code_point | kFirstByteMark[1]);
	    } else {
	      // Lead byte carries the top bits, continuation byte the low six.
	      target[written] =
	          static_cast<uint8_t>((code_point >> 6) | kFirstByteMark[2]);
	      target[written + 1] = static_cast<uint8_t>((code_point | 0x80) & 0xBF);
	    }
	    written += needed;
	  }

	  source = source.subspan(consumed);
	  target = target.subspan(written);
	  return status;
	}

	// Encodes the UTF-16 code units in |source| as UTF-8 into |target|,
	// advancing both spans past what was consumed and written.
	//
	// Returns:
	//   kConversionOK     - all of |source| was converted.
	//   kSourceExhausted  - |source| ends with a high surrogate (strict or not).
	//   kSourceIllegal    - |strict| is set and an unpaired surrogate was found.
	//   kTargetExhausted  - |target| cannot hold the next character.
	// On any non-OK status |source| is left pointing at the offending code unit.
	// When |strict| is false, unpaired surrogates are encoded as ordinary
	// 3-byte sequences.
	ConversionStatus ConvertUtf16ToUtf8Internal(base::span<const UChar>& source,
	                                            base::span<uint8_t>& target,
	                                            bool strict) {
	  const auto is_high_surrogate = [](UChar32 c) {
	    return c >= 0xD800 && c <= 0xDBFF;
	  };
	  const auto is_low_surrogate = [](UChar32 c) {
	    return c >= 0xDC00 && c <= 0xDFFF;
	  };

	  ConversionStatus status = kConversionOK;
	  size_t read_pos = 0;
	  size_t write_pos = 0;

	  while (read_pos < source.size()) {
	    UChar32 code_point = static_cast<UChar32>(source[read_pos]);
	    size_t units_used = 1;

	    if (is_high_surrogate(code_point)) {
	      // A high surrogate needs the following code unit to be decoded.
	      if (read_pos + 1 >= source.size()) {
	        status = kSourceExhausted;
	        break;
	      }
	      const UChar32 next = static_cast<UChar32>(source[read_pos + 1]);
	      if (is_low_surrogate(next)) {
	        code_point =
	            ((code_point - 0xD800) << 10) + (next - 0xDC00) + 0x0010000;
	        units_used = 2;
	      } else if (strict) {
	        // Unpaired high surrogate.
	        status = kSourceIllegal;
	        break;
	      }
	    } else if (strict && is_low_surrogate(code_point)) {
	      // Unpaired low surrogate.
	      status = kSourceIllegal;
	      break;
	    }

	    // Work out how many bytes the encoded form needs.
	    size_t encoded_length = 0;
	    if (code_point < static_cast<UChar32>(0x80)) {
	      encoded_length = 1;
	    } else if (code_point < static_cast<UChar32>(0x800)) {
	      encoded_length = 2;
	    } else if (code_point < static_cast<UChar32>(0x10000)) {
	      encoded_length = 3;
	    } else if (code_point < static_cast<UChar32>(0x110000)) {
	      encoded_length = 4;
	    } else {
	      // Surrogate pairs cannot represent codepoints higher than 0x10FFFF, so
	      // this should not be reachable.
	      NOTREACHED();
	    }

	    // Stop before a character that does not fit, without consuming it.
	    if (target.size() - write_pos < encoded_length) {
	      status = kTargetExhausted;
	      break;
	    }

	    // Fill continuation bytes from last to first, six bits at a time; what
	    // remains goes into the lead byte together with its length marker.
	    for (size_t i = encoded_length; i > 1; --i) {
	      target[write_pos + i - 1] =
	          static_cast<uint8_t>((code_point | 0x80) & 0xBF);
	      code_point >>= 6;
	    }
	    target[write_pos] =
	        static_cast<uint8_t>(code_point | kFirstByteMark[encoded_length]);

	    write_pos += encoded_length;
	    read_pos += units_used;
	  }

	  source = source.subspan(read_pos);
	  target = target.subspan(write_pos);
	  return status;
	}

	// Reports whether |source| holds exactly one well-formed UTF-8 sequence.
	// The caller is expected to size |source| from the lead byte; sizes outside
	// 1..4 are rejected. Overlong forms, encoded surrogates (lead 0xED with a
	// second byte above 0x9F) and values past 0xF4 0x8F are all rejected.
	bool IsLegalUtf8(const base::span<const uint8_t> source) {
	  const size_t length = source.size();
	  if (length == 0 || length > 4) {
	    return false;
	  }

	  const auto is_continuation = [](uint8_t byte) {
	    return byte >= 0x80 && byte <= 0xBF;
	  };

	  // Third and fourth bytes are plain continuation bytes.
	  if (length == 4 && !is_continuation(source[3])) {
	    return false;
	  }
	  if (length >= 3 && !is_continuation(source[2])) {
	    return false;
	  }

	  const uint8_t lead = source[0];
	  if (length >= 2) {
	    // The allowed range of the second byte depends on the lead byte.
	    uint8_t lowest = 0x80;
	    uint8_t highest = 0xBF;
	    switch (lead) {
	      case 0xE0:
	        lowest = 0xA0;
	        break;
	      case 0xED:
	        highest = 0x9F;
	        break;
	      case 0xF0:
	        lowest = 0x90;
	        break;
	      case 0xF4:
	        highest = 0x8F;
	        break;
	      default:
	        break;
	    }
	    const uint8_t second = source[1];
	    if (second < lowest || second > highest) {
	      return false;
	    }
	  }

	  // 0x80..0xC1 can never be a lead byte.
	  if (lead >= 0x80 && lead < 0xC2) {
	    return false;
	  }
	  return lead <= 0xF4;
	}

	// Decodes the UTF-8 sequence of |length| bytes at the start of |source| into
	// a code point. |length| must be 1..4 (DCHECKed). No validation is done
	// here; the payload bits are simply masked out and combined.
	inline UChar32 ReadUtf8Sequence(base::span<const uint8_t> source,
	                                size_t length) {
	  DCHECK_LT(0u, length);
	  DCHECK_GT(5u, length);

	  switch (length) {
	    case 1:
	      return source[0];
	    case 2:
	      // 110xxxxx 10xxxxxx
	      return ((source[0] & 0x1F) << 6) | (source[1] & 0x3F);
	    case 3:
	      // 1110xxxx 10xxxxxx 10xxxxxx
	      return ((source[0] & 0x0F) << 12) | ((source[1] & 0x3F) << 6) |
	             (source[2] & 0x3F);
	    default:
	      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	      return ((source[0] & 0x07) << 18) | ((source[1] & 0x3F) << 12) |
	             ((source[2] & 0x3F) << 6) | (source[3] & 0x3F);
	  }
	}

	// Decodes UTF-8 |source| into UTF-16 |target|, advancing both spans past the
	// bytes consumed and code units produced.
	//
	// Returns:
	//   kConversionOK     - all of |source| was converted.
	//   kSourceExhausted  - |source| ends in the middle of a sequence.
	//   kSourceIllegal    - the next sequence fails IsLegalUtf8(), or decodes to
	//                       a surrogate while |strict| is set.
	//   kTargetExhausted  - |target| cannot hold the next character.
	// On any non-OK status the spans stop at the offending sequence. When
	// |strict| is false a decoded surrogate is written as kReplacementCharacter.
	ConversionStatus ConvertUtf8ToUtf16Internal(base::span<const uint8_t>& source,
	                                            base::span<UChar>& target,
	                                            bool strict) {
	  using Word = uintptr_t;
	  constexpr size_t kBytesPerWord = sizeof(Word);
	  // One high bit per byte: any set bit means a non-ASCII byte in the word.
	  constexpr Word kHighBits =
	      (kBytesPerWord == 8) ? 0x8080808080808080ULL : 0x80808080UL;
	  constexpr uintptr_t kMisalignmentBits = kBytesPerWord - 1;

	  ConversionStatus status = kConversionOK;

	  while (!source.empty()) {
	    // Fast path: a whole aligned word of ASCII is widened in one go.
	    const bool has_full_word =
	        source.size() >= kBytesPerWord && target.size() >= kBytesPerWord;
	    if (has_full_word &&
	        (reinterpret_cast<uintptr_t>(source.data()) & kMisalignmentBits) ==
	            0) {
	      // Load the word stored at the aligned address; the test below must
	      // look at the source bytes, not at the address itself.
	      const Word chunk = *reinterpret_cast<const Word*>(source.data());
	      if ((chunk & kHighBits) == 0) {
	        for (size_t i = 0; i < kBytesPerWord; ++i) {
	          target[i] = source[i];
	        }
	        source = source.subspan(kBytesPerWord);
	        target = target.subspan(kBytesPerWord);
	        continue;
	      }
	    }

	    // Scalar path: decode exactly one character.
	    const size_t sequence_length = InlineUtf8SequenceLength(source[0]);
	    if (sequence_length > source.size()) {
	      status = kSourceExhausted;
	      break;
	    }

	    // Well-formedness is checked in both lenient and strict mode.
	    if (!IsLegalUtf8(source.first(sequence_length))) {
	      status = kSourceIllegal;
	      break;
	    }

	    const UChar32 code_point = ReadUtf8Sequence(source, sequence_length);

	    // Advances both spans once a character has been written.
	    const auto commit = [&](size_t units_written) {
	      source = source.subspan(sequence_length);
	      target = target.subspan(units_written);
	    };

	    if (U_IS_BMP(code_point)) {
	      if (target.empty()) {
	        status = kTargetExhausted;
	        break;
	      }
	      if (!U_IS_SURROGATE(code_point)) {
	        target[0] = static_cast<UChar>(code_point);
	      } else if (strict) {
	        // Surrogate values are not valid scalar values.
	        status = kSourceIllegal;
	        break;
	      } else {
	        target[0] = blink::uchar::kReplacementCharacter;
	      }
	      commit(1u);
	    } else if (U_IS_SUPPLEMENTARY(code_point)) {
	      // Needs a surrogate pair in the output.
	      if (target.size() < 2u) {
	        status = kTargetExhausted;
	        break;
	      }
	      target[0] = U16_LEAD(code_point);
	      target[1] = U16_TRAIL(code_point);
	      commit(2u);
	    } else {
	      // This should never happen; InlineUTF8SequenceLength() can never return
	      // a value higher than 4, and a 4-byte UTF-8 sequence can never encode
	      // anything higher than 0x10FFFF.
	      NOTREACHED();
	    }
	  }

	  return status;
	}

	} // namespace

	// Public entry point for Latin-1 -> UTF-8 conversion. Delegates to
	// ConvertLatin1ToUtf8Internal(), which advances the local span copies, and
	// reports the written prefix of |target|, the number of source characters
	// consumed, and the conversion status.
	ConversionResult<uint8_t> ConvertLatin1ToUtf8(base::span<const LChar> source,
	                                              base::span<uint8_t> target) {
	  const size_t source_size_before = source.size();
	  const base::span<uint8_t> whole_target = target;
	  const ConversionStatus status = ConvertLatin1ToUtf8Internal(source, target);
	  const size_t written = whole_target.size() - target.size();
	  const size_t consumed = source_size_before - source.size();
	  return {whole_target.first(written), consumed, status};
	}

	// Public entry point for UTF-16 -> UTF-8 conversion. Delegates to
	// ConvertUtf16ToUtf8Internal(), which advances the local span copies, and
	// reports the written prefix of |target|, the number of source code units
	// consumed, and the conversion status.
	ConversionResult<uint8_t> ConvertUtf16ToUtf8(base::span<const UChar> source,
	                                             base::span<uint8_t> target,
	                                             bool strict) {
	  const size_t source_size_before = source.size();
	  const base::span<uint8_t> whole_target = target;
	  const ConversionStatus status =
	      ConvertUtf16ToUtf8Internal(source, target, strict);
	  const size_t written = whole_target.size() - target.size();
	  const size_t consumed = source_size_before - source.size();
	  return {whole_target.first(written), consumed, status};
	}

	// Public entry point for UTF-8 -> UTF-16 conversion. Delegates to
	// ConvertUtf8ToUtf16Internal(), which advances the local span copies, and
	// reports the written prefix of |target|, the number of source bytes
	// consumed, and the conversion status.
	ConversionResult<UChar> ConvertUtf8ToUtf16(base::span<const uint8_t> source,
	                                           base::span<UChar> target,
	                                           bool strict) {
	  const size_t source_size_before = source.size();
	  const base::span<UChar> whole_target = target;
	  const ConversionStatus status =
	      ConvertUtf8ToUtf16Internal(source, target, strict);
	  const size_t written = whole_target.size() - target.size();
	  const size_t consumed = source_size_before - source.size();
	  return {whole_target.first(written), consumed, status};
	}

	// Computes how many UTF-16 code units the UTF-8 bytes in |data| decode to.
	//
	// |seen_non_ascii| is set when any non-ASCII byte is encountered, and
	// |seen_non_latin1| when any decoded character exceeds 0xFF; both are reset
	// to false on entry.
	//
	// Returns 0 if |data| is empty, ends in a truncated sequence, contains a
	// sequence rejected by IsLegalUtf8(), or decodes to a surrogate. The flags
	// may already have been set when 0 is returned for an error.
	unsigned CalculateStringLengthFromUtf8(base::span<const uint8_t> data,
	                                       bool& seen_non_ascii,
	                                       bool& seen_non_latin1) {
	  seen_non_ascii = false;
	  seen_non_latin1 = false;

	  unsigned utf16_length = 0;
	  size_t cursor = 0;
	  const size_t end = data.size();

	  while (cursor < end) {
	    const uint8_t lead = data[cursor];
	    if (IsASCII(lead)) {
	      ++cursor;
	      ++utf16_length;
	      continue;
	    }

	    seen_non_ascii = true;
	    const size_t sequence_length = InlineUtf8SequenceLengthNonAscii(lead);

	    // Truncated sequence at the end of the input.
	    if (end - cursor < sequence_length) {
	      return 0;
	    }

	    const base::span<const uint8_t> sequence =
	        data.subspan(cursor, sequence_length);
	    if (!IsLegalUtf8(sequence)) {
	      return 0;
	    }

	    const UChar32 character = ReadUtf8Sequence(sequence, sequence_length);
	    DCHECK(!IsASCII(character));
	    cursor += sequence_length;

	    if (character > 0xff) {
	      seen_non_latin1 = true;
	    }

	    if (U_IS_BMP(character)) {
	      // Surrogate values are not valid scalar values.
	      if (U_IS_SURROGATE(character)) {
	        return 0;
	      }
	      ++utf16_length;
	    } else if (U_IS_SUPPLEMENTARY(character)) {
	      // Supplementary characters need a surrogate pair.
	      utf16_length += 2;
	    } else {
	      return 0;
	    }
	  }

	  return utf16_length;
	}

	} // namespace blink::unicode