// third_party/WebKit/Source/platform/wtf/text/TextCodecUTF8.cpp (chromium/src.git)

 /*
  * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "platform/wtf/text/TextCodecUTF8.h"

 #include "platform/wtf/PtrUtil.h"
 #include "platform/wtf/text/CString.h"
 #include "platform/wtf/text/CharacterNames.h"
 #include "platform/wtf/text/StringBuffer.h"
 #include "platform/wtf/text/TextCodecASCIIFastPath.h"
 #include <memory>

 namespace WTF {

 using namespace WTF::Unicode;

 // Negative sentinels returned in place of a code point when a UTF-8 sequence
 // is malformed. The magnitude of each sentinel is the number of input bytes
 // that were found to be invalid (1, 2 or 3).
 constexpr int kNonCharacter1 = -1;
 constexpr int kNonCharacter2 = -2;
 constexpr int kNonCharacter3 = -3;

 // Returns true when |character| is one of the kNonCharacter* error sentinels
 // rather than a decoded code point.
 bool IsNonCharacter(int character) {
   return !(character < kNonCharacter3 || character > kNonCharacter1);
 }

 // Factory passed to the codec registrar by RegisterCodecs(). Both arguments
 // are ignored; every call returns a newly allocated TextCodecUTF8.
 std::unique_ptr<TextCodec> TextCodecUTF8::Create(const TextEncoding&,
                                                  const void*) {
   std::unique_ptr<TextCodec> codec = WTF::WrapUnique(new TextCodecUTF8);
   return codec;
 }

 // Registers every label handled by this codec. Each label is passed to
 // |registrar| together with the name "UTF-8", in the order listed below.
 void TextCodecUTF8::RegisterEncodingNames(EncodingNameRegistrar registrar) {
   static const char* const kLabels[] = {
       // The canonical name itself.
       "UTF-8",

       // Aliases that were originally present in the encoding table in WebKit
       // on Macintosh and were later added by TextCodecICU. Some of them may
       // turn out to be unused on the web and could then be removed.
       "unicode11utf8",
       "unicode20utf8",
       "utf8",
       "x-unicode20utf8",

       // Alias present in the WHATWG Encoding Standard
       // (http://encoding.spec.whatwg.org/) and Firefox (24), but not in
       // ICU 4.6.
       "unicode-1-1-utf-8",
   };

   for (const char* label : kLabels)
     registrar(label, "UTF-8");
 }

 // Associates the name "UTF-8" with the Create() factory. The third argument
 // handed to |registrar| is null.
 void TextCodecUTF8::RegisterCodecs(TextCodecRegistrar registrar) {
   registrar("UTF-8", &TextCodecUTF8::Create, nullptr);
 }

 // Maps the first byte of a UTF-8 sequence to the number of bytes the decoder
 // should treat as one sequence. ASCII bytes and 0xF5..0xFF map to 0.
 // Bytes 0x80..0xC1 map to 2 even though they cannot start a valid sequence;
 // DecodeNonASCIISequence() rejects them when it inspects the lead byte.
 static inline int NonASCIISequenceLength(uint8_t first_byte) {
   static const uint8_t kLengths[256] = {
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00..0x0F
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10..0x1F
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20..0x2F
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30..0x3F
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40..0x4F
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50..0x5F
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60..0x6F
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70..0x7F
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0x80..0x8F
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0x90..0x9F
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xA0..0xAF
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xB0..0xBF
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0..0xCF
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0..0xDF
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xE0..0xEF
       4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0..0xFF
   };
   const int sequence_length = kLengths[first_byte];
   return sequence_length;
 }

 // Decodes one multi-byte UTF-8 sequence of |length| bytes (2, 3 or 4) that
 // starts at |sequence|. All |length| bytes must be readable.
 //
 // Returns the decoded code point, or kNonCharacter1/2/3 when the sequence is
 // malformed; the sentinel tells how many leading bytes form the invalid part.
 // The allowed range of the second byte depends on the lead byte, which is how
 // the narrower ranges for leads 0xE0, 0xED, 0xF0 and 0xF4 are enforced.
 static inline int DecodeNonASCIISequence(const uint8_t* sequence,
                                          unsigned length) {
   DCHECK(!IsASCII(sequence[0]));
   const uint8_t lead = sequence[0];
   // True when |byte| lies outside the inclusive range [low, high].
   const auto outside = [](int byte, int low, int high) {
     return byte < low || byte > high;
   };

   if (length == 2) {
     DCHECK_LE(lead, 0xDF);
     // Leads below 0xC2 are rejected before the second byte is examined.
     if (lead < 0xC2 || outside(sequence[1], 0x80, 0xBF))
       return kNonCharacter1;
     // The constant equals (0xC0 << 6) + 0x80, i.e. the marker bits of both
     // bytes.
     return ((lead << 6) + sequence[1]) - 0x00003080;
   }

   if (length == 3) {
     DCHECK_GE(lead, 0xE0);
     DCHECK_LE(lead, 0xEF);
     // 0xE0 requires A0..BF, 0xED requires 80..9F, every other lead accepts
     // 80..BF as the second byte.
     const int second_low = lead == 0xE0 ? 0xA0 : 0x80;
     const int second_high = lead == 0xED ? 0x9F : 0xBF;
     if (outside(sequence[1], second_low, second_high))
       return kNonCharacter1;
     if (outside(sequence[2], 0x80, 0xBF))
       return kNonCharacter2;
     // The constant equals (0xE0 << 12) + (0x80 << 6) + 0x80.
     return ((lead << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
   }

   DCHECK_EQ(length, 4u);
   DCHECK_GE(lead, 0xF0);
   DCHECK_LE(lead, 0xF4);
   // 0xF0 requires 90..BF, 0xF4 requires 80..8F, every other lead accepts
   // 80..BF as the second byte.
   const int second_low = lead == 0xF0 ? 0x90 : 0x80;
   const int second_high = lead == 0xF4 ? 0x8F : 0xBF;
   if (outside(sequence[1], second_low, second_high))
     return kNonCharacter1;
   if (outside(sequence[2], 0x80, 0xBF))
     return kNonCharacter2;
   if (outside(sequence[3], 0x80, 0xBF))
     return kNonCharacter3;
   // The constant equals (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80.
   return ((lead << 18) + (sequence[1] << 12) + (sequence[2] << 6) +
           sequence[3]) -
          0x03C82080;
 }

 // Writes |character| at |destination| as UTF-16: one code unit for a BMP
 // code point, otherwise a lead/trail surrogate pair. Returns the position
 // just past the last code unit written. |character| must be neither an error
 // sentinel nor a surrogate.
 static inline UChar* AppendCharacter(UChar* destination, int character) {
   DCHECK(!IsNonCharacter(character));
   DCHECK(!U_IS_SURROGATE(character));
   if (!U_IS_BMP(character)) {
     destination[0] = U16_LEAD(character);
     destination[1] = U16_TRAIL(character);
     return destination + 2;
   }
   destination[0] = static_cast<UChar>(character);
   return destination + 1;
 }

 // Drops the first |num_bytes| bytes from |partial_sequence_| and slides the
 // remaining bytes to the front of the buffer.
 void TextCodecUTF8::ConsumePartialSequenceBytes(int num_bytes) {
   DCHECK_GE(partial_sequence_size_, num_bytes);
   const auto* remaining_bytes = partial_sequence_ + num_bytes;
   partial_sequence_size_ -= num_bytes;
   // The ranges overlap, hence memmove rather than memcpy.
   memmove(partial_sequence_, remaining_bytes, partial_sequence_size_);
 }

 // Records a decoding error found in |partial_sequence_|.
 //
 // |saw_error| is always set. When |stop_on_error| is false, one
 // kReplacementCharacter is written at |destination| (which is advanced) and
 // the invalid bytes -- as many as the magnitude of the |character| sentinel,
 // 1 to 3 -- are removed from the partial-sequence buffer. When
 // |stop_on_error| is true nothing else happens.
 void TextCodecUTF8::HandleError(int character,
                                 UChar*& destination,
                                 bool stop_on_error,
                                 bool& saw_error) {
   saw_error = true;
   if (!stop_on_error) {
     *destination = kReplacementCharacter;
     ++destination;
     DCHECK(IsNonCharacter(character));
     const int invalid_byte_count = -character;
     DCHECK_GE(invalid_byte_count, 1);
     DCHECK_LE(invalid_byte_count, 3);
     ConsumePartialSequenceBytes(invalid_byte_count);
   }
 }

 // 8-bit specialization: drains the bytes buffered in |partial_sequence_| into
 // an LChar destination, pulling additional bytes from [source, end) when a
 // buffered sequence has to be completed.
 //
 // Returns true when the 8-bit path cannot go on -- the buffered lead byte has
 // no sequence length, the sequence is still incomplete while |flush| is set,
 // or the decoded value does not fit in 8 bits (this also covers the negative
 // error sentinels). The offending bytes are then left in |partial_sequence_|
 // for the caller's 16-bit path. Returns false otherwise, including the case
 // where the remaining input was appended to the buffer because it did not
 // complete the sequence.
 // The two unnamed parameters (a bool and a bool reference) are not used by
 // this specialization.
 template <>
 bool TextCodecUTF8::HandlePartialSequence<LChar>(LChar*& destination,
                                                  const uint8_t*& source,
                                                  const uint8_t* end,
                                                  bool flush,
                                                  bool,
                                                  bool&) {
   DCHECK(partial_sequence_size_);
   do {
     if (IsASCII(partial_sequence_[0])) {
       // A buffered ASCII byte is emitted as-is. Note that `continue` in a
       // do/while jumps to the loop condition, so the loop ends once the
       // buffer is empty.
       *destination++ = partial_sequence_[0];
       ConsumePartialSequenceBytes(1);
       continue;
     }
     int count = NonASCIISequenceLength(partial_sequence_[0]);
     // A zero length means the byte cannot start a sequence; leave the error
     // to the 16-bit path.
     if (!count)
       return true;

     if (count > partial_sequence_size_) {
       // More bytes are needed than are buffered; check whether the new input
       // can supply them.
       if (count - partial_sequence_size_ > end - source) {
         if (!flush) {
           // The new data is not enough to complete the sequence, so
           // add it to the existing partial sequence.
           // |source| is deliberately not advanced here.
           memcpy(partial_sequence_ + partial_sequence_size_, source,
                  end - source);
           partial_sequence_size_ += end - source;
           return false;
         }
         // An incomplete partial sequence at the end is an error, but it will
         // create a 16 bit string due to the replacementCharacter. Let the 16
         // bit path handle the error.
         return true;
       }
       // Complete the buffered sequence with bytes taken from the input.
       memcpy(partial_sequence_ + partial_sequence_size_, source,
              count - partial_sequence_size_);
       source += count - partial_sequence_size_;
       partial_sequence_size_ = count;
     }
     int character = DecodeNonASCIISequence(partial_sequence_, count);
     // Any bit set above the low 8 means either a code point above 0xFF or a
     // negative error sentinel; neither can be stored in an LChar.
     if (character & ~0xff)
       return true;

     partial_sequence_size_ -= count;
     *destination++ = static_cast<LChar>(character);
   } while (partial_sequence_size_);

   return false;
 }

 // 16-bit specialization: drains the bytes buffered in |partial_sequence_|
 // into a UChar destination, pulling additional bytes from [source, end) when
 // a buffered sequence has to be completed.
 //
 // Errors go through HandleError(), which sets |saw_error| and, unless
 // |stop_on_error| is set, writes kReplacementCharacter and removes the
 // invalid bytes from the buffer. With |stop_on_error| set the function
 // returns right after the first error, leaving the buffer untouched.
 // Always returns false.
 template <>
 bool TextCodecUTF8::HandlePartialSequence<UChar>(UChar*& destination,
                                                  const uint8_t*& source,
                                                  const uint8_t* end,
                                                  bool flush,
                                                  bool stop_on_error,
                                                  bool& saw_error) {
   DCHECK(partial_sequence_size_);
   do {
     if (IsASCII(partial_sequence_[0])) {
       // A buffered ASCII byte is emitted as-is. `continue` in a do/while
       // jumps to the loop condition below.
       *destination++ = partial_sequence_[0];
       ConsumePartialSequenceBytes(1);
       continue;
     }
     int count = NonASCIISequenceLength(partial_sequence_[0]);
     if (!count) {
       // The byte cannot start a sequence: a one-byte error.
       HandleError(kNonCharacter1, destination, stop_on_error, saw_error);
       if (stop_on_error)
         return false;
       continue;
     }
     if (count > partial_sequence_size_) {
       // More bytes are needed than are buffered; check whether the new input
       // can supply them.
       if (count - partial_sequence_size_ > end - source) {
         if (!flush) {
           // The new data is not enough to complete the sequence, so
           // add it to the existing partial sequence.
           // |source| is deliberately not advanced here.
           memcpy(partial_sequence_ + partial_sequence_size_, source,
                  end - source);
           partial_sequence_size_ += end - source;
           return false;
         }
         // An incomplete partial sequence at the end is an error.
         // Only the first buffered byte is consumed; the rest is re-examined
         // on the next iteration.
         HandleError(kNonCharacter1, destination, stop_on_error, saw_error);
         if (stop_on_error)
           return false;
         continue;
       }
       // Complete the buffered sequence with bytes taken from the input.
       memcpy(partial_sequence_ + partial_sequence_size_, source,
              count - partial_sequence_size_);
       source += count - partial_sequence_size_;
       partial_sequence_size_ = count;
     }
     int character = DecodeNonASCIISequence(partial_sequence_, count);
     if (IsNonCharacter(character)) {
       // The sentinel tells HandleError() how many bytes to discard.
       HandleError(character, destination, stop_on_error, saw_error);
       if (stop_on_error)
         return false;
       continue;
     }

     partial_sequence_size_ -= count;
     destination = AppendCharacter(destination, character);
   } while (partial_sequence_size_);

   return false;
 }

 // Decodes |length| bytes of UTF-8 starting at |bytes| and returns the result
 // as a String.
 //
 // Decoding starts into an 8-bit (LChar) buffer. As soon as a code point above
 // 0xFF or an error that needs a replacement character is met, the characters
 // produced so far are copied into a 16-bit (UChar) buffer and decoding
 // continues there (label upConvertTo16Bit).
 // A sequence cut short by the end of the input is stored in
 // |partial_sequence_| and picked up again through HandlePartialSequence();
 // when |flush| is set the outer loops keep running until that buffer has
 // been dealt with.
 // |saw_error| is set to true when malformed input is found; it is never
 // reset to false here. With |stop_on_error| set, decoding stops at the first
 // error and no replacement character is written for it.
 String TextCodecUTF8::Decode(const char* bytes,
                              size_t length,
                              FlushBehavior flush,
                              bool stop_on_error,
                              bool& saw_error) {
   // Each input byte might turn into a character.
   // That includes all bytes in the partial-sequence buffer because
   // each byte in an invalid sequence will turn into a replacement character.
   StringBuffer<LChar> buffer(partial_sequence_size_ + length);

   const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
   const uint8_t* end = source + length;
   const uint8_t* aligned_end = AlignToMachineWord(end);
   LChar* destination = buffer.Characters();

   // ---- 8-bit phase ----
   do {
     if (partial_sequence_size_) {
       // Explicitly copy destination and source pointers to avoid taking
       // pointers to the local variables, which may harm code generation by
       // disabling some optimizations in some compilers.
       LChar* destination_for_handle_partial_sequence = destination;
       const uint8_t* source_for_handle_partial_sequence = source;
       if (HandlePartialSequence(destination_for_handle_partial_sequence,
                                 source_for_handle_partial_sequence, end, flush,
                                 stop_on_error, saw_error)) {
         // The buffered bytes need the 16-bit path. |destination| is not
         // updated on this branch.
         source = source_for_handle_partial_sequence;
         goto upConvertTo16Bit;
       }
       destination = destination_for_handle_partial_sequence;
       source = source_for_handle_partial_sequence;
       // Bytes still buffered: leave the outer loop.
       if (partial_sequence_size_)
         break;
     }

     while (source < end) {
       if (IsASCII(*source)) {
         // Fast path for ASCII. Most UTF-8 text will be ASCII.
         if (IsAlignedToMachineWord(source)) {
           // Copy one machine word at a time while every byte in it is ASCII.
           while (source < aligned_end) {
             MachineWord chunk =
                 *reinterpret_cast_ptr<const MachineWord*>(source);
             if (!IsAllASCII<LChar>(chunk))
               break;
             CopyASCIIMachineWord(destination, source);
             source += sizeof(MachineWord);
             destination += sizeof(MachineWord);
           }
           if (source == end)
             break;
           if (!IsASCII(*source))
             continue;
         }
         *destination++ = *source++;
         continue;
       }
       int count = NonASCIISequenceLength(*source);
       int character;
       if (count == 0) {
         // The byte cannot start a sequence: a one-byte error.
         character = kNonCharacter1;
       } else {
         if (count > end - source) {
           // The sequence runs past the end of the input: stash the bytes
           // seen so far in |partial_sequence_|.
           SECURITY_DCHECK(end - source <
                           static_cast<ptrdiff_t>(sizeof(partial_sequence_)));
           DCHECK(!partial_sequence_size_);
           partial_sequence_size_ = end - source;
           memcpy(partial_sequence_, source, partial_sequence_size_);
           source = end;
           break;
         }
         character = DecodeNonASCIISequence(source, count);
       }
       if (IsNonCharacter(character)) {
         saw_error = true;
         if (stop_on_error)
           break;

         // |source| is left at the bad sequence so that the 16-bit loop
         // decodes it again and writes the replacement character.
         goto upConvertTo16Bit;
       }
       // Does not fit in an LChar; |source| is left at this sequence so the
       // 16-bit loop decodes it again.
       if (character > 0xff)
         goto upConvertTo16Bit;

       source += count;
       *destination++ = static_cast<LChar>(character);
     }
   } while (flush && partial_sequence_size_);

   buffer.Shrink(destination - buffer.Characters());

   return String::Adopt(buffer);

   // ---- 16-bit phase ----
 upConvertTo16Bit:
   StringBuffer<UChar> buffer16(partial_sequence_size_ + length);

   UChar* destination16 = buffer16.Characters();

   // Copy the already converted characters
   for (LChar* converted8 = buffer.Characters(); converted8 < destination;)
     *destination16++ = *converted8++;

   do {
     if (partial_sequence_size_) {
       // Explicitly copy destination and source pointers to avoid taking
       // pointers to the local variables, which may harm code generation by
       // disabling some optimizations in some compilers.
       UChar* destination_for_handle_partial_sequence = destination16;
       const uint8_t* source_for_handle_partial_sequence = source;
       HandlePartialSequence(destination_for_handle_partial_sequence,
                             source_for_handle_partial_sequence, end, flush,
                             stop_on_error, saw_error);
       destination16 = destination_for_handle_partial_sequence;
       source = source_for_handle_partial_sequence;
       // Bytes still buffered: leave the outer loop.
       if (partial_sequence_size_)
         break;
     }

     while (source < end) {
       if (IsASCII(*source)) {
         // Fast path for ASCII. Most UTF-8 text will be ASCII.
         if (IsAlignedToMachineWord(source)) {
           // Copy one machine word at a time while every byte in it is ASCII.
           while (source < aligned_end) {
             MachineWord chunk =
                 *reinterpret_cast_ptr<const MachineWord*>(source);
             if (!IsAllASCII<LChar>(chunk))
               break;
             CopyASCIIMachineWord(destination16, source);
             source += sizeof(MachineWord);
             destination16 += sizeof(MachineWord);
           }
           if (source == end)
             break;
           if (!IsASCII(*source))
             continue;
         }
         *destination16++ = *source++;
         continue;
       }
       int count = NonASCIISequenceLength(*source);
       int character;
       if (count == 0) {
         // The byte cannot start a sequence: a one-byte error.
         character = kNonCharacter1;
       } else {
         if (count > end - source) {
           // The sequence runs past the end of the input: stash the bytes
           // seen so far in |partial_sequence_|.
           SECURITY_DCHECK(end - source <
                           static_cast<ptrdiff_t>(sizeof(partial_sequence_)));
           DCHECK(!partial_sequence_size_);
           partial_sequence_size_ = end - source;
           memcpy(partial_sequence_, source, partial_sequence_size_);
           source = end;
           break;
         }
         character = DecodeNonASCIISequence(source, count);
       }
       if (IsNonCharacter(character)) {
         saw_error = true;
         if (stop_on_error)
           break;
         // Each error generates one replacement character and consumes the
         // 'largest subpart' of the incomplete character.
         // The kNonCharacterN sentinels range from -1 to -3 and hold the
         // negated number of bytes making up the broken encoding, so
         // subtracting |character| from |source| advances it by that many
         // bytes.
         *destination16++ = kReplacementCharacter;
         source -= character;
         continue;
       }
       source += count;
       destination16 = AppendCharacter(destination16, character);
     }
   } while (flush && partial_sequence_size_);

   buffer16.Shrink(destination16 - buffer16.Characters());

   return String::Adopt(buffer16);
 }

 // Encodes |length| code units at |characters| (LChar or UChar) as UTF-8 and
 // returns the bytes as a CString. Surrogate code points produced by
 // U16_NEXT, i.e. unpaired surrogates in the input, are encoded as
 // kReplacementCharacter.
 template <typename CharType>
 CString TextCodecUTF8::EncodeCommon(const CharType* characters, size_t length) {
   // Worst-case output size is 3 bytes per input code unit: a BMP character
   // is one UTF-16 code unit and needs at most 3 bytes (3x), while a non-BMP
   // character is two code units and needs 4 bytes (2x).
   CHECK_LE(length, std::numeric_limits<size_t>::max() / 3);
   Vector<uint8_t> bytes(length * 3);

   size_t output_length = 0;
   for (size_t index = 0; index < length;) {
     // U16_NEXT advances |index| past the code unit(s) it reads.
     UChar32 code_point;
     U16_NEXT(characters, index, length, code_point);
     // For an unmatched surrogate U16_NEXT simply yields the surrogate code
     // point; replace it with U+FFFD (REPLACEMENT CHARACTER).
     if (U_IS_SURROGATE(code_point))
       code_point = kReplacementCharacter;
     U8_APPEND_UNSAFE(bytes.data(), output_length, code_point);
   }

   return CString(reinterpret_cast<char*>(bytes.data()), output_length);
 }

 // Encodes 16-bit input as UTF-8. The UnencodableHandling argument is ignored.
 CString TextCodecUTF8::Encode(const UChar* characters,
                               size_t length,
                               UnencodableHandling) {
   return EncodeCommon<UChar>(characters, length);
 }

 // Encodes 8-bit input as UTF-8. The UnencodableHandling argument is ignored.
 CString TextCodecUTF8::Encode(const LChar* characters,
                               size_t length,
                               UnencodableHandling) {
   return EncodeCommon<LChar>(characters, length);
 }

 }  // namespace WTF
	/*
	* Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#include "platform/wtf/text/TextCodecUTF8.h"

	#include "platform/wtf/PtrUtil.h"
	#include "platform/wtf/text/CString.h"
	#include "platform/wtf/text/CharacterNames.h"
	#include "platform/wtf/text/StringBuffer.h"
	#include "platform/wtf/text/TextCodecASCIIFastPath.h"
	#include <memory>

	namespace WTF {

	using namespace WTF::Unicode;

	// Negative sentinels returned in place of a code point when a UTF-8 sequence
	// is malformed. The magnitude of each sentinel is the number of input bytes
	// that were found to be invalid (1, 2 or 3).
	// NOTE(review): these names are also defined earlier in this file; confirm
	// whether this second set of definitions is intended.
	constexpr int kNonCharacter1 = -1;
	constexpr int kNonCharacter2 = -2;
	constexpr int kNonCharacter3 = -3;

	// Returns true when |character| is one of the kNonCharacter* error sentinels
	// rather than a decoded code point.
	bool IsNonCharacter(int character) {
	  return !(character < kNonCharacter3 || character > kNonCharacter1);
	}

	// Factory passed to the codec registrar by RegisterCodecs(). Both arguments
	// are ignored; every call returns a newly allocated TextCodecUTF8.
	std::unique_ptr<TextCodec> TextCodecUTF8::Create(const TextEncoding&,
	                                                 const void*) {
	  std::unique_ptr<TextCodec> codec = WTF::WrapUnique(new TextCodecUTF8);
	  return codec;
	}

	// Registers every label handled by this codec. Each label is passed to
	// |registrar| together with the name "UTF-8", in the order listed below.
	void TextCodecUTF8::RegisterEncodingNames(EncodingNameRegistrar registrar) {
	  static const char* const kLabels[] = {
	      // The canonical name itself.
	      "UTF-8",

	      // Aliases that were originally present in the encoding table in WebKit
	      // on Macintosh and were later added by TextCodecICU. Some of them may
	      // turn out to be unused on the web and could then be removed.
	      "unicode11utf8",
	      "unicode20utf8",
	      "utf8",
	      "x-unicode20utf8",

	      // Alias present in the WHATWG Encoding Standard
	      // (http://encoding.spec.whatwg.org/) and Firefox (24), but not in
	      // ICU 4.6.
	      "unicode-1-1-utf-8",
	  };

	  for (const char* label : kLabels)
	    registrar(label, "UTF-8");
	}

	// Associates the name "UTF-8" with the Create() factory. The third argument
	// handed to |registrar| is null.
	void TextCodecUTF8::RegisterCodecs(TextCodecRegistrar registrar) {
	  registrar("UTF-8", &TextCodecUTF8::Create, nullptr);
	}

	// Maps the first byte of a UTF-8 sequence to the number of bytes the decoder
	// should treat as one sequence. ASCII bytes and 0xF5..0xFF map to 0.
	// Bytes 0x80..0xC1 map to 2 even though they cannot start a valid sequence;
	// DecodeNonASCIISequence() rejects them when it inspects the lead byte.
	static inline int NonASCIISequenceLength(uint8_t first_byte) {
	  static const uint8_t kLengths[256] = {
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00..0x0F
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10..0x1F
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20..0x2F
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30..0x3F
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40..0x4F
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50..0x5F
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60..0x6F
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70..0x7F
	      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0x80..0x8F
	      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0x90..0x9F
	      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xA0..0xAF
	      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xB0..0xBF
	      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0..0xCF
	      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0..0xDF
	      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xE0..0xEF
	      4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0..0xFF
	  };
	  const int sequence_length = kLengths[first_byte];
	  return sequence_length;
	}

	// Decodes one multi-byte UTF-8 sequence of |length| bytes (2, 3 or 4) that
	// starts at |sequence|. All |length| bytes must be readable.
	//
	// Returns the decoded code point, or kNonCharacter1/2/3 when the sequence is
	// malformed; the sentinel tells how many leading bytes form the invalid part.
	// The allowed range of the second byte depends on the lead byte, which is how
	// the narrower ranges for leads 0xE0, 0xED, 0xF0 and 0xF4 are enforced.
	static inline int DecodeNonASCIISequence(const uint8_t* sequence,
	                                         unsigned length) {
	  DCHECK(!IsASCII(sequence[0]));
	  const uint8_t lead = sequence[0];
	  // True when |byte| lies outside the inclusive range [low, high].
	  const auto outside = [](int byte, int low, int high) {
	    return byte < low || byte > high;
	  };

	  if (length == 2) {
	    DCHECK_LE(lead, 0xDF);
	    // Leads below 0xC2 are rejected before the second byte is examined.
	    if (lead < 0xC2 || outside(sequence[1], 0x80, 0xBF))
	      return kNonCharacter1;
	    // The constant equals (0xC0 << 6) + 0x80, i.e. the marker bits of both
	    // bytes.
	    return ((lead << 6) + sequence[1]) - 0x00003080;
	  }

	  if (length == 3) {
	    DCHECK_GE(lead, 0xE0);
	    DCHECK_LE(lead, 0xEF);
	    // 0xE0 requires A0..BF, 0xED requires 80..9F, every other lead accepts
	    // 80..BF as the second byte.
	    const int second_low = lead == 0xE0 ? 0xA0 : 0x80;
	    const int second_high = lead == 0xED ? 0x9F : 0xBF;
	    if (outside(sequence[1], second_low, second_high))
	      return kNonCharacter1;
	    if (outside(sequence[2], 0x80, 0xBF))
	      return kNonCharacter2;
	    // The constant equals (0xE0 << 12) + (0x80 << 6) + 0x80.
	    return ((lead << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
	  }

	  DCHECK_EQ(length, 4u);
	  DCHECK_GE(lead, 0xF0);
	  DCHECK_LE(lead, 0xF4);
	  // 0xF0 requires 90..BF, 0xF4 requires 80..8F, every other lead accepts
	  // 80..BF as the second byte.
	  const int second_low = lead == 0xF0 ? 0x90 : 0x80;
	  const int second_high = lead == 0xF4 ? 0x8F : 0xBF;
	  if (outside(sequence[1], second_low, second_high))
	    return kNonCharacter1;
	  if (outside(sequence[2], 0x80, 0xBF))
	    return kNonCharacter2;
	  if (outside(sequence[3], 0x80, 0xBF))
	    return kNonCharacter3;
	  // The constant equals (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80.
	  return ((lead << 18) + (sequence[1] << 12) + (sequence[2] << 6) +
	          sequence[3]) -
	         0x03C82080;
	}

	// Writes |character| at |destination| as UTF-16: one code unit for a BMP
	// code point, otherwise a lead/trail surrogate pair. Returns the position
	// just past the last code unit written. |character| must be neither an error
	// sentinel nor a surrogate.
	static inline UChar* AppendCharacter(UChar* destination, int character) {
	  DCHECK(!IsNonCharacter(character));
	  DCHECK(!U_IS_SURROGATE(character));
	  if (!U_IS_BMP(character)) {
	    destination[0] = U16_LEAD(character);
	    destination[1] = U16_TRAIL(character);
	    return destination + 2;
	  }
	  destination[0] = static_cast<UChar>(character);
	  return destination + 1;
	}

	// Drops the first |num_bytes| bytes from |partial_sequence_| and slides the
	// remaining bytes to the front of the buffer.
	void TextCodecUTF8::ConsumePartialSequenceBytes(int num_bytes) {
	  DCHECK_GE(partial_sequence_size_, num_bytes);
	  const auto* remaining_bytes = partial_sequence_ + num_bytes;
	  partial_sequence_size_ -= num_bytes;
	  // The ranges overlap, hence memmove rather than memcpy.
	  memmove(partial_sequence_, remaining_bytes, partial_sequence_size_);
	}

	// Records a decoding error found in |partial_sequence_|.
	//
	// |saw_error| is always set. When |stop_on_error| is false, one
	// kReplacementCharacter is written at |destination| (which is advanced) and
	// the invalid bytes -- as many as the magnitude of the |character| sentinel,
	// 1 to 3 -- are removed from the partial-sequence buffer. When
	// |stop_on_error| is true nothing else happens.
	void TextCodecUTF8::HandleError(int character,
	                                UChar*& destination,
	                                bool stop_on_error,
	                                bool& saw_error) {
	  saw_error = true;
	  if (!stop_on_error) {
	    *destination = kReplacementCharacter;
	    ++destination;
	    DCHECK(IsNonCharacter(character));
	    const int invalid_byte_count = -character;
	    DCHECK_GE(invalid_byte_count, 1);
	    DCHECK_LE(invalid_byte_count, 3);
	    ConsumePartialSequenceBytes(invalid_byte_count);
	  }
	}

	// 8-bit specialization: drains the bytes buffered in |partial_sequence_| into
	// an LChar destination, pulling additional bytes from [source, end) when a
	// buffered sequence has to be completed.
	//
	// Returns true when the 8-bit path cannot go on -- the buffered lead byte has
	// no sequence length, the sequence is still incomplete while |flush| is set,
	// or the decoded value does not fit in 8 bits (this also covers the negative
	// error sentinels). The offending bytes are then left in |partial_sequence_|
	// for the caller's 16-bit path. Returns false otherwise, including the case
	// where the remaining input was appended to the buffer because it did not
	// complete the sequence.
	// The two unnamed parameters (a bool and a bool reference) are not used by
	// this specialization.
	template <>
	bool TextCodecUTF8::HandlePartialSequence<LChar>(LChar*& destination,
	const uint8_t*& source,
	const uint8_t* end,
	bool flush,
	bool,
	bool&) {
	DCHECK(partial_sequence_size_);
	do {
	if (IsASCII(partial_sequence_[0])) {
	// A buffered ASCII byte is emitted as-is. Note that `continue` in a
	// do/while jumps to the loop condition, so the loop ends once the
	// buffer is empty.
	*destination++ = partial_sequence_[0];
	ConsumePartialSequenceBytes(1);
	continue;
	}
	int count = NonASCIISequenceLength(partial_sequence_[0]);
	// A zero length means the byte cannot start a sequence; leave the error
	// to the 16-bit path.
	if (!count)
	return true;

	if (count > partial_sequence_size_) {
	// More bytes are needed than are buffered; check whether the new input
	// can supply them.
	if (count - partial_sequence_size_ > end - source) {
	if (!flush) {
	// The new data is not enough to complete the sequence, so
	// add it to the existing partial sequence.
	// |source| is deliberately not advanced here.
	memcpy(partial_sequence_ + partial_sequence_size_, source,
	end - source);
	partial_sequence_size_ += end - source;
	return false;
	}
	// An incomplete partial sequence at the end is an error, but it will
	// create a 16 bit string due to the replacementCharacter. Let the 16
	// bit path handle the error.
	return true;
	}
	// Complete the buffered sequence with bytes taken from the input.
	memcpy(partial_sequence_ + partial_sequence_size_, source,
	count - partial_sequence_size_);
	source += count - partial_sequence_size_;
	partial_sequence_size_ = count;
	}
	int character = DecodeNonASCIISequence(partial_sequence_, count);
	// Any bit set above the low 8 means either a code point above 0xFF or a
	// negative error sentinel; neither can be stored in an LChar.
	if (character & ~0xff)
	return true;

	partial_sequence_size_ -= count;
	*destination++ = static_cast<LChar>(character);
	} while (partial_sequence_size_);

	return false;
	}

	// 16-bit specialization: drains the bytes buffered in |partial_sequence_|
	// into a UChar destination, pulling additional bytes from [source, end) when
	// a buffered sequence has to be completed.
	//
	// Errors go through HandleError(), which sets |saw_error| and, unless
	// |stop_on_error| is set, writes kReplacementCharacter and removes the
	// invalid bytes from the buffer. With |stop_on_error| set the function
	// returns right after the first error, leaving the buffer untouched.
	// Always returns false.
	template <>
	bool TextCodecUTF8::HandlePartialSequence<UChar>(UChar*& destination,
	const uint8_t*& source,
	const uint8_t* end,
	bool flush,
	bool stop_on_error,
	bool& saw_error) {
	DCHECK(partial_sequence_size_);
	do {
	if (IsASCII(partial_sequence_[0])) {
	// A buffered ASCII byte is emitted as-is. `continue` in a do/while
	// jumps to the loop condition below.
	*destination++ = partial_sequence_[0];
	ConsumePartialSequenceBytes(1);
	continue;
	}
	int count = NonASCIISequenceLength(partial_sequence_[0]);
	if (!count) {
	// The byte cannot start a sequence: a one-byte error.
	HandleError(kNonCharacter1, destination, stop_on_error, saw_error);
	if (stop_on_error)
	return false;
	continue;
	}
	if (count > partial_sequence_size_) {
	// More bytes are needed than are buffered; check whether the new input
	// can supply them.
	if (count - partial_sequence_size_ > end - source) {
	if (!flush) {
	// The new data is not enough to complete the sequence, so
	// add it to the existing partial sequence.
	// |source| is deliberately not advanced here.
	memcpy(partial_sequence_ + partial_sequence_size_, source,
	end - source);
	partial_sequence_size_ += end - source;
	return false;
	}
	// An incomplete partial sequence at the end is an error.
	// Only the first buffered byte is consumed; the rest is re-examined
	// on the next iteration.
	HandleError(kNonCharacter1, destination, stop_on_error, saw_error);
	if (stop_on_error)
	return false;
	continue;
	}
	// Complete the buffered sequence with bytes taken from the input.
	memcpy(partial_sequence_ + partial_sequence_size_, source,
	count - partial_sequence_size_);
	source += count - partial_sequence_size_;
	partial_sequence_size_ = count;
	}
	int character = DecodeNonASCIISequence(partial_sequence_, count);
	if (IsNonCharacter(character)) {
	// The sentinel tells HandleError() how many bytes to discard.
	HandleError(character, destination, stop_on_error, saw_error);
	if (stop_on_error)
	return false;
	continue;
	}

	partial_sequence_size_ -= count;
	destination = AppendCharacter(destination, character);
	} while (partial_sequence_size_);

	return false;
	}

	// Decodes |length| bytes of UTF-8 starting at |bytes| and returns the result
	// as a String.
	//
	// Decoding starts into an 8-bit (LChar) buffer. As soon as a code point above
	// 0xFF or an error that needs a replacement character is met, the characters
	// produced so far are copied into a 16-bit (UChar) buffer and decoding
	// continues there (label upConvertTo16Bit).
	// A sequence cut short by the end of the input is stored in
	// |partial_sequence_| and picked up again through HandlePartialSequence();
	// when |flush| is set the outer loops keep running until that buffer has
	// been dealt with.
	// |saw_error| is set to true when malformed input is found; it is never
	// reset to false here. With |stop_on_error| set, decoding stops at the first
	// error and no replacement character is written for it.
	String TextCodecUTF8::Decode(const char* bytes,
	                             size_t length,
	                             FlushBehavior flush,
	                             bool stop_on_error,
	                             bool& saw_error) {
	  // Each input byte might turn into a character.
	  // That includes all bytes in the partial-sequence buffer because
	  // each byte in an invalid sequence will turn into a replacement character.
	  StringBuffer<LChar> buffer(partial_sequence_size_ + length);

	  const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
	  const uint8_t* end = source + length;
	  const uint8_t* aligned_end = AlignToMachineWord(end);
	  LChar* destination = buffer.Characters();

	  // ---- 8-bit phase ----
	  do {
	    if (partial_sequence_size_) {
	      // Explicitly copy destination and source pointers to avoid taking
	      // pointers to the local variables, which may harm code generation by
	      // disabling some optimizations in some compilers.
	      LChar* destination_for_handle_partial_sequence = destination;
	      const uint8_t* source_for_handle_partial_sequence = source;
	      if (HandlePartialSequence(destination_for_handle_partial_sequence,
	                                source_for_handle_partial_sequence, end, flush,
	                                stop_on_error, saw_error)) {
	        // The buffered bytes need the 16-bit path. |destination| is not
	        // updated on this branch.
	        source = source_for_handle_partial_sequence;
	        goto upConvertTo16Bit;
	      }
	      destination = destination_for_handle_partial_sequence;
	      source = source_for_handle_partial_sequence;
	      // Bytes still buffered: leave the outer loop.
	      if (partial_sequence_size_)
	        break;
	    }

	    while (source < end) {
	      if (IsASCII(*source)) {
	        // Fast path for ASCII. Most UTF-8 text will be ASCII.
	        if (IsAlignedToMachineWord(source)) {
	          // Copy one machine word at a time while every byte in it is ASCII.
	          while (source < aligned_end) {
	            // Load the word that |source| points at (not the pointer itself).
	            MachineWord chunk =
	                *reinterpret_cast_ptr<const MachineWord*>(source);
	            if (!IsAllASCII<LChar>(chunk))
	              break;
	            CopyASCIIMachineWord(destination, source);
	            source += sizeof(MachineWord);
	            destination += sizeof(MachineWord);
	          }
	          if (source == end)
	            break;
	          if (!IsASCII(*source))
	            continue;
	        }
	        // Copy the byte itself and advance both cursors.
	        *destination++ = *source++;
	        continue;
	      }
	      int count = NonASCIISequenceLength(*source);
	      int character;
	      if (count == 0) {
	        // The byte cannot start a sequence: a one-byte error.
	        character = kNonCharacter1;
	      } else {
	        if (count > end - source) {
	          // The sequence runs past the end of the input: stash the bytes
	          // seen so far in |partial_sequence_|.
	          SECURITY_DCHECK(end - source <
	                          static_cast<ptrdiff_t>(sizeof(partial_sequence_)));
	          DCHECK(!partial_sequence_size_);
	          partial_sequence_size_ = end - source;
	          memcpy(partial_sequence_, source, partial_sequence_size_);
	          source = end;
	          break;
	        }
	        character = DecodeNonASCIISequence(source, count);
	      }
	      if (IsNonCharacter(character)) {
	        saw_error = true;
	        if (stop_on_error)
	          break;

	        // |source| is left at the bad sequence so that the 16-bit loop
	        // decodes it again and writes the replacement character.
	        goto upConvertTo16Bit;
	      }
	      // Does not fit in an LChar; |source| is left at this sequence so the
	      // 16-bit loop decodes it again.
	      if (character > 0xff)
	        goto upConvertTo16Bit;

	      source += count;
	      *destination++ = static_cast<LChar>(character);
	    }
	  } while (flush && partial_sequence_size_);

	  buffer.Shrink(destination - buffer.Characters());

	  return String::Adopt(buffer);

	  // ---- 16-bit phase ----
	upConvertTo16Bit:
	  StringBuffer<UChar> buffer16(partial_sequence_size_ + length);

	  UChar* destination16 = buffer16.Characters();

	  // Copy the already converted characters
	  for (LChar* converted8 = buffer.Characters(); converted8 < destination;)
	    *destination16++ = *converted8++;

	  do {
	    if (partial_sequence_size_) {
	      // Explicitly copy destination and source pointers to avoid taking
	      // pointers to the local variables, which may harm code generation by
	      // disabling some optimizations in some compilers.
	      UChar* destination_for_handle_partial_sequence = destination16;
	      const uint8_t* source_for_handle_partial_sequence = source;
	      HandlePartialSequence(destination_for_handle_partial_sequence,
	                            source_for_handle_partial_sequence, end, flush,
	                            stop_on_error, saw_error);
	      destination16 = destination_for_handle_partial_sequence;
	      source = source_for_handle_partial_sequence;
	      // Bytes still buffered: leave the outer loop.
	      if (partial_sequence_size_)
	        break;
	    }

	    while (source < end) {
	      if (IsASCII(*source)) {
	        // Fast path for ASCII. Most UTF-8 text will be ASCII.
	        if (IsAlignedToMachineWord(source)) {
	          // Copy one machine word at a time while every byte in it is ASCII.
	          while (source < aligned_end) {
	            // Load the word that |source| points at (not the pointer itself).
	            MachineWord chunk =
	                *reinterpret_cast_ptr<const MachineWord*>(source);
	            if (!IsAllASCII<LChar>(chunk))
	              break;
	            CopyASCIIMachineWord(destination16, source);
	            source += sizeof(MachineWord);
	            destination16 += sizeof(MachineWord);
	          }
	          if (source == end)
	            break;
	          if (!IsASCII(*source))
	            continue;
	        }
	        // Copy the byte itself and advance both cursors.
	        *destination16++ = *source++;
	        continue;
	      }
	      int count = NonASCIISequenceLength(*source);
	      int character;
	      if (count == 0) {
	        // The byte cannot start a sequence: a one-byte error.
	        character = kNonCharacter1;
	      } else {
	        if (count > end - source) {
	          // The sequence runs past the end of the input: stash the bytes
	          // seen so far in |partial_sequence_|.
	          SECURITY_DCHECK(end - source <
	                          static_cast<ptrdiff_t>(sizeof(partial_sequence_)));
	          DCHECK(!partial_sequence_size_);
	          partial_sequence_size_ = end - source;
	          memcpy(partial_sequence_, source, partial_sequence_size_);
	          source = end;
	          break;
	        }
	        character = DecodeNonASCIISequence(source, count);
	      }
	      if (IsNonCharacter(character)) {
	        saw_error = true;
	        if (stop_on_error)
	          break;
	        // Each error generates one replacement character and consumes the
	        // 'largest subpart' of the incomplete character.
	        // The kNonCharacterN sentinels range from -1 to -3 and hold the
	        // negated number of bytes making up the broken encoding, so
	        // subtracting |character| from |source| advances it by that many
	        // bytes.
	        *destination16++ = kReplacementCharacter;
	        source -= character;
	        continue;
	      }
	      source += count;
	      destination16 = AppendCharacter(destination16, character);
	    }
	  } while (flush && partial_sequence_size_);

	  buffer16.Shrink(destination16 - buffer16.Characters());

	  return String::Adopt(buffer16);
	}

	// Encodes |length| code units at |characters| (LChar or UChar) as UTF-8 and
	// returns the bytes as a CString. Surrogate code points produced by
	// U16_NEXT, i.e. unpaired surrogates in the input, are encoded as
	// kReplacementCharacter.
	template <typename CharType>
	CString TextCodecUTF8::EncodeCommon(const CharType* characters, size_t length) {
	  // Worst-case output size is 3 bytes per input code unit: a BMP character
	  // is one UTF-16 code unit and needs at most 3 bytes (3x), while a non-BMP
	  // character is two code units and needs 4 bytes (2x).
	  CHECK_LE(length, std::numeric_limits<size_t>::max() / 3);
	  Vector<uint8_t> bytes(length * 3);

	  size_t output_length = 0;
	  for (size_t index = 0; index < length;) {
	    // U16_NEXT advances |index| past the code unit(s) it reads.
	    UChar32 code_point;
	    U16_NEXT(characters, index, length, code_point);
	    // For an unmatched surrogate U16_NEXT simply yields the surrogate code
	    // point; replace it with U+FFFD (REPLACEMENT CHARACTER).
	    if (U_IS_SURROGATE(code_point))
	      code_point = kReplacementCharacter;
	    U8_APPEND_UNSAFE(bytes.data(), output_length, code_point);
	  }

	  return CString(reinterpret_cast<char*>(bytes.data()), output_length);
	}

	// Encodes 16-bit input as UTF-8. The UnencodableHandling argument is ignored.
	CString TextCodecUTF8::Encode(const UChar* characters,
	                              size_t length,
	                              UnencodableHandling) {
	  return EncodeCommon<UChar>(characters, length);
	}

	// Encodes 8-bit input as UTF-8. The UnencodableHandling argument is ignored.
	CString TextCodecUTF8::Encode(const LChar* characters,
	                              size_t length,
	                              UnencodableHandling) {
	  return EncodeCommon<LChar>(characters, length);
	}

	} // namespace WTF