blob: a1c57cb63c1fc3fe09dc7b9ffce353b5ee32ba37 [file] [log] [blame]
/*
* Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "platform/wtf/text/TextCodecUTF8.h"
#include "platform/wtf/PtrUtil.h"
#include "platform/wtf/text/CString.h"
#include "platform/wtf/text/CharacterNames.h"
#include "platform/wtf/text/StringBuffer.h"
#include "platform/wtf/text/TextCodecASCIIFastPath.h"
#include <memory>
namespace WTF {
using namespace WTF::Unicode;
// We'll use nonCharacter* constants to signal invalid utf-8.
// The number in the name signals how many input bytes were invalid.
const int kNonCharacter1 = -1;
const int kNonCharacter2 = -2;
const int kNonCharacter3 = -3;
bool IsNonCharacter(int character) {
return character >= kNonCharacter3 && character <= kNonCharacter1;
}
std::unique_ptr<TextCodec> TextCodecUTF8::Create(const TextEncoding&,
const void*) {
return WTF::WrapUnique(new TextCodecUTF8);
}
void TextCodecUTF8::RegisterEncodingNames(EncodingNameRegistrar registrar) {
registrar("UTF-8", "UTF-8");
// Additional aliases that originally were present in the encoding
// table in WebKit on Macintosh, and subsequently added by
// TextCodecICU. Perhaps we can prove some are not used on the web
// and remove them.
registrar("unicode11utf8", "UTF-8");
registrar("unicode20utf8", "UTF-8");
registrar("utf8", "UTF-8");
registrar("x-unicode20utf8", "UTF-8");
// Additional aliases present in the WHATWG Encoding Standard
// (http://encoding.spec.whatwg.org/)
// and Firefox (24), but not in ICU 4.6.
registrar("unicode-1-1-utf-8", "UTF-8");
}
void TextCodecUTF8::RegisterCodecs(TextCodecRegistrar registrar) {
registrar("UTF-8", Create, nullptr);
}
static inline int NonASCIISequenceLength(uint8_t first_byte) {
static const uint8_t kLengths[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
return kLengths[first_byte];
}
static inline int DecodeNonASCIISequence(const uint8_t* sequence,
unsigned length) {
DCHECK(!IsASCII(sequence[0]));
if (length == 2) {
DCHECK_LE(sequence[0], 0xDF);
if (sequence[0] < 0xC2)
return kNonCharacter1;
if (sequence[1] < 0x80 || sequence[1] > 0xBF)
return kNonCharacter1;
return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
}
if (length == 3) {
DCHECK_GE(sequence[0], 0xE0);
DCHECK_LE(sequence[0], 0xEF);
switch (sequence[0]) {
case 0xE0:
if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
return kNonCharacter1;
break;
case 0xED:
if (sequence[1] < 0x80 || sequence[1] > 0x9F)
return kNonCharacter1;
break;
default:
if (sequence[1] < 0x80 || sequence[1] > 0xBF)
return kNonCharacter1;
}
if (sequence[2] < 0x80 || sequence[2] > 0xBF)
return kNonCharacter2;
return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) -
0x000E2080;
}
DCHECK_EQ(length, 4u);
DCHECK_GE(sequence[0], 0xF0);
DCHECK_LE(sequence[0], 0xF4);
switch (sequence[0]) {
case 0xF0:
if (sequence[1] < 0x90 || sequence[1] > 0xBF)
return kNonCharacter1;
break;
case 0xF4:
if (sequence[1] < 0x80 || sequence[1] > 0x8F)
return kNonCharacter1;
break;
default:
if (sequence[1] < 0x80 || sequence[1] > 0xBF)
return kNonCharacter1;
}
if (sequence[2] < 0x80 || sequence[2] > 0xBF)
return kNonCharacter2;
if (sequence[3] < 0x80 || sequence[3] > 0xBF)
return kNonCharacter3;
return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) +
sequence[3]) -
0x03C82080;
}
static inline UChar* AppendCharacter(UChar* destination, int character) {
DCHECK(!IsNonCharacter(character));
DCHECK(!U_IS_SURROGATE(character));
if (U_IS_BMP(character)) {
*destination++ = static_cast<UChar>(character);
} else {
*destination++ = U16_LEAD(character);
*destination++ = U16_TRAIL(character);
}
return destination;
}
void TextCodecUTF8::ConsumePartialSequenceBytes(int num_bytes) {
DCHECK_GE(partial_sequence_size_, num_bytes);
partial_sequence_size_ -= num_bytes;
memmove(partial_sequence_, partial_sequence_ + num_bytes,
partial_sequence_size_);
}
void TextCodecUTF8::HandleError(int character,
UChar*& destination,
bool stop_on_error,
bool& saw_error) {
saw_error = true;
if (stop_on_error)
return;
// Each error generates a replacement character and consumes 1-3 bytes.
*destination++ = kReplacementCharacter;
DCHECK(IsNonCharacter(character));
int num_bytes_consumed = -character;
DCHECK_GE(num_bytes_consumed, 1);
DCHECK_LE(num_bytes_consumed, 3);
ConsumePartialSequenceBytes(num_bytes_consumed);
}
template <>
bool TextCodecUTF8::HandlePartialSequence<LChar>(LChar*& destination,
const uint8_t*& source,
const uint8_t* end,
bool flush,
bool,
bool&) {
DCHECK(partial_sequence_size_);
do {
if (IsASCII(partial_sequence_[0])) {
*destination++ = partial_sequence_[0];
ConsumePartialSequenceBytes(1);
continue;
}
int count = NonASCIISequenceLength(partial_sequence_[0]);
if (!count)
return true;
if (count > partial_sequence_size_) {
if (count - partial_sequence_size_ > end - source) {
if (!flush) {
// The new data is not enough to complete the sequence, so
// add it to the existing partial sequence.
memcpy(partial_sequence_ + partial_sequence_size_, source,
end - source);
partial_sequence_size_ += end - source;
return false;
}
// An incomplete partial sequence at the end is an error, but it will
// create a 16 bit string due to the replacementCharacter. Let the 16
// bit path handle the error.
return true;
}
memcpy(partial_sequence_ + partial_sequence_size_, source,
count - partial_sequence_size_);
source += count - partial_sequence_size_;
partial_sequence_size_ = count;
}
int character = DecodeNonASCIISequence(partial_sequence_, count);
if (character & ~0xff)
return true;
partial_sequence_size_ -= count;
*destination++ = static_cast<LChar>(character);
} while (partial_sequence_size_);
return false;
}
template <>
bool TextCodecUTF8::HandlePartialSequence<UChar>(UChar*& destination,
const uint8_t*& source,
const uint8_t* end,
bool flush,
bool stop_on_error,
bool& saw_error) {
DCHECK(partial_sequence_size_);
do {
if (IsASCII(partial_sequence_[0])) {
*destination++ = partial_sequence_[0];
ConsumePartialSequenceBytes(1);
continue;
}
int count = NonASCIISequenceLength(partial_sequence_[0]);
if (!count) {
HandleError(kNonCharacter1, destination, stop_on_error, saw_error);
if (stop_on_error)
return false;
continue;
}
if (count > partial_sequence_size_) {
if (count - partial_sequence_size_ > end - source) {
if (!flush) {
// The new data is not enough to complete the sequence, so
// add it to the existing partial sequence.
memcpy(partial_sequence_ + partial_sequence_size_, source,
end - source);
partial_sequence_size_ += end - source;
return false;
}
// An incomplete partial sequence at the end is an error.
HandleError(kNonCharacter1, destination, stop_on_error, saw_error);
if (stop_on_error)
return false;
continue;
}
memcpy(partial_sequence_ + partial_sequence_size_, source,
count - partial_sequence_size_);
source += count - partial_sequence_size_;
partial_sequence_size_ = count;
}
int character = DecodeNonASCIISequence(partial_sequence_, count);
if (IsNonCharacter(character)) {
HandleError(character, destination, stop_on_error, saw_error);
if (stop_on_error)
return false;
continue;
}
partial_sequence_size_ -= count;
destination = AppendCharacter(destination, character);
} while (partial_sequence_size_);
return false;
}
String TextCodecUTF8::Decode(const char* bytes,
size_t length,
FlushBehavior flush,
bool stop_on_error,
bool& saw_error) {
// Each input byte might turn into a character.
// That includes all bytes in the partial-sequence buffer because
// each byte in an invalid sequence will turn into a replacement character.
StringBuffer<LChar> buffer(partial_sequence_size_ + length);
const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
const uint8_t* end = source + length;
const uint8_t* aligned_end = AlignToMachineWord(end);
LChar* destination = buffer.Characters();
do {
if (partial_sequence_size_) {
// Explicitly copy destination and source pointers to avoid taking
// pointers to the local variables, which may harm code generation by
// disabling some optimizations in some compilers.
LChar* destination_for_handle_partial_sequence = destination;
const uint8_t* source_for_handle_partial_sequence = source;
if (HandlePartialSequence(destination_for_handle_partial_sequence,
source_for_handle_partial_sequence, end, flush,
stop_on_error, saw_error)) {
source = source_for_handle_partial_sequence;
goto upConvertTo16Bit;
}
destination = destination_for_handle_partial_sequence;
source = source_for_handle_partial_sequence;
if (partial_sequence_size_)
break;
}
while (source < end) {
if (IsASCII(*source)) {
// Fast path for ASCII. Most UTF-8 text will be ASCII.
if (IsAlignedToMachineWord(source)) {
while (source < aligned_end) {
MachineWord chunk =
*reinterpret_cast_ptr<const MachineWord*>(source);
if (!IsAllASCII<LChar>(chunk))
break;
CopyASCIIMachineWord(destination, source);
source += sizeof(MachineWord);
destination += sizeof(MachineWord);
}
if (source == end)
break;
if (!IsASCII(*source))
continue;
}
*destination++ = *source++;
continue;
}
int count = NonASCIISequenceLength(*source);
int character;
if (count == 0) {
character = kNonCharacter1;
} else {
if (count > end - source) {
SECURITY_DCHECK(end - source <
static_cast<ptrdiff_t>(sizeof(partial_sequence_)));
DCHECK(!partial_sequence_size_);
partial_sequence_size_ = end - source;
memcpy(partial_sequence_, source, partial_sequence_size_);
source = end;
break;
}
character = DecodeNonASCIISequence(source, count);
}
if (IsNonCharacter(character)) {
saw_error = true;
if (stop_on_error)
break;
goto upConvertTo16Bit;
}
if (character > 0xff)
goto upConvertTo16Bit;
source += count;
*destination++ = static_cast<LChar>(character);
}
} while (flush && partial_sequence_size_);
buffer.Shrink(destination - buffer.Characters());
return String::Adopt(buffer);
upConvertTo16Bit:
StringBuffer<UChar> buffer16(partial_sequence_size_ + length);
UChar* destination16 = buffer16.Characters();
// Copy the already converted characters
for (LChar* converted8 = buffer.Characters(); converted8 < destination;)
*destination16++ = *converted8++;
do {
if (partial_sequence_size_) {
// Explicitly copy destination and source pointers to avoid taking
// pointers to the local variables, which may harm code generation by
// disabling some optimizations in some compilers.
UChar* destination_for_handle_partial_sequence = destination16;
const uint8_t* source_for_handle_partial_sequence = source;
HandlePartialSequence(destination_for_handle_partial_sequence,
source_for_handle_partial_sequence, end, flush,
stop_on_error, saw_error);
destination16 = destination_for_handle_partial_sequence;
source = source_for_handle_partial_sequence;
if (partial_sequence_size_)
break;
}
while (source < end) {
if (IsASCII(*source)) {
// Fast path for ASCII. Most UTF-8 text will be ASCII.
if (IsAlignedToMachineWord(source)) {
while (source < aligned_end) {
MachineWord chunk =
*reinterpret_cast_ptr<const MachineWord*>(source);
if (!IsAllASCII<LChar>(chunk))
break;
CopyASCIIMachineWord(destination16, source);
source += sizeof(MachineWord);
destination16 += sizeof(MachineWord);
}
if (source == end)
break;
if (!IsASCII(*source))
continue;
}
*destination16++ = *source++;
continue;
}
int count = NonASCIISequenceLength(*source);
int character;
if (count == 0) {
character = kNonCharacter1;
} else {
if (count > end - source) {
SECURITY_DCHECK(end - source <
static_cast<ptrdiff_t>(sizeof(partial_sequence_)));
DCHECK(!partial_sequence_size_);
partial_sequence_size_ = end - source;
memcpy(partial_sequence_, source, partial_sequence_size_);
source = end;
break;
}
character = DecodeNonASCIISequence(source, count);
}
if (IsNonCharacter(character)) {
saw_error = true;
if (stop_on_error)
break;
// Each error generates one replacement character and consumes the
// 'largest subpart' of the incomplete character.
// Note that the nonCharacterX constants go from -1..-3 and contain
// the negative of number of bytes comprising the broken encoding
// detected. So subtracting c (when isNonCharacter(c)) adds the number
// of broken bytes.
*destination16++ = kReplacementCharacter;
source -= character;
continue;
}
source += count;
destination16 = AppendCharacter(destination16, character);
}
} while (flush && partial_sequence_size_);
buffer16.Shrink(destination16 - buffer16.Characters());
return String::Adopt(buffer16);
}
template <typename CharType>
CString TextCodecUTF8::EncodeCommon(const CharType* characters, size_t length) {
// The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
// BMP characters take only one UTF-16 code unit and can take up to 3 bytes
// (3x).
// Non-BMP characters take two UTF-16 code units and can take up to 4 bytes
// (2x).
CHECK_LE(length, std::numeric_limits<size_t>::max() / 3);
Vector<uint8_t> bytes(length * 3);
size_t i = 0;
size_t bytes_written = 0;
while (i < length) {
UChar32 character;
U16_NEXT(characters, i, length, character);
// U16_NEXT will simply emit a surrogate code point if an unmatched
// surrogate is encountered; we must convert it to a
// U+FFFD (REPLACEMENT CHARACTER) here.
if (0xD800 <= character && character <= 0xDFFF)
character = kReplacementCharacter;
U8_APPEND_UNSAFE(bytes.data(), bytes_written, character);
}
return CString(reinterpret_cast<char*>(bytes.data()), bytes_written);
}
CString TextCodecUTF8::Encode(const UChar* characters,
size_t length,
UnencodableHandling) {
return EncodeCommon(characters, length);
}
CString TextCodecUTF8::Encode(const LChar* characters,
size_t length,
UnencodableHandling) {
return EncodeCommon(characters, length);
}
} // namespace WTF