// blob: e03eb24ddc6d07d3a01762c033657ca3559cfd47 [file] [log] [blame]
/*
* Copyright (C) 2007 Apple Inc. All rights reserved.
* Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "third_party/blink/renderer/platform/wtf/text/utf8.h"
#include <unicode/utf16.h>
#include <array>
#include "base/check.h"
#include "base/notreached.h"
#include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
#include "third_party/blink/renderer/platform/wtf/text/string_hasher.h"
namespace blink::unicode {
namespace {
// Returns the total byte length (2, 3 or 4) of the UTF-8 sequence introduced
// by the non-ASCII lead byte |b0|, or 0 when |b0| cannot start a multi-byte
// sequence (ASCII, a continuation byte, or 0xF8 and above).
//
// Only the bit pattern of the lead byte is inspected here; 0xC0 and 0xC1 are
// still classified as two-byte leads by this function.
inline size_t InlineUtf8SequenceLengthNonAscii(uint8_t b0) {
  // Anything below 0xC0 lacks the "11" prefix every multi-byte lead carries.
  if (b0 < 0xC0) {
    return 0;
  }
  // 110xxxxx
  if (b0 < 0xE0) {
    return 2;
  }
  // 1110xxxx
  if (b0 < 0xF0) {
    return 3;
  }
  // 11110xxx
  if (b0 < 0xF8) {
    return 4;
  }
  // 11111xxx: never a valid lead byte.
  return 0;
}
// Returns the total byte length of the UTF-8 sequence that starts with |b0|:
// 1 for ASCII, otherwise whatever InlineUtf8SequenceLengthNonAscii() reports
// (2-4, or 0 if |b0| cannot start a sequence).
inline size_t InlineUtf8SequenceLength(uint8_t b0) {
  if (IsASCII(b0)) {
    return 1;
  }
  return InlineUtf8SequenceLengthNonAscii(b0);
}
// Marker bits OR-ed into the lead byte of an encoded sequence, indexed by the
// total number of bytes in that sequence. Index 0 is a placeholder so that
// the byte count can be used as the index directly. Legal UTF-8 never exceeds
// four bytes, so the encoders in this file only read indices 1 through 4.
static constexpr std::array<uint8_t, 7> kFirstByteMark = {
    0b00000000,  // [0] placeholder, no such sequence
    0b00000000,  // [1] single byte: 0xxxxxxx
    0b11000000,  // [2] two bytes:   110xxxxx
    0b11100000,  // [3] three bytes: 1110xxxx
    0b11110000,  // [4] four bytes:  11110xxx
    0b11111000,  // [5] five bytes:  111110xx (not legal UTF-8)
    0b11111100,  // [6] six bytes:   1111110x (not legal UTF-8)
};
// Encodes the Latin-1 characters in |source| as UTF-8 into |target|.
//
// Characters below 0x80 are emitted as one byte; everything else is emitted as
// a two-byte sequence. Conversion stops early with kTargetExhausted as soon as
// the next character would not fit completely into |target|; that character
// is then left unconsumed.
//
// On return both spans have been advanced: |source| starts at the first
// character that was not converted and |target| starts at the first byte that
// was not written.
ConversionStatus ConvertLatin1ToUtf8Internal(base::span<const LChar>& source,
                                             base::span<uint8_t>& target) {
  const UChar32 kContinuationMark = 0x80;
  const UChar32 kContinuationMask = 0xBF;
  const size_t capacity = target.size();

  ConversionStatus status = kConversionOK;
  size_t read_pos = 0;
  size_t write_pos = 0;
  for (; read_pos < source.size(); ++read_pos) {
    UChar32 code_point = static_cast<UChar32>(source[read_pos]);
    const uint8_t encoded_length =
        code_point < static_cast<UChar32>(0x80) ? 1 : 2;

    // Never emit a partial sequence: bail out before touching |target| and
    // leave |read_pos| pointing at the character that did not fit.
    if (write_pos + encoded_length > capacity) {
      status = kTargetExhausted;
      break;
    }

    if (encoded_length == 2) {
      // Trailing byte: 10xxxxxx carrying the low six bits.
      target[write_pos + 1] = static_cast<uint8_t>(
          (code_point | kContinuationMark) & kContinuationMask);
      code_point >>= 6;
    }
    // Lead byte: the remaining bits combined with the length marker.
    target[write_pos] =
        static_cast<uint8_t>(code_point | kFirstByteMark[encoded_length]);
    write_pos += encoded_length;
  }

  source = source.subspan(read_pos);
  target = target.subspan(write_pos);
  return status;
}
// Encodes the UTF-16 code units in |source| as UTF-8 into |target|.
//
// Surrogate handling:
//  * A high surrogate followed by a low surrogate is combined into one
//    supplementary code point and emitted as four bytes.
//  * A high surrogate that is the very last code unit of |source| stops the
//    conversion with kSourceExhausted, regardless of |strict|.
//  * Any other unpaired surrogate stops the conversion with kSourceIllegal
//    when |strict| is true; otherwise it is encoded like an ordinary BMP
//    value (three bytes).
//
// Conversion stops with kTargetExhausted when the next code point would not
// fit completely into |target|. Whenever conversion stops early, the
// offending code unit (or pair) is left unconsumed.
//
// On return both spans have been advanced past what was consumed / written.
ConversionStatus ConvertUtf16ToUtf8Internal(base::span<const UChar>& source,
                                            base::span<uint8_t>& target,
                                            bool strict) {
  const UChar32 kContinuationMark = 0x80;
  const UChar32 kContinuationMask = 0xBF;
  const size_t source_length = source.size();
  const size_t capacity = target.size();

  ConversionStatus status = kConversionOK;
  size_t read_pos = 0;
  size_t write_pos = 0;
  while (read_pos < source_length) {
    UChar32 code_point = static_cast<UChar32>(source[read_pos]);
    // Number of UTF-16 code units this iteration will consume on success.
    size_t units_consumed = 1;

    const bool is_high_surrogate = code_point >= 0xD800 && code_point <= 0xDBFF;
    if (is_high_surrogate) {
      const size_t next_pos = read_pos + 1;
      if (next_pos >= source_length) {
        // The second half of the pair is not in the buffer.
        status = kSourceExhausted;
        break;
      }
      const UChar32 next_unit = static_cast<UChar32>(source[next_pos]);
      if (next_unit >= 0xDC00 && next_unit <= 0xDFFF) {
        // Well-formed pair: fold both halves into one code point.
        code_point = ((code_point - 0xD800) << 10) + (next_unit - 0xDC00) +
                     0x0010000;
        units_consumed = 2;
      } else if (strict) {
        // Unpaired high surrogate.
        status = kSourceIllegal;
        break;
      }
    } else if (strict && code_point >= 0xDC00 && code_point <= 0xDFFF) {
      // Low surrogate without a preceding high surrogate.
      status = kSourceIllegal;
      break;
    }

    // Work out how many bytes the encoded form occupies.
    uint8_t encoded_length = 0;
    if (code_point < static_cast<UChar32>(0x80)) {
      encoded_length = 1;
    } else if (code_point < static_cast<UChar32>(0x800)) {
      encoded_length = 2;
    } else if (code_point < static_cast<UChar32>(0x10000)) {
      encoded_length = 3;
    } else if (code_point < static_cast<UChar32>(0x110000)) {
      encoded_length = 4;
    } else {
      // A surrogate pair tops out at 0x10FFFF, so nothing larger can be
      // produced above.
      NOTREACHED();
    }

    // Never emit a partial sequence; leave the input position untouched so
    // the caller can retry with a bigger buffer.
    if (write_pos + encoded_length > capacity) {
      status = kTargetExhausted;
      break;
    }

    // Fill the continuation bytes from the back, six bits at a time...
    for (size_t i = encoded_length; i > 1; --i) {
      target[write_pos + i - 1] = static_cast<uint8_t>(
          (code_point | kContinuationMark) & kContinuationMask);
      code_point >>= 6;
    }
    // ...then the lead byte, which carries the length marker.
    target[write_pos] =
        static_cast<uint8_t>(code_point | kFirstByteMark[encoded_length]);

    read_pos += units_consumed;
    write_pos += encoded_length;
  }

  source = source.subspan(read_pos);
  target = target.subspan(write_pos);
  return status;
}
// Returns true if |source| holds exactly one well-formed UTF-8 sequence.
//
// The caller must already have sliced |source| to the length implied by its
// first byte; this function does not cross-check the span length against the
// lead byte. Spans that are empty or longer than four bytes are rejected,
// since the Unicode definition of UTF-8 stops at four-byte sequences.
bool IsLegalUtf8(const base::span<const uint8_t> source) {
  const size_t length = source.size();
  if (length < 1 || length > 4) {
    return false;
  }
  const uint8_t lead = source[0];

  // The third and fourth bytes, when present, must be plain continuation
  // bytes (10xxxxxx).
  for (size_t i = length; i > 2; --i) {
    const uint8_t trail = source[i - 1];
    if (trail < 0x80 || trail > 0xBF) {
      return false;
    }
  }

  // The allowed range of the second byte is narrowed for a few lead bytes, so
  // that overlong forms, surrogates and values above U+10FFFF are rejected.
  if (length >= 2) {
    uint8_t lowest = 0x80;
    uint8_t highest = 0xBF;
    if (lead == 0xE0) {
      lowest = 0xA0;
    } else if (lead == 0xED) {
      highest = 0x9F;
    } else if (lead == 0xF0) {
      lowest = 0x90;
    } else if (lead == 0xF4) {
      highest = 0x8F;
    }
    const uint8_t second = source[1];
    if (second < lowest || second > highest) {
      return false;
    }
  }

  // 0x80..0xC1 can never be a lead byte (continuation bytes and overlong
  // two-byte leads); neither can anything above 0xF4.
  if (lead >= 0x80 && lead < 0xC2) {
    return false;
  }
  return lead <= 0xF4;
}
// Decodes the code point stored in the first |length| bytes of |source|.
// |length| is expected to be between 1 and 4 inclusive (DCHECKed). No
// validation of the bytes is performed here; the payload bits are simply
// extracted and concatenated.
inline UChar32 ReadUtf8Sequence(base::span<const uint8_t> source,
                                size_t length) {
  DCHECK_LT(0u, length);
  DCHECK_GT(5u, length);
  switch (length) {
    case 1:
      // 0xxxxxxx
      return source[0];
    case 2:
      // 110xxxxx 10xxxxxx
      return ((source[0] & 0x1F) << 6) | (source[1] & 0x3F);
    case 3:
      // 1110xxxx 10xxxxxx 10xxxxxx
      return ((source[0] & 0x0F) << 12) | ((source[1] & 0x3F) << 6) |
             (source[2] & 0x3F);
    default:
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      return ((source[0] & 0x07) << 18) | ((source[1] & 0x3F) << 12) |
             ((source[2] & 0x3F) << 6) | (source[3] & 0x3F);
  }
}
// Decodes UTF-8 from |source| into UTF-16 code units in |target|.
//
// Both spans are advanced as data is converted, so on return |source| starts
// at the first unconsumed byte and |target| at the first unwritten code unit.
// A sequence is consumed only once its full output has been written.
//
// Returns:
//   kConversionOK     - all of |source| was converted.
//   kSourceExhausted  - |source| ends in the middle of a sequence.
//   kSourceIllegal    - the next sequence is not well-formed UTF-8 (checked
//                       in both strict and lenient mode), or it decodes to a
//                       surrogate while |strict| is true.
//   kTargetExhausted  - |target| has no room for the next code point.
ConversionStatus ConvertUtf8ToUtf16Internal(base::span<const uint8_t>& source,
                                            base::span<UChar>& target,
                                            bool strict) {
  using MachineWord = uintptr_t;
  constexpr size_t kBytesPerWord = sizeof(MachineWord);
  // One high bit per byte; a word with none of these set is pure ASCII.
  constexpr MachineWord kNonAsciiBits =
      (kBytesPerWord == 8) ? 0x8080808080808080ULL : 0x80808080UL;
  constexpr uintptr_t kMisalignmentBits = kBytesPerWord - 1;

  while (!source.empty()) {
    // Fast path: when a whole word-aligned machine word of input is available
    // (and there is room for as many output units), test all of its bytes
    // for ASCII at once.
    if (source.size() >= kBytesPerWord && target.size() >= kBytesPerWord &&
        (reinterpret_cast<uintptr_t>(source.data()) & kMisalignmentBits) ==
            0) {
      const MachineWord chunk =
          *reinterpret_cast<const MachineWord*>(source.data());
      if (!(chunk & kNonAsciiBits)) {
        // Every byte is ASCII, so each one widens to exactly one code unit.
        for (size_t offset = 0; offset < kBytesPerWord; ++offset) {
          target[offset] = source[offset];
        }
        source = source.subspan(kBytesPerWord);
        target = target.subspan(kBytesPerWord);
        continue;
      }
    }

    // Slow path: decode a single sequence.
    const size_t sequence_length = InlineUtf8SequenceLength(source[0]);
    if (source.size() < sequence_length) {
      return kSourceExhausted;
    }
    // Well-formedness is enforced regardless of |strict|.
    if (!IsLegalUtf8(source.first(sequence_length))) {
      return kSourceIllegal;
    }

    const UChar32 code_point = ReadUtf8Sequence(source, sequence_length);
    if (U_IS_BMP(code_point)) {
      if (target.empty()) {
        return kTargetExhausted;
      }
      if (!U_IS_SURROGATE(code_point)) {
        target[0] = static_cast<UChar>(code_point);
      } else if (strict) {
        // Surrogate values are not valid scalar values.
        return kSourceIllegal;
      } else {
        target[0] = blink::uchar::kReplacementCharacter;
      }
      source = source.subspan(sequence_length);
      target = target.subspan(1u);
    } else if (U_IS_SUPPLEMENTARY(code_point)) {
      // Code points in 0x10000 - 0x10FFFF need a surrogate pair.
      if (target.size() < 2u) {
        return kTargetExhausted;
      }
      target[0] = U16_LEAD(code_point);
      target[1] = U16_TRAIL(code_point);
      source = source.subspan(sequence_length);
      target = target.subspan(2u);
    } else {
      // InlineUtf8SequenceLength() never reports more than 4, and a legal
      // four-byte sequence cannot encode anything above 0x10FFFF.
      NOTREACHED();
    }
  }
  return kConversionOK;
}
} // namespace
// Converts Latin-1 |source| to UTF-8, writing into |target|.
//
// The result holds, in order: the prefix of |target| that was actually
// written, the number of |source| characters consumed, and the conversion
// status reported by ConvertLatin1ToUtf8Internal().
ConversionResult<uint8_t> ConvertLatin1ToUtf8(base::span<const LChar> source,
                                              base::span<uint8_t> target) {
  // The internal helper advances both spans in place, so remember where they
  // started in order to report how far it got.
  const size_t source_size_before = source.size();
  const base::span<uint8_t> whole_target = target;

  const auto status = ConvertLatin1ToUtf8Internal(source, target);

  const size_t bytes_written = whole_target.size() - target.size();
  const size_t chars_consumed = source_size_before - source.size();
  return {
      whole_target.first(bytes_written),
      chars_consumed,
      status,
  };
}
// Converts UTF-16 |source| to UTF-8, writing into |target|. |strict| is
// forwarded to ConvertUtf16ToUtf8Internal() and controls whether unpaired
// surrogates are rejected.
//
// The result holds, in order: the prefix of |target| that was actually
// written, the number of |source| code units consumed, and the conversion
// status.
ConversionResult<uint8_t> ConvertUtf16ToUtf8(base::span<const UChar> source,
                                             base::span<uint8_t> target,
                                             bool strict) {
  // The internal helper advances both spans in place, so remember where they
  // started in order to report how far it got.
  const size_t source_size_before = source.size();
  const base::span<uint8_t> whole_target = target;

  const auto status = ConvertUtf16ToUtf8Internal(source, target, strict);

  const size_t bytes_written = whole_target.size() - target.size();
  const size_t units_consumed = source_size_before - source.size();
  return {
      whole_target.first(bytes_written),
      units_consumed,
      status,
  };
}
// Converts UTF-8 |source| to UTF-16, writing into |target|. |strict| is
// forwarded to ConvertUtf8ToUtf16Internal().
//
// The result holds, in order: the prefix of |target| that was actually
// written, the number of |source| bytes consumed, and the conversion status.
ConversionResult<UChar> ConvertUtf8ToUtf16(base::span<const uint8_t> source,
                                           base::span<UChar> target,
                                           bool strict) {
  // The internal helper advances both spans in place, so remember where they
  // started in order to report how far it got.
  const size_t source_size_before = source.size();
  const base::span<UChar> whole_target = target;

  const auto status = ConvertUtf8ToUtf16Internal(source, target, strict);

  const size_t units_written = whole_target.size() - target.size();
  const size_t bytes_consumed = source_size_before - source.size();
  return {
      whole_target.first(units_written),
      bytes_consumed,
      status,
  };
}
// Computes how many UTF-16 code units are needed to represent the UTF-8 text
// in |data|, validating it along the way.
//
// Out parameters (always reset to false on entry):
//   |seen_non_ascii|  - set once a byte outside the ASCII range is seen.
//   |seen_non_latin1| - set once a code point above 0xFF is decoded.
//
// Returns the UTF-16 length, or 0 if |data| is empty, is truncated in the
// middle of a sequence, or contains an ill-formed sequence. Note that on the
// failure path the out parameters may already have been set by the bytes
// examined before the failure.
unsigned CalculateStringLengthFromUtf8(base::span<const uint8_t> data,
                                       bool& seen_non_ascii,
                                       bool& seen_non_latin1) {
  seen_non_ascii = false;
  seen_non_latin1 = false;

  unsigned utf16_length = 0;
  size_t data_cursor = 0;
  const size_t data_end = data.size();
  // An empty |data| skips the loop entirely and yields 0.
  while (data_cursor < data_end) {
    // ASCII bytes map one-to-one onto UTF-16 code units.
    if (IsASCII(data[data_cursor])) {
      data_cursor++;
      utf16_length++;
      continue;
    }

    seen_non_ascii = true;
    const size_t utf8_sequence_length =
        InlineUtf8SequenceLengthNonAscii(data[data_cursor]);
    // Truncated sequence at the end of the input.
    if (data_end - data_cursor < utf8_sequence_length) {
      return 0;
    }
    // A zero-length slice (invalid lead byte) is rejected here as well.
    if (!IsLegalUtf8(data.subspan(data_cursor, utf8_sequence_length))) {
      return 0;
    }

    const UChar32 character =
        ReadUtf8Sequence(data.subspan(data_cursor), utf8_sequence_length);
    DCHECK(!IsASCII(character));
    data_cursor += utf8_sequence_length;

    if (character > 0xff) {
      seen_non_latin1 = true;
    }

    if (U_IS_BMP(character)) {
      // UTF-16 surrogate values are illegal in UTF-32
      if (U_IS_SURROGATE(character)) {
        return 0;
      }
      utf16_length++;
    } else if (U_IS_SUPPLEMENTARY(character)) {
      // Supplementary code points take a surrogate pair.
      utf16_length += 2;
    } else {
      return 0;
    }
  }
  return utf16_length;
}
} // namespace blink::unicode