// blob: 544a3ea5a959d7980ea8f7964f00310463fc23a5 [file] [log] [blame] [edit]
#include "simdutf.h"

#include <array>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>

#include <tests/helpers/transcode_test_base.h>
#include <tests/helpers/random_int.h>
#include <tests/helpers/test.h>
#include <tests/helpers/random_utf8.h>
namespace {
  // Sizes handed to transcode_utf8_to_utf16_test_base by the randomized
  // convert_* tests in this file.
  std::array<size_t, 9> input_size = {7, 12, 16, 64, 67, 128, 256, 511, 1000};

  // Number of rounds each randomized convert_* test runs.
  constexpr size_t trials = 10000;

  using simdutf::tests::helpers::transcode_utf8_to_utf16_test_base;
}
#include "reference/validate_utf8.h"
// Cross-checks the validation performed by convert_utf8_to_utf16le against the
// reference validator (simdutf::tests::reference::validate_utf8) on randomly
// corrupted input.
//
// For each of `total` generated UTF-8 buffers the pristine buffer must convert
// (non-zero return). The buffer is then corrupted cumulatively, one random
// single-bit flip at a time; after every flip the converter must report
// success exactly when the reference validator accepts the bytes.
TEST(convert_check_validation) {
  std::fflush(nullptr);
  const uint32_t seed{1234};
  simdutf::tests::helpers::random_utf8 gen_1_2_3_4(seed, 1, 1, 1, 1);
  const size_t total = 1000;
  const size_t flips_per_input = 1000;
  for (size_t i = 0; i < total; i++) {
    auto UTF8 = gen_1_2_3_4.generate(std::rand() % 256);
    if (UTF8.size() == 0) {
      // Nothing to corrupt, and `% UTF8.size()` below would divide by zero.
      continue;
    }
    // One output slot per input byte is reserved for the conversion result.
    std::unique_ptr<char16_t[]> buffer(new char16_t[UTF8.size()]);
    const char *bytes = reinterpret_cast<const char *>(UTF8.data());
    ASSERT_TRUE(implementation.convert_utf8_to_utf16le(bytes, UTF8.size(), buffer.get()) > 0);
    for (size_t flip = 0; flip < flips_per_input; ++flip) {
      // Keep hacking the same buffer: pick the bit first, then the position
      // (same order of rand() calls as before).
      const int bitflip{1 << (std::rand() % 8)};
      const size_t position = std::rand() % UTF8.size();
      // XOR so that exactly one bit of the existing byte changes; a plain
      // assignment would replace the whole byte with the mask.
      UTF8[position] = uint8_t(UTF8[position] ^ bitflip);
      const bool is_ok =
          implementation.convert_utf8_to_utf16le(bytes, UTF8.size(), buffer.get()) > 0;
      const bool is_ok_reference =
          simdutf::tests::reference::validate_utf8(bytes, UTF8.size());
      ASSERT_TRUE(is_ok == is_ok_reference);
    }
  }
}
TEST(convert_check_validation_examples) {
const char *goodsequences[] = {"a",
"\xc3\xb1",
"\xe2\x82\xa1",
"\xf0\x90\x8c\xbc",
"\xc2\x80", // 6.7.2
"\xf0\x90\x80\x80", // 6.7.4
"\xee\x80\x80", // 6.11.2
"\xef\xbb\xbf"};
const char *badsequences[] = {
"\xc3\x28", // 0
"\xa0\xa1", // 1
"\xe2\x28\xa1", // 2
"\xe2\x82\x28", // 3
"\xf0\x28\x8c\xbc", // 4
"\xf0\x90\x28\xbc", // 5
"\xf0\x28\x8c\x28", // 6
"\xc0\x9f", // 7
"\xf5\xff\xff\xff", // 8
"\xed\xa0\x81", // 9
"\xf8\x90\x80\x80\x80", // 10
"123456789012345\xed", // 11
"123456789012345\xf1", // 12
"123456789012345\xc2", // 13
"\xC2\x7F", // 14
"\xce", // 6.6.1
"\xce\xba\xe1", // 6.6.3
"\xce\xba\xe1\xbd", // 6.6.4
"\xce\xba\xe1\xbd\xb9\xcf", // 6.6.6
"\xce\xba\xe1\xbd\xb9\xcf\x83\xce", // 6.6.8
"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce", // 6.6.10
"\xdf", // 6.14.6
"\xef\xbf", // 6.14.7
"\x80",
"\x91\x85\x95\x9e",
"\x6c\x02\x8e\x18",
"\x25\x5b\x6e\x2c\x32\x2c\x5b\x5b\x33\x2c\x34\x2c\x05\x29\x2c\x33\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5d\x2c\x35\x2e\x33\x2c\x39\x2e\x33\x2c\x37\x2e\x33\x2c\x39\x2e\x34\x2c\x37\x2e\x33\x2c\x39\x2e\x33\x2c\x37\x2e\x33\x2c\x39\x2e\x34\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x20\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x23\x0a\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x7e\x7e\x0a\x0a\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5d\x2c\x37\x2e\x33\x2c\x39\x2e\x33\x2c\x37\x2e\x33\x2c\x39\x2e\x34\x2c\x37\x2e\x33\x2c\x39\x2e\x33\x2c\x37\x2e\x33\x2c\x39\x2e\x34\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x5d\x01\x01\x80\x01\x01\x01\x79\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01",
"[[[[[[[[[[[[[[[\x80\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x010\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01",
"\x20\x0b\x01\x01\x01\x64\x3a\x64\x3a\x64\x3a\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x5b\x30\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x80\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"};
for (size_t i = 0; i < sizeof(goodsequences)/sizeof(goodsequences[0]); i++) {
size_t len = std::strlen(goodsequences[i]);
std::unique_ptr<char16_t[]> buffer(new char16_t[len]);
ASSERT_TRUE(implementation.convert_utf8_to_utf16le(goodsequences[i], len, buffer.get()) > 0);
}
for (size_t i = 0; i < sizeof(badsequences)/sizeof(badsequences[0]); i++) {
size_t len = std::strlen(badsequences[i]);
std::unique_ptr<char16_t[]> buffer(new char16_t[len]);
ASSERT_TRUE(implementation.convert_utf8_to_utf16le(badsequences[i], len, buffer.get()) == 0);
}
}
// Feeds transcode_utf8_to_utf16_test_base with values restricted to
// 0x00..0x7f and requires both the conversion and the length prediction
// checks to pass for every entry of `input_size`, repeated `trials` times.
TEST(convert_pure_ASCII) {
  auto to_utf16 = [&implementation](const char* src, size_t len, char16_t* dst) -> size_t {
    return implementation.convert_utf8_to_utf16le(src, len, dst);
  };
  auto predict_length = [&implementation](const char* src, size_t len) -> size_t {
    return implementation.utf16_length_from_utf8(src, len);
  };

  for (size_t round = 0; round != trials; ++round) {
    // Progress marker every 100 rounds.
    if (round % 100 == 0) {
      std::cout << ".";
      std::cout.flush();
    }
    // Deterministic source: the counter restarts at zero each round and keeps
    // running across the different input sizes within the round.
    size_t next_value = 0;
    auto ascii_cycle = [&next_value]() -> uint32_t {
      return next_value++ & 0x7f;
    };
    for (size_t length : input_size) {
      transcode_utf8_to_utf16_test_base checker(ascii_cycle, length);
      ASSERT_TRUE(checker(to_utf16));
      ASSERT_TRUE(checker.check_size(predict_length));
    }
  }
}
// Randomized check on code points U+0000..U+07FF (1- or 2-byte UTF-8): for
// every round and every entry of `input_size`, both the conversion and the
// length prediction checks of transcode_utf8_to_utf16_test_base must pass.
TEST(convert_1_or_2_UTF8_bytes) {
  auto to_utf16 = [&implementation](const char* src, size_t len, char16_t* dst) -> size_t {
    return implementation.convert_utf8_to_utf16le(src, len, dst);
  };
  auto predict_length = [&implementation](const char* src, size_t len) -> size_t {
    return implementation.utf16_length_from_utf8(src, len);
  };

  for (size_t round = 0; round != trials; ++round) {
    // Progress marker every 100 rounds.
    if (round % 100 == 0) {
      std::cout << ".";
      std::cout.flush();
    }
    // A distinct seed per round keeps runs reproducible yet varied.
    const uint32_t seed{1234 + uint32_t(round)};
    simdutf::tests::helpers::RandomInt random(0x0000, 0x07ff, seed);
    for (size_t length : input_size) {
      transcode_utf8_to_utf16_test_base checker(random, length);
      ASSERT_TRUE(checker(to_utf16));
      ASSERT_TRUE(checker.check_size(predict_length));
    }
  }
}
// Randomized check on code points U+0000..U+FFFF minus the surrogate gap
// U+D800..U+DFFF (1-, 2- or 3-byte UTF-8): for every round and every entry of
// `input_size`, both the conversion and the length prediction checks of
// transcode_utf8_to_utf16_test_base must pass.
TEST(convert_1_or_2_or_3_UTF8_bytes) {
  auto to_utf16 = [&implementation](const char* src, size_t len, char16_t* dst) -> size_t {
    return implementation.convert_utf8_to_utf16le(src, len, dst);
  };
  auto predict_length = [&implementation](const char* src, size_t len) -> size_t {
    return implementation.utf16_length_from_utf8(src, len);
  };

  for (size_t round = 0; round != trials; ++round) {
    // Progress marker every 100 rounds.
    if (round % 100 == 0) {
      std::cout << ".";
      std::cout.flush();
    }
    // A distinct seed per round keeps runs reproducible yet varied.
    const uint32_t seed{1234 + uint32_t(round)};
    simdutf::tests::helpers::RandomIntRanges random({{0x0000, 0xd7ff},
                                                     {0xe000, 0xffff}}, seed);
    for (size_t length : input_size) {
      transcode_utf8_to_utf16_test_base checker(random, length);
      ASSERT_TRUE(checker(to_utf16));
      ASSERT_TRUE(checker.check_size(predict_length));
    }
  }
}
// Randomized check restricted to code points that need exactly three UTF-8
// bytes: U+0800..U+D7FF and U+E000..U+FFFF (the surrogate gap in between is
// not a valid scalar value). For every round and every entry of `input_size`,
// both the conversion and the length prediction checks of
// transcode_utf8_to_utf16_test_base must pass.
TEST(convert_3_UTF8_bytes) {
  for(size_t trial = 0; trial < trials; trial ++) {
    // A distinct seed per trial keeps runs reproducible yet varied.
    uint32_t seed{1234+uint32_t(trial)};
    // Progress marker every 100 trials.
    if((trial % 100) == 0) { std::cout << "."; std::cout.flush(); }
    // Both halves of the 3-byte range; sampling only the lower half would
    // leave U+E000..U+FFFF untested here.
    simdutf::tests::helpers::RandomIntRanges random({{0x0800, 0xd800-1},
                                                     {0xe000, 0xffff}}, seed);
    auto procedure = [&implementation](const char* utf8, size_t size, char16_t* utf16) -> size_t {
      return implementation.convert_utf8_to_utf16le(utf8, size, utf16);
    };
    auto size_procedure = [&implementation](const char* utf8, size_t size) -> size_t {
      return implementation.utf16_length_from_utf8(utf8, size);
    };
    for (size_t size: input_size) {
      transcode_utf8_to_utf16_test_base test(random, size);
      ASSERT_TRUE(test(procedure));
      ASSERT_TRUE(test.check_size(size_procedure));
    }
  }
}
// Randomized check on code points U+0800..U+D7FF and U+E000..U+10FFFF (3- or
// 4-byte UTF-8): for every round and every entry of `input_size`, both the
// conversion and the length prediction checks of
// transcode_utf8_to_utf16_test_base must pass.
TEST(convert_3_or_4_UTF8_bytes) {
  auto to_utf16 = [&implementation](const char* src, size_t len, char16_t* dst) -> size_t {
    return implementation.convert_utf8_to_utf16le(src, len, dst);
  };
  auto predict_length = [&implementation](const char* src, size_t len) -> size_t {
    return implementation.utf16_length_from_utf8(src, len);
  };

  for (size_t round = 0; round != trials; ++round) {
    // Progress marker every 100 rounds.
    if (round % 100 == 0) {
      std::cout << ".";
      std::cout.flush();
    }
    // A distinct seed per round keeps runs reproducible yet varied.
    const uint32_t seed{1234 + uint32_t(round)};
    simdutf::tests::helpers::RandomIntRanges random({{0x0800, 0xd800-1},
                                                     {0xe000, 0x10ffff}}, seed);
    for (size_t length : input_size) {
      transcode_utf8_to_utf16_test_base checker(random, length);
      ASSERT_TRUE(checker(to_utf16));
      ASSERT_TRUE(checker.check_size(predict_length));
    }
  }
}
// Randomized check mixing U+0000 with code points U+10000..U+10FFFF (4-byte
// UTF-8): for every round and every entry of `input_size` the conversion
// check of transcode_utf8_to_utf16_test_base must pass. Unlike the sibling
// tests, no length-prediction check is performed here.
TEST(convert_null_4_UTF8_bytes) {
  auto to_utf16 = [&implementation](const char* src, size_t len, char16_t* dst) -> size_t {
    return implementation.convert_utf8_to_utf16le(src, len, dst);
  };

  for (size_t round = 0; round != trials; ++round) {
    // Progress marker every 100 rounds.
    if (round % 100 == 0) {
      std::cout << ".";
      std::cout.flush();
    }
    // A distinct seed per round keeps runs reproducible yet varied.
    const uint32_t seed{1234 + uint32_t(round)};
    // Two ranges: the single value 0 (NUL) and the 4-byte UTF-8 range.
    simdutf::tests::helpers::RandomIntRanges random({{0x0000, 0x00000},
                                                     {0x10000, 0x10ffff}}, seed);
    for (size_t length : input_size) {
      transcode_utf8_to_utf16_test_base checker(random, length);
      ASSERT_TRUE(checker(to_utf16));
    }
  }
}
#if SIMDUTF_IS_BIG_ENDIAN
// todo: port this test for big-endian platforms.
#else
// Regression test named after issue 111: a long run of ASCII 'a' with one
// U+30B3 code unit embedded must validate as UTF-16LE, convert to UTF-8 with
// the predicted length, and convert back to the identical UTF-16 sequence.
TEST(issue111) {
  // The source file stays pure ASCII (there is no universal way to declare a
  // source encoding), so the non-ASCII character is written as an escape.
  char16_t input[] = u"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\u30b3aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
  // Number of UTF-16 code units, not counting the terminating NUL.
  size_t utf16_len = sizeof(input) / sizeof(char16_t) - 1;

  const bool valid_utf16 = implementation.validate_utf16le(input, utf16_len);
  ASSERT_TRUE(valid_utf16);

  // The predicted UTF-8 length must exceed the UTF-16 length by exactly two.
  const size_t predicted_len = implementation.utf8_length_from_utf16le(input, utf16_len);
  ASSERT_TRUE(predicted_len == 2 + utf16_len);

  // UTF-16LE -> UTF-8 into a buffer of the predicted size.
  size_t utf8_len = implementation.utf8_length_from_utf16le(input, utf16_len);
  std::unique_ptr<char[]> utf8_buffer{new char[utf8_len]};
  const size_t utf8_written =
      implementation.convert_utf16le_to_utf8(input, utf16_len, utf8_buffer.get());
  ASSERT_TRUE(utf8_written == utf8_len);

  // UTF-8 -> UTF-16LE; the result must match the original input.
  std::unique_ptr<char16_t[]> utf16_buffer{new char16_t[utf16_len]};
  const size_t utf16_written =
      implementation.convert_utf8_to_utf16le(utf8_buffer.get(), utf8_len, utf16_buffer.get());
  ASSERT_TRUE(utf16_written == utf16_len);
  const int difference =
      std::char_traits<char16_t>::compare(input, utf16_buffer.get(), utf16_len);
  ASSERT_TRUE(difference == 0);
}
#endif
// Converts the two-byte UTF-8 encoding of U+00A9 (copyright sign) and checks
// the predicted length, the reported length and the exact UTF-16LE bytes
// written to the output buffer.
TEST(special_cases) {
  const uint8_t utf8[] = {0xC2, 0xA9}; // copyright sign
  const uint8_t expected[] = {0xA9, 0x00}; // expected UTF-16LE
  // Lengths are derived from the arrays so they cannot drift from the data.
  const char *input = reinterpret_cast<const char *>(utf8);
  const size_t utf8len = sizeof(utf8);
  const size_t utf16len = implementation.utf16_length_from_utf8(input, utf8len);
  ASSERT_TRUE(utf16len == 1);
  std::unique_ptr<char16_t[]> utf16(new char16_t[utf16len]);
  const size_t utf16size = implementation.convert_utf8_to_utf16le(input, utf8len, utf16.get());
  ASSERT_TRUE(utf16size == utf16len);
  // Byte-wise comparison: the output is checked as raw little-endian bytes.
  ASSERT_TRUE(std::memcmp(utf16.get(), expected, sizeof(expected)) == 0);
}
// Program entry point: forwards the command line to simdutf::test::main and
// uses its result as the process exit status.
int main(int argc, char* argv[]) {
  const int exit_status = simdutf::test::main(argc, argv);
  return exit_status;
}