blob: c7beb0f3bb778b78dac7f56133e2d644e450a138 [file] [log] [blame]
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "config.h"
#include "platform/fonts/ScriptRunIterator.h"
#include "platform/Logging.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "wtf/Assertions.h"
#include "wtf/Threading.h"
#include "wtf/text/WTFString.h"
#include <string>
namespace blink {
struct TestRun {
std::string text;
UScriptCode code;
};
struct ExpectedRun {
unsigned limit;
UScriptCode code;
ExpectedRun(unsigned the_limit, UScriptCode the_code)
: limit(the_limit)
, code(the_code)
{
}
};
class MockScriptData : public ScriptData {
public:
~MockScriptData() override {}
static const MockScriptData* instance()
{
AtomicallyInitializedStaticReference(const MockScriptData, mockScriptData, (new MockScriptData()));
return &mockScriptData;
}
void getScripts(UChar32 ch, Vector<UScriptCode>& dst) const override
{
ASSERT(ch >= kMockCharMin);
ASSERT(ch < kMockCharLimit);
int code = ch - kMockCharMin;
dst.clear();
switch (code & kCodeSpecialMask) {
case kCodeSpecialCommon:
dst.append(USCRIPT_COMMON);
break;
case kCodeSpecialInherited:
dst.append(USCRIPT_INHERITED);
break;
default:
break;
}
int listBits = kTable[code & kCodeListIndexMask];
if (dst.isEmpty() && listBits == 0) {
dst.append(USCRIPT_UNKNOWN);
return;
}
while (listBits) {
switch (listBits & kListMask) {
case 0:
break;
case kLatin:
dst.append(USCRIPT_LATIN);
break;
case kHan:
dst.append(USCRIPT_HAN);
break;
case kGreek:
dst.append(USCRIPT_GREEK);
break;
}
listBits >>= kListShift;
}
}
UChar32 getPairedBracket(UChar32 ch) const override
{
switch (getPairedBracketType(ch)) {
case PairedBracketType::BracketTypeClose:
return ch - kBracketDelta;
case PairedBracketType::BracketTypeOpen:
return ch + kBracketDelta;
default:
return ch;
}
}
PairedBracketType getPairedBracketType(UChar32 ch) const override
{
ASSERT(ch >= kMockCharMin && ch < kMockCharLimit);
int code = ch - kMockCharMin;
if ((code & kCodeBracketBit) == 0) {
return PairedBracketType::BracketTypeNone;
}
if (code & kCodeBracketCloseBit) {
return PairedBracketType::BracketTypeClose;
}
return PairedBracketType::BracketTypeOpen;
}
static int TableLookup(int value)
{
for (int i = 0; i < 16; ++i) {
if (kTable[i] == value) {
return i;
}
}
WTF_LOG_ERROR("Table does not contain value 0x%x", value);
return 0;
}
static String ToTestString(const std::string& input)
{
String result(String::make16BitFrom8BitSource(0, 0));
bool inSet = false;
int seen = 0;
int code = 0;
int list = 0;
int currentShift = 0;
for (char c : input) {
if (inSet) {
switch (c) {
case '(':
ASSERT(seen == 0);
seen |= kSawBracket;
code |= kCodeBracketBit;
break;
case '[':
ASSERT(seen == 0);
seen |= kSawBracket;
code |= kCodeBracketBit | kCodeSquareBracketBit;
break;
case ')':
ASSERT(seen == 0);
seen |= kSawBracket;
code |= kCodeBracketBit | kCodeBracketCloseBit;
break;
case ']':
ASSERT(seen == 0);
seen |= kSawBracket;
code |= kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit;
break;
case 'i':
ASSERT(seen == 0); // brackets can't be inherited
seen |= kSawSpecial;
code |= kCodeSpecialInherited;
break;
case 'c':
ASSERT((seen & ~kSawBracket) == 0);
seen |= kSawSpecial;
code |= kCodeSpecialCommon;
break;
case 'l':
ASSERT((seen & kSawLatin) == 0);
ASSERT(currentShift < 3);
seen |= kSawLatin;
list |= kLatin << (2 * currentShift++);
break;
case 'h':
ASSERT((seen & kSawHan) == 0);
ASSERT(currentShift < 3);
seen |= kSawHan;
list |= kHan << (2 * currentShift++);
break;
case 'g':
ASSERT((seen & kSawGreek) == 0);
ASSERT(currentShift < 3);
seen |= kSawGreek;
list |= kGreek << (2 * currentShift++);
break;
case '>':
ASSERT(seen != 0);
code |= TableLookup(list);
result.append(static_cast<UChar>(kMockCharMin + code));
inSet = false;
break;
default:
WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
break;
}
continue;
}
// not in set
switch (c) {
case '<':
seen = 0;
code = 0;
list = 0;
currentShift = 0;
inSet = true;
break;
case '(':
code = kCodeBracketBit | kCodeSpecialCommon;
break;
case '[':
code = kCodeBracketBit | kCodeSquareBracketBit | kCodeSpecialCommon;
break;
case ')':
code = kCodeBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon;
break;
case ']':
code = kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon;
break;
case 'i':
code = kCodeSpecialInherited;
break;
case 'c':
code = kCodeSpecialCommon;
break;
case 'l':
code = kLatin;
break;
case 'h':
code = kHan;
break;
case 'g':
code = kGreek;
break;
case '?':
code = 0; // unknown
break;
default:
WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
}
if (!inSet) {
result.append(static_cast<UChar>(kMockCharMin + code));
}
}
return result;
}
// We determine properties based on the offset from kMockCharMin:
// bits 0-3 represent the list of l, h, c scripts (index into table)
// bit 4-5 means: 0 plain, 1 common, 2 inherited, 3 illegal
// bit 6 clear means non-bracket, open means bracket
// bit 7 clear means open bracket, set means close bracket
// bit 8 clear means paren, set means bracket
// if it's a bracket, the matching bracket is 64 code points away
static const UChar32 kMockCharMin = 0xe000;
static const UChar32 kMockCharLimit = kMockCharMin + 0x200;
static const int kLatin = 1;
static const int kHan = 2;
static const int kGreek = 3;
static const int kCodeListIndexMask = 0xf;
static const int kCodeSpecialMask = 0x30;
static const int kCodeSpecialCommon = 0x10;
static const int kCodeSpecialInherited = 0x20;
static const int kCodeBracketCloseBit = 0x40;
static const int kCodeBracketBit = 0x80;
static const int kCodeSquareBracketBit = 0x100;
static const int kListShift = 2;
static const int kListMask = 0x3;
static const int kBracketDelta = kCodeBracketCloseBit;
static const int kTable[16];
static const int kSawBracket = 0x1;
static const int kSawSpecial = 0x2;
static const int kSawLatin = 0x4;
static const int kSawHan = 0x8;
static const int kSawGreek = 0x10;
};
static const int kLatin2 = MockScriptData::kLatin << 2;
static const int kHan2 = MockScriptData::kHan << 2;
static const int kGreek2 = MockScriptData::kGreek << 2;
static const int kLatin3 = MockScriptData::kLatin << 4;
static const int kHan3 = MockScriptData::kHan << 4;
static const int kGreek3 = MockScriptData::kGreek << 4;
const int MockScriptData::kTable[] = {
0, kLatin, kHan, kGreek,
kLatin2 + kHan, kLatin2 + kGreek,
kHan2 + kLatin, kHan2 + kGreek,
kGreek2 + kLatin, kGreek2 + kHan,
kLatin3 + kHan2 + kGreek, kLatin3 + kGreek2 + kHan,
kHan3 + kLatin2 + kGreek, kHan3 + kGreek2 + kLatin,
kGreek3 + kLatin2 + kHan, kGreek3 + kHan2 + kLatin,
};
class ScriptRunIteratorTest : public testing::Test {
protected:
void CheckRuns(const Vector<TestRun>& runs)
{
String text(String::make16BitFrom8BitSource(0, 0));
Vector<ExpectedRun> expect;
for (auto& run : runs) {
text.append(String::fromUTF8(run.text.c_str()));
expect.append(ExpectedRun(text.length(), run.code));
}
ScriptRunIterator scriptRunIterator(text.characters16(), text.length());
VerifyRuns(&scriptRunIterator, expect);
}
// FIXME crbug.com/527329 - CheckMockRuns should be replaced by finding
// suitable equivalent real codepoint sequences instead.
void CheckMockRuns(const Vector<TestRun>& runs)
{
String text(String::make16BitFrom8BitSource(0, 0));
Vector<ExpectedRun> expect;
for (const TestRun& run : runs) {
text.append(MockScriptData::ToTestString(run.text));
expect.append(ExpectedRun(text.length(), run.code));
}
ScriptRunIterator scriptRunIterator(text.characters16(), text.length(),
MockScriptData::instance());
VerifyRuns(&scriptRunIterator, expect);
}
void VerifyRuns(ScriptRunIterator* scriptRunIterator,
const Vector<ExpectedRun>& expect)
{
unsigned limit;
UScriptCode code;
unsigned long runCount = 0;
while (scriptRunIterator->consume(limit, code)) {
ASSERT_LT(runCount, expect.size());
ASSERT_EQ(expect[runCount].limit, limit);
ASSERT_EQ(expect[runCount].code, code);
++runCount;
}
WTF_LOG_ERROR("Expected %zu runs, got %lu ", expect.size(), runCount);
ASSERT_EQ(expect.size(), runCount);
}
};
TEST_F(ScriptRunIteratorTest, Empty)
{
String empty(String::make16BitFrom8BitSource(0, 0));
ScriptRunIterator scriptRunIterator(empty.characters16(), empty.length());
unsigned limit = 0;
UScriptCode code = USCRIPT_INVALID_CODE;
ASSERT(!scriptRunIterator.consume(limit, code));
ASSERT_EQ(limit, 0u);
ASSERT_EQ(code, USCRIPT_INVALID_CODE);
}
// Some of our compilers cannot initialize a vector from an array yet.
#define DECLARE_RUNSVECTOR(...) \
static const TestRun runsArray[] = __VA_ARGS__; \
Vector<TestRun> runs; \
runs.append(runsArray, sizeof(runsArray) / sizeof(*runsArray));
#define CHECK_RUNS(...) \
DECLARE_RUNSVECTOR(__VA_ARGS__); \
CheckRuns(runs);
#define CHECK_MOCK_RUNS(...) \
DECLARE_RUNSVECTOR(__VA_ARGS__); \
CheckMockRuns(runs);
TEST_F(ScriptRunIteratorTest, Whitespace)
{
CHECK_RUNS({ { " \t ", USCRIPT_COMMON } });
}
TEST_F(ScriptRunIteratorTest, Common)
{
CHECK_RUNS({ { " ... !?", USCRIPT_COMMON } });
}
TEST_F(ScriptRunIteratorTest, Latin)
{
CHECK_RUNS({ { "latin", USCRIPT_LATIN } });
}
TEST_F(ScriptRunIteratorTest, Chinese)
{
CHECK_RUNS({ { "萬國碼", USCRIPT_HAN } });
}
// Close bracket without matching open is ignored
TEST_F(ScriptRunIteratorTest, UnbalancedParens1)
{
CHECK_RUNS({ { "(萬", USCRIPT_HAN },
{ "a]", USCRIPT_LATIN },
{ ")", USCRIPT_HAN } });
}
// Open bracket without matching close is popped when inside
// matching close brackets, so doesn't match later close.
TEST_F(ScriptRunIteratorTest, UnbalancedParens2)
{
CHECK_RUNS({ { "(萬", USCRIPT_HAN },
{ "a[", USCRIPT_LATIN },
{ ")]", USCRIPT_HAN } });
}
// space goes with leading script
TEST_F(ScriptRunIteratorTest, LatinHan)
{
CHECK_RUNS({ { "Unicode ", USCRIPT_LATIN },
{ "萬國碼", USCRIPT_HAN } });
}
// space goes with leading script
TEST_F(ScriptRunIteratorTest, HanLatin)
{
CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
{ "Unicode", USCRIPT_LATIN } });
}
TEST_F(ScriptRunIteratorTest, ParenEmptyParen)
{
CHECK_RUNS({ { "()", USCRIPT_COMMON } });
}
TEST_F(ScriptRunIteratorTest, ParenChineseParen)
{
CHECK_RUNS({ { "(萬國碼)", USCRIPT_HAN } });
}
TEST_F(ScriptRunIteratorTest, ParenLatinParen)
{
CHECK_RUNS({ { "(Unicode)", USCRIPT_LATIN } });
}
// open paren gets leading script
TEST_F(ScriptRunIteratorTest, LatinParenChineseParen)
{
CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
{ "萬國碼", USCRIPT_HAN },
{ ")", USCRIPT_LATIN } });
}
// open paren gets first trailing script if no leading script
TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin)
{
CHECK_RUNS({ { "(萬國碼) ", USCRIPT_HAN },
{ "Unicode", USCRIPT_LATIN } });
}
// leading common and open paren get first trailing script.
// TODO(dougfelt): we don't do quote matching, but probably should figure out
// something better then doing nothing.
TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote)
{
CHECK_RUNS({ { "\"(萬國碼) ", USCRIPT_HAN },
{ "Unicode\"", USCRIPT_LATIN } });
}
// Emojies are resolved to the leading script.
TEST_F(ScriptRunIteratorTest, EmojiCommon)
{
CHECK_RUNS({ { "百家姓🌱🌲🌳🌴", USCRIPT_HAN } });
}
// Unmatched close brace gets leading context
TEST_F(ScriptRunIteratorTest, UnmatchedClose)
{
CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
{ "萬國碼] ", USCRIPT_HAN },
{ ") Unicode\"", USCRIPT_LATIN } });
}
// Match up to 32 bracket pairs
TEST_F(ScriptRunIteratorTest, Match32Brackets)
{
CHECK_RUNS({ { "[萬國碼 ", USCRIPT_HAN },
{ "Unicode (((((((((((((((((((((((((((((((!"
")))))))))))))))))))))))))))))))",
USCRIPT_LATIN },
{ "]", USCRIPT_HAN } });
}
// Matches 32 most recent bracket pairs. More than that, and we revert to
// surrounding script.
TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets)
{
CHECK_RUNS({ { "((([萬國碼 ", USCRIPT_HAN },
{ "Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN },
{ "萬國碼!", USCRIPT_HAN },
{ ")))))))))))))))))))))))))))))))", USCRIPT_LATIN },
{ "]", USCRIPT_HAN },
{ "But )))", USCRIPT_LATIN } });
}
// A char with multiple scripts that match both leading and trailing context
// gets the leading context.
TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext)
{
CHECK_MOCK_RUNS({ { "h<lh>", USCRIPT_HAN },
{ "l", USCRIPT_LATIN } });
}
// A char with multiple scripts that only match trailing context gets the
// trailing context.
TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext)
{
CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
{ "<gl>l", USCRIPT_LATIN } });
}
// Retain first established priority script. <lhg><gh> produce the script <gh>
// with g as priority, because of the two priority scripts l and g, only g
// remains. Then <gh><hgl> retains g as priority, because of the two priority
// scripts g and h that remain, g was encountered first.
TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript)
{
CHECK_MOCK_RUNS({ { "<lhg><gh><hgl>", USCRIPT_GREEK } });
}
// Parens can have scripts that break script runs.
TEST_F(ScriptRunIteratorTest, ExtensionsParens)
{
CHECK_MOCK_RUNS({ { "<gl><(lg>", USCRIPT_GREEK },
{ "h<[hl>", USCRIPT_HAN },
{ "l", USCRIPT_LATIN },
{ "<]hl>", USCRIPT_HAN },
{ "<)lg>", USCRIPT_GREEK } });
}
// The close paren might be encountered before we've established the open
// paren's script, but when this is the case the current set is still valid, so
// this doesn't affect it nor break the run.
TEST_F(ScriptRunIteratorTest, ExtensionsParens2)
{
CHECK_MOCK_RUNS({ { "<(lhg><gh><)lhg>", USCRIPT_GREEK } });
}
// A common script with a single extension should be treated as common, but
// with the extended script as a default. If we encounter anything other than
// common, that takes priority. If we encounter other common scripts with a
// single extension, the current priority remains.
TEST_F(ScriptRunIteratorTest, CommonWithPriority)
{
CHECK_MOCK_RUNS({ { "<ch>", USCRIPT_HAN } });
}
TEST_F(ScriptRunIteratorTest, CommonWithPriority2)
{
CHECK_MOCK_RUNS({ { "<ch><lh>", USCRIPT_LATIN } });
}
TEST_F(ScriptRunIteratorTest, CommonWithPriority3)
{
CHECK_MOCK_RUNS({ { "<ch><cl><cg>", USCRIPT_HAN } });
}
// UDatta (\xE0\xA5\x91) is inherited with LATIN and DEVANAGARI extensions.
// Since it has LATIN, and the dotted circle (\xE2\x97\x8C) is COMMON and has
// adopted the preceding LATIN, it gets the LATIN. This is standard.
TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta)
{
CHECK_RUNS({ { "Latin \xE2\x97\x8C\xE0\xA5\x91", USCRIPT_LATIN } });
}
// In this situation, UDatta (\xE0\xA5\x91) doesn't share a script with the
// value inherited by the dotted circle (\xE2\x97\x8C). It captures the
// preceding dotted circle and breaks it from the run it would normally have
// been in.
TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta)
{
CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
{ "\xE2\x97\x8C\xE0\xA5\x91", USCRIPT_DEVANAGARI } });
}
// Tatweel is \xD9\x80 Lm, Fathatan is \xD9\x8B Mn. The script of tatweel is
// common, that of Fathatan is inherited. The script extensions for Fathatan
// are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the
// preferred script for Fathatan is Arabic, according to Behdad's
// heuristic. This is exactly analogous to the Udatta tests above, except
// Tatweel is Lm. But we don't take properties into account, only scripts.
TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan)
{
CHECK_RUNS({ { "Latin ", USCRIPT_LATIN },
{ "\xD9\x80\xD9\x8B", USCRIPT_ARABIC } });
}
// Another case where if the mark accepts a script that was inherited by the
// preceding common-script character, they both continue in that script.
// SYRIAC LETTER NUN \xDC\xA2
// ARABIC TATWEEL \xD9\x80
// ARABIC FATHATAN \xD9\x82
TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan)
{
CHECK_RUNS({ { "\xDC\xA2\xD9\x80\xD9\x8B", USCRIPT_SYRIAC } });
}
// The Udatta (\xE0\xA5\x91) is inherited, so will share runs with anything that
// is not common.
TEST_F(ScriptRunIteratorTest, HanUdatta)
{
CHECK_RUNS({ { "萬國碼\xE0\xA5\x91", USCRIPT_HAN } });
}
// The Udatta (\xE0\xA5\x91) is inherited, and will capture the space and turn
// it into Devanagari.
TEST_F(ScriptRunIteratorTest, HanSpaceUdatta)
{
CHECK_RUNS({ { "萬國碼", USCRIPT_HAN },
{ " \xE0\xA5\x91", USCRIPT_DEVANAGARI } });
}
// Corresponds to one test in RunSegmenter, where orientation of the
// space character is sidesways in vertical.
TEST_F(ScriptRunIteratorTest, Hangul)
{
CHECK_RUNS({ { "키스의 고유조건은", USCRIPT_HANGUL } });
}
// Corresponds to one test in RunSegmenter, which tests that the punctuation
// characters mixed in are actually sideways in vertical. The ScriptIterator
// should report one run, but the RunSegmenter should report three, with the
// middle one rotated sideways.
TEST_F(ScriptRunIteratorTest, HiraganaMixedPunctuation)
{
CHECK_RUNS({ { "いろはに.…¡ほへと", USCRIPT_HIRAGANA } });
}
// Make sure Mock code works too.
TEST_F(ScriptRunIteratorTest, MockHanInheritedGL)
{
CHECK_MOCK_RUNS({ { "h<igl>", USCRIPT_HAN } });
}
TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL)
{
CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
{ "c<igl>", USCRIPT_GREEK } });
}
// Leading inherited just act like common, except there's no preferred script.
TEST_F(ScriptRunIteratorTest, MockLeadingInherited)
{
CHECK_MOCK_RUNS({ { "<igl>", USCRIPT_COMMON } });
}
// Leading inherited just act like common, except there's no preferred script.
TEST_F(ScriptRunIteratorTest, MockLeadingInherited2)
{
CHECK_MOCK_RUNS({ { "<igl><ih>", USCRIPT_COMMON } });
}
TEST_F(ScriptRunIteratorTest, LeadingInheritedHan)
{
// DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91
CHECK_RUNS({ { "\xE0\xA5\x91萬國碼", USCRIPT_HAN } });
}
TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2)
{
// DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91
// ARABIC FATHATAN \xD9\x8B
CHECK_RUNS({ { "\xE0\xA5\x91\xD9\x8B萬國碼", USCRIPT_HAN } });
}
TEST_F(ScriptRunIteratorTest, OddLatinString)
{
CHECK_RUNS({ { "ç̈", USCRIPT_LATIN } });
}
TEST_F(ScriptRunIteratorTest, CommonMalayalam)
{
CHECK_RUNS({ { "100-ാം", USCRIPT_MALAYALAM } });
}
class ScriptRunIteratorICUDataTest : public testing::Test {
public:
ScriptRunIteratorICUDataTest()
: m_maxExtensions(0)
, m_maxExtensionsCodepoint(0xffff)
{
int maxExtensions = 0;
UChar32 m_maxExtensionscp = 0;
for (UChar32 cp = 0; cp < 0x11000; ++cp) {
UErrorCode status = U_ZERO_ERROR;
int count = uscript_getScriptExtensions(cp, 0, 0, &status);
if (count > maxExtensions) {
maxExtensions = count;
m_maxExtensionscp = cp;
}
}
m_maxExtensions = maxExtensions;
m_maxExtensionsCodepoint = m_maxExtensionscp;
}
protected:
UChar32 GetACharWithMaxExtensions(int* numExtensions)
{
if (numExtensions) {
*numExtensions = m_maxExtensions;
}
return m_maxExtensionsCodepoint;
}
private:
int m_maxExtensions;
UChar32 m_maxExtensionsCodepoint;
};
// Validate that ICU never returns more than our maximum expected number of
// script extensions.
TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions)
{
int maxExtensions;
UChar32 cp = GetACharWithMaxExtensions(&maxExtensions);
ASSERT_LE(maxExtensions, ScriptData::kMaxScriptCount)
<< "char " << std::hex << cp << std::dec;
}
// Check that ICUScriptData returns all of a character's scripts.
// This only checks one likely character, but doesn't check all cases.
TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions)
{
int maxExtensions;
UChar32 cp = GetACharWithMaxExtensions(&maxExtensions);
Vector<UScriptCode> extensions;
ICUScriptData::instance()->getScripts(cp, extensions);
// It's possible that GetScripts adds the primary script to the list of
// extensions, resulting in one more script than the raw extension count.
ASSERT_GE(static_cast<int>(extensions.size()), maxExtensions)
<< "char " << std::hex << cp << std::dec;
}
TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension)
{
Vector<UScriptCode> extensions;
for (UChar32 cp = 0; cp < 0x110000; ++cp) {
ICUScriptData::instance()->getScripts(cp, extensions);
UScriptCode primary = extensions.at(0);
if (primary == USCRIPT_COMMON) {
ASSERT_LE(extensions.size(), 2ul)
<< "cp: " << std::hex << cp << std::dec;
}
}
}
// ZWJ is \u200D Cf (Format, other) and its script is inherited. I'm going to
// ignore this for now, as I think it shouldn't matter which run it ends up
// in. HarfBuzz needs to be able to use it as context and shape each
// neighboring character appropriately no matter what run it got assigned to.
} // namespace blink