// third_party/WebKit/Source/platform/fonts/ScriptRunIteratorTest.cpp - chromium/src - Git at Google

 // Copyright 2015 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "config.h"
 #include "platform/fonts/ScriptRunIterator.h"

 #include "platform/Logging.h"
 #include "testing/gtest/include/gtest/gtest.h"
 #include "wtf/Assertions.h"
 #include "wtf/Threading.h"
 #include "wtf/text/WTFString.h"
 #include <string>

 namespace blink {

 // One input segment for CheckRuns/CheckMockRuns: a piece of text together
 // with the script code the iterator is expected to report for it.
 struct TestRun {
     // Segment text: UTF-8 for CheckRuns, mock notation for CheckMockRuns.
     std::string text;
     // Script expected for the run that ends at the end of this segment.
     UScriptCode code;
 };

 // Expected iterator output for a single run: the end offset of the run within
 // the concatenated test text, and the script reported for it.
 struct ExpectedRun {
     unsigned limit;
     UScriptCode code;

     ExpectedRun(unsigned runLimit, UScriptCode runCode) : limit(runLimit), code(runCode) {}
 };

 class MockScriptData : public ScriptData {
 public:
     ~MockScriptData() override {}

     // Returns the shared MockScriptData used by the mock-driven tests. The
     // object is created through AtomicallyInitializedStaticReference and a
     // pointer to that single static instance is handed out.
     static const MockScriptData* instance()
     {
         AtomicallyInitializedStaticReference(const MockScriptData, mockScriptData, (new MockScriptData()));

         return &mockScriptData;
     }

     // Fills |dst| with the scripts encoded in the mock character |ch|.
     // |dst| is cleared first. A common/inherited marker (bits 4-5 of the
     // offset) comes first when present, followed by the scripts of the packed
     // list selected by the low four bits. A character with neither marker nor
     // script list yields just USCRIPT_UNKNOWN.
     void getScripts(UChar32 ch, Vector<UScriptCode>& dst) const override
     {
         ASSERT(ch >= kMockCharMin);
         ASSERT(ch < kMockCharLimit);

         const int offset = ch - kMockCharMin;
         dst.clear();

         const int special = offset & kCodeSpecialMask;
         if (special == kCodeSpecialCommon) {
             dst.append(USCRIPT_COMMON);
         } else if (special == kCodeSpecialInherited) {
             dst.append(USCRIPT_INHERITED);
         }

         const int packedList = kTable[offset & kCodeListIndexMask];
         if (packedList == 0 && dst.isEmpty()) {
             dst.append(USCRIPT_UNKNOWN);
             return;
         }

         // Unpack one script id per kListShift bits, lowest bits first.
         for (int remaining = packedList; remaining != 0; remaining >>= kListShift) {
             const int scriptId = remaining & kListMask;
             if (scriptId == kLatin) {
                 dst.append(USCRIPT_LATIN);
             } else if (scriptId == kHan) {
                 dst.append(USCRIPT_HAN);
             } else if (scriptId == kGreek) {
                 dst.append(USCRIPT_GREEK);
             }
         }
     }

     // Returns the mock character paired with |ch|: kBracketDelta below a close
     // bracket, kBracketDelta above an open bracket, and |ch| itself for any
     // non-bracket character.
     UChar32 getPairedBracket(UChar32 ch) const override
     {
         const PairedBracketType bracketType = getPairedBracketType(ch);
         if (bracketType == PairedBracketType::BracketTypeClose) {
             return ch - kBracketDelta;
         }
         if (bracketType == PairedBracketType::BracketTypeOpen) {
             return ch + kBracketDelta;
         }
         return ch;
     }

     // Classifies the mock character |ch| as open bracket, close bracket or
     // non-bracket from the bracket and close bits of its offset.
     PairedBracketType getPairedBracketType(UChar32 ch) const override
     {
         ASSERT(ch >= kMockCharMin && ch < kMockCharLimit);
         const int offset = ch - kMockCharMin;
         const bool isBracket = (offset & kCodeBracketBit) != 0;
         if (!isBracket) {
             return PairedBracketType::BracketTypeNone;
         }
         return (offset & kCodeBracketCloseBit)
             ? PairedBracketType::BracketTypeClose
             : PairedBracketType::BracketTypeOpen;
     }

     // Finds the index of the kTable entry whose packed script list equals
     // |value|. Logs an error and returns 0 when no entry matches.
     static int TableLookup(int value)
     {
         int index = 0;
         while (index < 16 && kTable[index] != value) {
             ++index;
         }
         if (index < 16) {
             return index;
         }
         WTF_LOG_ERROR("Table does not contain value 0x%x", value);
         return 0;
     }

     // Converts the compact mock notation used by the CheckMockRuns tests into
     // a 16-bit string of mock characters (kMockCharMin + code).
     //
     // Outside of '<...>' each input character produces one mock character:
     //   '(' '[' ')' ']'  common-script open/close paren or square bracket
     //   'i' / 'c'        inherited / common, with no script list
     //   'l' 'h' 'g'      Latin / Han / Greek
     //   '?'              unknown
     // Inside '<...>' the characters are combined into a single mock character,
     // emitted at '>'. The ASSERTs expect an optional bracket marker first, then
     // an optional 'i' or 'c' ('i' cannot follow a bracket), then up to three
     // distinct scripts from 'l', 'h', 'g', whose order is preserved.
     //
     // Illegal characters are logged and produce no output character.
     static String ToTestString(const std::string& input)
     {
         String result(String::make16BitFrom8BitSource(0, 0));
         bool inSet = false;
         int seen = 0;
         int code = 0;
         int list = 0;
         int currentShift = 0;
         for (char c : input) {
             if (inSet) {
                 switch (c) {
                 case '(':
                     ASSERT(seen == 0);
                     seen |= kSawBracket;
                     code |= kCodeBracketBit;
                     break;
                 case '[':
                     ASSERT(seen == 0);
                     seen |= kSawBracket;
                     code |= kCodeBracketBit | kCodeSquareBracketBit;
                     break;
                 case ')':
                     ASSERT(seen == 0);
                     seen |= kSawBracket;
                     code |= kCodeBracketBit | kCodeBracketCloseBit;
                     break;
                 case ']':
                     ASSERT(seen == 0);
                     seen |= kSawBracket;
                     code |= kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit;
                     break;
                 case 'i':
                     ASSERT(seen == 0); // brackets can't be inherited
                     seen |= kSawSpecial;
                     code |= kCodeSpecialInherited;
                     break;
                 case 'c':
                     ASSERT((seen & ~kSawBracket) == 0);
                     seen |= kSawSpecial;
                     code |= kCodeSpecialCommon;
                     break;
                 case 'l':
                     ASSERT((seen & kSawLatin) == 0);
                     ASSERT(currentShift < 3);
                     seen |= kSawLatin;
                     // Each script occupies kListShift bits of the packed list.
                     list |= kLatin << (kListShift * currentShift++);
                     break;
                 case 'h':
                     ASSERT((seen & kSawHan) == 0);
                     ASSERT(currentShift < 3);
                     seen |= kSawHan;
                     list |= kHan << (kListShift * currentShift++);
                     break;
                 case 'g':
                     ASSERT((seen & kSawGreek) == 0);
                     ASSERT(currentShift < 3);
                     seen |= kSawGreek;
                     list |= kGreek << (kListShift * currentShift++);
                     break;
                 case '>':
                     ASSERT(seen != 0);
                     // The low bits of the code are the kTable index of the
                     // packed script list collected so far.
                     code |= TableLookup(list);
                     result.append(static_cast<UChar>(kMockCharMin + code));
                     inSet = false;
                     break;
                 default:
                     WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
                     break;
                 }
                 continue;
             }
             // not in set
             switch (c) {
             case '<':
                 seen = 0;
                 code = 0;
                 list = 0;
                 currentShift = 0;
                 inSet = true;
                 break;
             case '(':
                 code = kCodeBracketBit | kCodeSpecialCommon;
                 break;
             case '[':
                 code = kCodeBracketBit | kCodeSquareBracketBit | kCodeSpecialCommon;
                 break;
             case ')':
                 code = kCodeBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon;
                 break;
             case ']':
                 code = kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon;
                 break;
             case 'i':
                 code = kCodeSpecialInherited;
                 break;
             case 'c':
                 code = kCodeSpecialCommon;
                 break;
             case 'l':
                 code = kLatin;
                 break;
             case 'h':
                 code = kHan;
                 break;
             case 'g':
                 code = kGreek;
                 break;
             case '?':
                 code = 0; // unknown
                 break;
             default:
                 WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
                 // Skip the character: |code| still holds the value of the
                 // previous character and must not be emitted again.
                 continue;
             }
             // '<' only opens a set; its character is emitted at the matching '>'.
             if (!inSet) {
                 result.append(static_cast<UChar>(kMockCharMin + code));
             }
         }
         return result;
     }

     // We determine properties based on the offset from kMockCharMin:
     // bits 0-3 represent the list of l, h, g scripts (index into kTable)
     // bits 4-5 mean: 0 plain, 1 common, 2 inherited, 3 illegal
     // bit 6 clear means open bracket, set means close bracket
     // bit 7 clear means non-bracket, set means bracket
     // bit 8 clear means paren, set means square bracket
     // if it's a bracket, the matching bracket is 64 code points away
     static const UChar32 kMockCharMin = 0xe000;
     static const UChar32 kMockCharLimit = kMockCharMin + 0x200;
     // Two-bit script ids stored in the packed lists of kTable.
     static const int kLatin = 1;
     static const int kHan = 2;
     static const int kGreek = 3;
     // Masks and bits of the offset described above.
     static const int kCodeListIndexMask = 0xf;
     static const int kCodeSpecialMask = 0x30;
     static const int kCodeSpecialCommon = 0x10;
     static const int kCodeSpecialInherited = 0x20;
     static const int kCodeBracketCloseBit = 0x40;
     static const int kCodeBracketBit = 0x80;
     static const int kCodeSquareBracketBit = 0x100;
     // Width and mask of one script id inside a packed list.
     static const int kListShift = 2;
     static const int kListMask = 0x3;
     // Open and close brackets differ only in the close bit.
     static const int kBracketDelta = kCodeBracketCloseBit;
     static const int kTable[16];

     // Flags used by ToTestString to track what it has already seen inside a
     // '<...>' set.
     static const int kSawBracket = 0x1;
     static const int kSawSpecial = 0x2;
     static const int kSawLatin = 0x4;
     static const int kSawHan = 0x8;
     static const int kSawGreek = 0x10;
 };

 // Script ids shifted into the second (bits 2-3) and third (bits 4-5) slot of a
 // packed script list.
 static const int kLatin2 = MockScriptData::kLatin << 2;
 static const int kHan2 = MockScriptData::kHan << 2;
 static const int kGreek2 = MockScriptData::kGreek << 2;
 static const int kLatin3 = MockScriptData::kLatin << 4;
 static const int kHan3 = MockScriptData::kHan << 4;
 static const int kGreek3 = MockScriptData::kGreek << 4;
 // Packed script lists, indexed by the low four bits of a mock character's
 // offset: the empty list, the three single scripts, and the ordered pairs and
 // triples of distinct scripts. getScripts() reads the lowest two bits first,
 // so e.g. kLatin2 + kHan lists Han and then Latin.
 const int MockScriptData::kTable[] = {
     0, kLatin, kHan, kGreek,
     kLatin2 + kHan, kLatin2 + kGreek,
     kHan2 + kLatin, kHan2 + kGreek,
     kGreek2 + kLatin, kGreek2 + kHan,
     kLatin3 + kHan2 + kGreek, kLatin3 + kGreek2 + kHan,
     kHan3 + kLatin2 + kGreek, kHan3 + kGreek2 + kLatin,
     kGreek3 + kLatin2 + kHan, kGreek3 + kHan2 + kLatin,
 };

 // Fixture with helpers that build the test text from a list of TestRun
 // segments, run a ScriptRunIterator over it, and compare the reported runs
 // against the expected limits and script codes.
 class ScriptRunIteratorTest : public testing::Test {
 protected:
     // Concatenates the UTF-8 text of |runs| and verifies that an iterator
     // constructed without explicit script data reports one run per TestRun,
     // ending at that segment's end offset with that segment's script.
     void CheckRuns(const Vector<TestRun>& runs)
     {
         String text(String::make16BitFrom8BitSource(0, 0));
         Vector<ExpectedRun> expect;
         for (const TestRun& run : runs) {
             text.append(String::fromUTF8(run.text.c_str()));
             expect.append(ExpectedRun(text.length(), run.code));
         }
         ScriptRunIterator scriptRunIterator(text.characters16(), text.length());
         VerifyRuns(&scriptRunIterator, expect);
     }

     // FIXME crbug.com/527329 - CheckMockRuns should be replaced by finding
     // suitable equivalent real codepoint sequences instead.
     //
     // Same as CheckRuns, but each segment is written in the mock notation
     // understood by MockScriptData::ToTestString and the iterator is given
     // MockScriptData as its script data source.
     void CheckMockRuns(const Vector<TestRun>& runs)
     {
         String text(String::make16BitFrom8BitSource(0, 0));
         Vector<ExpectedRun> expect;
         for (const TestRun& run : runs) {
             text.append(MockScriptData::ToTestString(run.text));
             expect.append(ExpectedRun(text.length(), run.code));
         }

         ScriptRunIterator scriptRunIterator(text.characters16(), text.length(),
             MockScriptData::instance());
         VerifyRuns(&scriptRunIterator, expect);
     }

     // Consumes every run from |scriptRunIterator| and asserts that each one
     // matches the corresponding entry of |expect|, and that the number of
     // runs matches as well.
     void VerifyRuns(ScriptRunIterator* scriptRunIterator,
         const Vector<ExpectedRun>& expect)
     {
         unsigned limit;
         UScriptCode code;
         unsigned long runCount = 0;
         while (scriptRunIterator->consume(limit, code)) {
             ASSERT_LT(runCount, expect.size());
             ASSERT_EQ(expect[runCount].limit, limit);
             ASSERT_EQ(expect[runCount].code, code);
             ++runCount;
         }
         // Log only on a mismatch so that passing tests emit no error output.
         if (expect.size() != runCount) {
             WTF_LOG_ERROR("Expected %zu runs, got %lu ", expect.size(), runCount);
         }
         ASSERT_EQ(expect.size(), runCount);
     }
 };

 // An empty string yields no runs, and the out-parameters keep the values they
 // had before the call.
 TEST_F(ScriptRunIteratorTest, Empty)
 {
     String empty(String::make16BitFrom8BitSource(0, 0));
     ScriptRunIterator scriptRunIterator(empty.characters16(), empty.length());
     unsigned limit = 0;
     UScriptCode code = USCRIPT_INVALID_CODE;
     // Use the gtest assertion rather than WTF's ASSERT: a disabled ASSERT
     // would drop the consume() call and the test would verify nothing.
     ASSERT_FALSE(scriptRunIterator.consume(limit, code));
     ASSERT_EQ(limit, 0u);
     ASSERT_EQ(code, USCRIPT_INVALID_CODE);
 }

 // Some of our compilers cannot initialize a vector from an array yet.
 // Declares a Vector<TestRun> named |runs| in the current scope, filled from
 // the brace-enclosed initializer passed as the macro arguments.
 #define DECLARE_RUNSVECTOR(...) \
     static const TestRun runsArray[] = __VA_ARGS__; \
     Vector<TestRun> runs; \
     runs.append(runsArray, sizeof(runsArray) / sizeof(*runsArray));

 // Builds the runs vector from the arguments and passes it to CheckRuns().
 // The body is wrapped in do/while(0) so that each use is a single statement
 // with its own scope.
 #define CHECK_RUNS(...)                  \
     do {                                 \
         DECLARE_RUNSVECTOR(__VA_ARGS__); \
         CheckRuns(runs);                 \
     } while (0)

 // Same as CHECK_RUNS, but the text is mock notation and is checked with
 // CheckMockRuns().
 #define CHECK_MOCK_RUNS(...)             \
     do {                                 \
         DECLARE_RUNSVECTOR(__VA_ARGS__); \
         CheckMockRuns(runs);             \
     } while (0)

 // Text consisting only of spaces and a tab is expected to be one common run.
 TEST_F(ScriptRunIteratorTest, Whitespace)
 {
     CHECK_RUNS({ { " \t ", USCRIPT_COMMON } });
 }

 // Spaces and ASCII punctuation with no other script context are expected to
 // be one common run.
 TEST_F(ScriptRunIteratorTest, Common)
 {
     CHECK_RUNS({ { " ... !?", USCRIPT_COMMON } });
 }

 // Plain ASCII letters are expected to be one Latin run.
 TEST_F(ScriptRunIteratorTest, Latin)
 {
     CHECK_RUNS({ { "latin", USCRIPT_LATIN } });
 }

 // Han-only text is expected to be one Han run.
 TEST_F(ScriptRunIteratorTest, Chinese)
 {
     CHECK_RUNS({ { "萬國碼", USCRIPT_HAN } });
 }

 // Close bracket without matching open is ignored
 TEST_F(ScriptRunIteratorTest, UnbalancedParens1)
 {
     CHECK_RUNS({ { "(萬", USCRIPT_HAN },
         { "a]", USCRIPT_LATIN },
         { ")", USCRIPT_HAN } });
 }

 // Open bracket without matching close is popped when inside
 // matching close brackets, so doesn't match later close.
 TEST_F(ScriptRunIteratorTest, UnbalancedParens2)
 {
     CHECK_RUNS({ { "(萬", USCRIPT_HAN },
         { "a[", USCRIPT_LATIN },
         { ")]", USCRIPT_HAN } });
 }

 // space goes with leading script
 TEST_F(ScriptRunIteratorTest, LatinHan)
 {
     CHECK_RUNS({ { "Unicode ", USCRIPT_LATIN },
         { "萬國碼", USCRIPT_HAN } });
 }

 // space goes with leading script
 TEST_F(ScriptRunIteratorTest, HanLatin)
 {
     CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
         { "Unicode", USCRIPT_LATIN } });
 }

 // A bracket pair with nothing inside and no surrounding script is expected to
 // stay a single common run.
 TEST_F(ScriptRunIteratorTest, ParenEmptyParen)
 {
     CHECK_RUNS({ { "()", USCRIPT_COMMON } });
 }

 // Parentheses surrounding Han text are expected to be part of the Han run.
 TEST_F(ScriptRunIteratorTest, ParenChineseParen)
 {
     CHECK_RUNS({ { "(萬國碼)", USCRIPT_HAN } });
 }

 // Parentheses surrounding Latin text are expected to be part of the Latin run.
 TEST_F(ScriptRunIteratorTest, ParenLatinParen)
 {
     CHECK_RUNS({ { "(Unicode)", USCRIPT_LATIN } });
 }

 // open paren gets leading script
 TEST_F(ScriptRunIteratorTest, LatinParenChineseParen)
 {
     CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
         { "萬國碼", USCRIPT_HAN },
         { ")", USCRIPT_LATIN } });
 }

 // open paren gets first trailing script if no leading script
 TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin)
 {
     CHECK_RUNS({ { "(萬國碼) ", USCRIPT_HAN },
         { "Unicode", USCRIPT_LATIN } });
 }

 // leading common and open paren get first trailing script.
 // TODO(dougfelt): we don't do quote matching, but probably should figure out
 // something better than doing nothing.
 TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote)
 {
     CHECK_RUNS({ { "\"(萬國碼) ", USCRIPT_HAN },
         { "Unicode\"", USCRIPT_LATIN } });
 }

 // Emoji are resolved to the leading script.
 TEST_F(ScriptRunIteratorTest, EmojiCommon)
 {
     CHECK_RUNS({ { "百家姓🌱🌲🌳🌴", USCRIPT_HAN } });
 }

 // Unmatched close brace gets leading context
 TEST_F(ScriptRunIteratorTest, UnmatchedClose)
 {
     CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
         { "萬國碼] ", USCRIPT_HAN },
         { ") Unicode\"", USCRIPT_LATIN } });
 }

 // Match up to 32 bracket pairs
 TEST_F(ScriptRunIteratorTest, Match32Brackets)
 {
     CHECK_RUNS({ { "[萬國碼 ", USCRIPT_HAN },
         { "Unicode (((((((((((((((((((((((((((((((!"
         ")))))))))))))))))))))))))))))))",
         USCRIPT_LATIN },
         { "]", USCRIPT_HAN } });
 }

 // Matches 32 most recent bracket pairs. More than that, and we revert to
 // surrounding script.
 TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets)
 {
     CHECK_RUNS({ { "((([萬國碼 ", USCRIPT_HAN },
         { "Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN },
         { "萬國碼!", USCRIPT_HAN },
         { ")))))))))))))))))))))))))))))))", USCRIPT_LATIN },
         { "]", USCRIPT_HAN },
         { "But )))", USCRIPT_LATIN } });
 }

 // A char with multiple scripts that match both leading and trailing context
 // gets the leading context.
 TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext)
 {
     CHECK_MOCK_RUNS({ { "h<lh>", USCRIPT_HAN },
         { "l", USCRIPT_LATIN } });
 }

 // A char with multiple scripts that only match trailing context gets the
 // trailing context.
 TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext)
 {
     CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
         { "<gl>l", USCRIPT_LATIN } });
 }

 // Retain first established priority script.  <lhg><gh> produce the script <gh>
 // with g as priority, because of the two priority scripts l and g, only g
 // remains.  Then <gh><hgl> retains g as priority, because of the two priority
 // scripts g and h that remain, g was encountered first.
 TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript)
 {
     CHECK_MOCK_RUNS({ { "<lhg><gh><hgl>", USCRIPT_GREEK } });
 }

 // Parens can have scripts that break script runs.
 TEST_F(ScriptRunIteratorTest, ExtensionsParens)
 {
     CHECK_MOCK_RUNS({ { "<gl><(lg>", USCRIPT_GREEK },
         { "h<[hl>", USCRIPT_HAN },
         { "l", USCRIPT_LATIN },
         { "<]hl>", USCRIPT_HAN },
         { "<)lg>", USCRIPT_GREEK } });
 }

 // The close paren might be encountered before we've established the open
 // paren's script, but when this is the case the current set is still valid, so
 // this doesn't affect it nor break the run.
 TEST_F(ScriptRunIteratorTest, ExtensionsParens2)
 {
     CHECK_MOCK_RUNS({ { "<(lhg><gh><)lhg>", USCRIPT_GREEK } });
 }

 // A common script with a single extension should be treated as common, but
 // with the extended script as a default.  If we encounter anything other than
 // common, that takes priority.  If we encounter other common scripts with a
 // single extension, the current priority remains.
 TEST_F(ScriptRunIteratorTest, CommonWithPriority)
 {
     CHECK_MOCK_RUNS({ { "<ch>", USCRIPT_HAN } });
 }

 // A common character whose single extension is Han, followed by a non-common
 // character listing Latin then Han: the whole text is expected to be reported
 // as one Latin run.
 TEST_F(ScriptRunIteratorTest, CommonWithPriority2)
 {
     CHECK_MOCK_RUNS({ { "<ch><lh>", USCRIPT_LATIN } });
 }

 // Three common characters with the single extensions Han, Latin and Greek:
 // the text is expected to be one run reported with the first extension, Han.
 TEST_F(ScriptRunIteratorTest, CommonWithPriority3)
 {
     CHECK_MOCK_RUNS({ { "<ch><cl><cg>", USCRIPT_HAN } });
 }

 // UDatta (\xE0\xA5\x91) is inherited with LATIN and DEVANAGARI extensions.
 // Since it has LATIN, and the dotted circle (\xE2\x97\x8C) is COMMON and has
 // adopted the preceding LATIN, it gets the LATIN. This is standard.
 TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta)
 {
     CHECK_RUNS({ { "Latin \xE2\x97\x8C\xE0\xA5\x91", USCRIPT_LATIN } });
 }

 // In this situation, UDatta (\xE0\xA5\x91) doesn't share a script with the
 // value inherited by the dotted circle (\xE2\x97\x8C). It captures the
 // preceding dotted circle and breaks it from the run it would normally have
 // been in.
 TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta)
 {
     CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
         { "\xE2\x97\x8C\xE0\xA5\x91", USCRIPT_DEVANAGARI } });
 }

 // Tatweel is \xD9\x80 Lm, Fathatan is \xD9\x8B Mn. The script of tatweel is
 // common, that of Fathatan is inherited.  The script extensions for Fathatan
 // are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the
 // preferred script for Fathatan is Arabic, according to Behdad's
 // heuristic. This is exactly analogous to the Udatta tests above, except
 // Tatweel is Lm. But we don't take properties into account, only scripts.
 TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan)
 {
     CHECK_RUNS({ { "Latin ", USCRIPT_LATIN },
         { "\xD9\x80\xD9\x8B", USCRIPT_ARABIC } });
 }

 // Another case where if the mark accepts a script that was inherited by the
 // preceding common-script character, they both continue in that script.
 // SYRIAC LETTER NUN \xDC\xA2
 // ARABIC TATWEEL \xD9\x80
 // ARABIC FATHATAN \xD9\x8B
 TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan)
 {
     CHECK_RUNS({ { "\xDC\xA2\xD9\x80\xD9\x8B", USCRIPT_SYRIAC } });
 }

 // The Udatta (\xE0\xA5\x91) is inherited, so will share runs with anything that
 // is not common.
 TEST_F(ScriptRunIteratorTest, HanUdatta)
 {
     CHECK_RUNS({ { "萬國碼\xE0\xA5\x91", USCRIPT_HAN } });
 }

 // The Udatta (\xE0\xA5\x91) is inherited, and will capture the space and turn
 // it into Devanagari.
 TEST_F(ScriptRunIteratorTest, HanSpaceUdatta)
 {
     CHECK_RUNS({ { "萬國碼", USCRIPT_HAN },
         { " \xE0\xA5\x91", USCRIPT_DEVANAGARI } });
 }

 // Corresponds to one test in RunSegmenter, where orientation of the
 // space character is sideways in vertical.
 TEST_F(ScriptRunIteratorTest, Hangul)
 {
     CHECK_RUNS({ { "키스의 고유조건은", USCRIPT_HANGUL } });
 }

 // Corresponds to one test in RunSegmenter, which tests that the punctuation
 // characters mixed in are actually sideways in vertical. The ScriptIterator
 // should report one run, but the RunSegmenter should report three, with the
 // middle one rotated sideways.
 TEST_F(ScriptRunIteratorTest, HiraganaMixedPunctuation)
 {
     CHECK_RUNS({ { "いろはに.…¡ほへと", USCRIPT_HIRAGANA } });
 }

 // Make sure Mock code works too.
 TEST_F(ScriptRunIteratorTest, MockHanInheritedGL)
 {
     CHECK_MOCK_RUNS({ { "h<igl>", USCRIPT_HAN } });
 }

 // A plain common character followed by an inherited character with Greek and
 // Latin extensions is expected to split off from the preceding Han run and be
 // reported as Greek.
 TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL)
 {
     CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
         { "c<igl>", USCRIPT_GREEK } });
 }

 // Leading inherited just act like common, except there's no preferred script.
 TEST_F(ScriptRunIteratorTest, MockLeadingInherited)
 {
     CHECK_MOCK_RUNS({ { "<igl>", USCRIPT_COMMON } });
 }

 // Leading inherited just act like common, except there's no preferred script.
 TEST_F(ScriptRunIteratorTest, MockLeadingInherited2)
 {
     CHECK_MOCK_RUNS({ { "<igl><ih>", USCRIPT_COMMON } });
 }

 // A leading mark followed by Han text is expected to be a single Han run.
 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan)
 {
     // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91
     CHECK_RUNS({ { "\xE0\xA5\x91萬國碼", USCRIPT_HAN } });
 }

 // Two leading marks followed by Han text are expected to be a single Han run.
 TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2)
 {
     // DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91
     // ARABIC FATHATAN \xD9\x8B
     CHECK_RUNS({ { "\xE0\xA5\x91\xD9\x8B萬國碼", USCRIPT_HAN } });
 }

 // The accented Latin letter below (presumably a base letter plus combining
 // marks; verify the exact code points) is expected to be one Latin run.
 TEST_F(ScriptRunIteratorTest, OddLatinString)
 {
     CHECK_RUNS({ { "ç̈", USCRIPT_LATIN } });
 }

 // Leading digits and a hyphen followed by Malayalam characters are expected
 // to be reported as a single Malayalam run.
 TEST_F(ScriptRunIteratorTest, CommonMalayalam)
 {
     CHECK_RUNS({ { "100-ാം", USCRIPT_MALAYALAM } });
 }


 // Fixture that scans ICU's script-extension data at construction and
 // remembers a code point with the largest number of script extensions.
 class ScriptRunIteratorICUDataTest : public testing::Test {

 public:
     ScriptRunIteratorICUDataTest()
         : m_maxExtensions(0)
         , m_maxExtensionsCodepoint(0xffff)
     {
         int maxExtensions = 0;
         UChar32 maxExtensionsCodepoint = 0;
         // Scan the whole Unicode code space (U+0000..U+10FFFF), the same
         // range CommonHaveNoMoreThanOneExtension iterates over.
         for (UChar32 cp = 0; cp < 0x110000; ++cp) {
             UErrorCode status = U_ZERO_ERROR;
             // No output buffer is passed: only the returned count is used
             // and |status| is deliberately not inspected.
             int count = uscript_getScriptExtensions(cp, 0, 0, &status);
             if (count > maxExtensions) {
                 maxExtensions = count;
                 maxExtensionsCodepoint = cp;
             }
         }
         m_maxExtensions = maxExtensions;
         m_maxExtensionsCodepoint = maxExtensionsCodepoint;
     }

 protected:
     // Returns the first code point found with the maximum number of script
     // extensions. When |numExtensions| is non-null it receives that maximum.
     UChar32 GetACharWithMaxExtensions(int* numExtensions)
     {
         if (numExtensions) {
             *numExtensions = m_maxExtensions;
         }
         return m_maxExtensionsCodepoint;
     }

 private:
     int m_maxExtensions;
     UChar32 m_maxExtensionsCodepoint;
 };

 // Validate that ICU never returns more than our maximum expected number of
 // script extensions.
 TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions)
 {
     int maxExtensions;
     UChar32 cp = GetACharWithMaxExtensions(&maxExtensions);
     ASSERT_LE(maxExtensions, ScriptData::kMaxScriptCount)
         << "char " << std::hex << cp << std::dec;
 }

 // Check that ICUScriptData returns all of a character's scripts.
 // This only checks one likely character, but doesn't check all cases.
 TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions)
 {
     int maxExtensions;
     UChar32 cp = GetACharWithMaxExtensions(&maxExtensions);
     Vector<UScriptCode> extensions;
     ICUScriptData::instance()->getScripts(cp, extensions);

     // It's possible that GetScripts adds the primary script to the list of
     // extensions, resulting in one more script than the raw extension count.
     ASSERT_GE(static_cast<int>(extensions.size()), maxExtensions)
         << "char " << std::hex << cp << std::dec;
 }

 // For every code point, if the first script returned by ICUScriptData is
 // USCRIPT_COMMON, then at most two scripts (common plus one more) may be
 // returned.
 TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension)
 {
     Vector<UScriptCode> extensions;
     for (UChar32 cp = 0; cp < 0x110000; ++cp) {
         ICUScriptData::instance()->getScripts(cp, extensions);
         // The first entry is treated as the character's primary script.
         UScriptCode primary = extensions.at(0);
         if (primary == USCRIPT_COMMON) {
             ASSERT_LE(extensions.size(), 2ul)
                 << "cp: " << std::hex << cp << std::dec;
         }
     }
 }

 // ZWJ is \u200D Cf (Format, other) and its script is inherited.  I'm going to
 // ignore this for now, as I think it shouldn't matter which run it ends up
 // in. HarfBuzz needs to be able to use it as context and shape each
 // neighboring character appropriately no matter what run it got assigned to.

 } // namespace blink
	// Copyright 2015 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "config.h"
	#include "platform/fonts/ScriptRunIterator.h"

	#include "platform/Logging.h"
	#include "testing/gtest/include/gtest/gtest.h"
	#include "wtf/Assertions.h"
	#include "wtf/Threading.h"
	#include "wtf/text/WTFString.h"
	#include <string>

	namespace blink {

	struct TestRun {
	std::string text;
	UScriptCode code;
	};

	struct ExpectedRun {
	unsigned limit;
	UScriptCode code;

	ExpectedRun(unsigned the_limit, UScriptCode the_code)
	: limit(the_limit)
	, code(the_code)
	{
	}
	};

	class MockScriptData : public ScriptData {
	public:
	~MockScriptData() override {}

	static const MockScriptData* instance()
	{
	AtomicallyInitializedStaticReference(const MockScriptData, mockScriptData, (new MockScriptData()));

	return &mockScriptData;
	}

	void getScripts(UChar32 ch, Vector<UScriptCode>& dst) const override
	{
	ASSERT(ch >= kMockCharMin);
	ASSERT(ch < kMockCharLimit);

	int code = ch - kMockCharMin;
	dst.clear();
	switch (code & kCodeSpecialMask) {
	case kCodeSpecialCommon:
	dst.append(USCRIPT_COMMON);
	break;
	case kCodeSpecialInherited:
	dst.append(USCRIPT_INHERITED);
	break;
	default:
	break;
	}
	int listBits = kTable[code & kCodeListIndexMask];
	if (dst.isEmpty() && listBits == 0) {
	dst.append(USCRIPT_UNKNOWN);
	return;
	}
	while (listBits) {
	switch (listBits & kListMask) {
	case 0:
	break;
	case kLatin:
	dst.append(USCRIPT_LATIN);
	break;
	case kHan:
	dst.append(USCRIPT_HAN);
	break;
	case kGreek:
	dst.append(USCRIPT_GREEK);
	break;
	}
	listBits >>= kListShift;
	}
	}

	UChar32 getPairedBracket(UChar32 ch) const override
	{
	switch (getPairedBracketType(ch)) {
	case PairedBracketType::BracketTypeClose:
	return ch - kBracketDelta;
	case PairedBracketType::BracketTypeOpen:
	return ch + kBracketDelta;
	default:
	return ch;
	}
	}

	PairedBracketType getPairedBracketType(UChar32 ch) const override
	{
	ASSERT(ch >= kMockCharMin && ch < kMockCharLimit);
	int code = ch - kMockCharMin;
	if ((code & kCodeBracketBit) == 0) {
	return PairedBracketType::BracketTypeNone;
	}
	if (code & kCodeBracketCloseBit) {
	return PairedBracketType::BracketTypeClose;
	}
	return PairedBracketType::BracketTypeOpen;
	}

	static int TableLookup(int value)
	{
	for (int i = 0; i < 16; ++i) {
	if (kTable[i] == value) {
	return i;
	}
	}
	WTF_LOG_ERROR("Table does not contain value 0x%x", value);
	return 0;
	}

	// Converts the compact mock notation used by the CheckMockRuns tests into
	// a 16-bit string of mock characters (kMockCharMin + code).
	//
	// Outside of '<...>' each input character produces one mock character;
	// inside '<...>' the characters are combined into a single mock character
	// that is emitted at '>'. Illegal characters are logged and produce no
	// output character.
	static String ToTestString(const std::string& input)
	{
	    String result(String::make16BitFrom8BitSource(0, 0));
	    bool inSet = false;
	    int seen = 0;
	    int code = 0;
	    int list = 0;
	    int currentShift = 0;
	    for (char c : input) {
	        if (inSet) {
	            switch (c) {
	            case '(':
	                ASSERT(seen == 0);
	                seen |= kSawBracket;
	                code |= kCodeBracketBit;
	                break;
	            case '[':
	                ASSERT(seen == 0);
	                seen |= kSawBracket;
	                code |= kCodeBracketBit | kCodeSquareBracketBit;
	                break;
	            case ')':
	                ASSERT(seen == 0);
	                seen |= kSawBracket;
	                code |= kCodeBracketBit | kCodeBracketCloseBit;
	                break;
	            case ']':
	                ASSERT(seen == 0);
	                seen |= kSawBracket;
	                code |= kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit;
	                break;
	            case 'i':
	                ASSERT(seen == 0); // brackets can't be inherited
	                seen |= kSawSpecial;
	                code |= kCodeSpecialInherited;
	                break;
	            case 'c':
	                ASSERT((seen & ~kSawBracket) == 0);
	                seen |= kSawSpecial;
	                code |= kCodeSpecialCommon;
	                break;
	            case 'l':
	                ASSERT((seen & kSawLatin) == 0);
	                ASSERT(currentShift < 3);
	                seen |= kSawLatin;
	                // Each script occupies kListShift bits of the packed list.
	                list |= kLatin << (kListShift * currentShift++);
	                break;
	            case 'h':
	                ASSERT((seen & kSawHan) == 0);
	                ASSERT(currentShift < 3);
	                seen |= kSawHan;
	                list |= kHan << (kListShift * currentShift++);
	                break;
	            case 'g':
	                ASSERT((seen & kSawGreek) == 0);
	                ASSERT(currentShift < 3);
	                seen |= kSawGreek;
	                list |= kGreek << (kListShift * currentShift++);
	                break;
	            case '>':
	                ASSERT(seen != 0);
	                // The low bits of the code are the kTable index of the
	                // packed script list collected so far.
	                code |= TableLookup(list);
	                result.append(static_cast<UChar>(kMockCharMin + code));
	                inSet = false;
	                break;
	            default:
	                WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
	                break;
	            }
	            continue;
	        }
	        // not in set
	        switch (c) {
	        case '<':
	            seen = 0;
	            code = 0;
	            list = 0;
	            currentShift = 0;
	            inSet = true;
	            break;
	        case '(':
	            code = kCodeBracketBit | kCodeSpecialCommon;
	            break;
	        case '[':
	            code = kCodeBracketBit | kCodeSquareBracketBit | kCodeSpecialCommon;
	            break;
	        case ')':
	            code = kCodeBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon;
	            break;
	        case ']':
	            code = kCodeBracketBit | kCodeSquareBracketBit | kCodeBracketCloseBit | kCodeSpecialCommon;
	            break;
	        case 'i':
	            code = kCodeSpecialInherited;
	            break;
	        case 'c':
	            code = kCodeSpecialCommon;
	            break;
	        case 'l':
	            code = kLatin;
	            break;
	        case 'h':
	            code = kHan;
	            break;
	        case 'g':
	            code = kGreek;
	            break;
	        case '?':
	            code = 0; // unknown
	            break;
	        default:
	            WTF_LOG_ERROR("Illegal mock string set char: '%c'", c);
	            // Skip the character: |code| still holds the value of the
	            // previous character and must not be emitted again.
	            continue;
	        }
	        // '<' only opens a set; its character is emitted at the matching '>'.
	        if (!inSet) {
	            result.append(static_cast<UChar>(kMockCharMin + code));
	        }
	    }
	    return result;
	}

	// We determine properties based on the offset from kMockCharMin:
	// bits 0-3 represent the list of l, h, c scripts (index into table)
	// bit 4-5 means: 0 plain, 1 common, 2 inherited, 3 illegal
	// bit 6 clear means non-bracket, open means bracket
	// bit 7 clear means open bracket, set means close bracket
	// bit 8 clear means paren, set means bracket
	// if it's a bracket, the matching bracket is 64 code points away
	static const UChar32 kMockCharMin = 0xe000;
	static const UChar32 kMockCharLimit = kMockCharMin + 0x200;
	static const int kLatin = 1;
	static const int kHan = 2;
	static const int kGreek = 3;
	static const int kCodeListIndexMask = 0xf;
	static const int kCodeSpecialMask = 0x30;
	static const int kCodeSpecialCommon = 0x10;
	static const int kCodeSpecialInherited = 0x20;
	static const int kCodeBracketCloseBit = 0x40;
	static const int kCodeBracketBit = 0x80;
	static const int kCodeSquareBracketBit = 0x100;
	static const int kListShift = 2;
	static const int kListMask = 0x3;
	static const int kBracketDelta = kCodeBracketCloseBit;
	static const int kTable[16];

	static const int kSawBracket = 0x1;
	static const int kSawSpecial = 0x2;
	static const int kSawLatin = 0x4;
	static const int kSawHan = 0x8;
	static const int kSawGreek = 0x10;
	};

	static const int kLatin2 = MockScriptData::kLatin << 2;
	static const int kHan2 = MockScriptData::kHan << 2;
	static const int kGreek2 = MockScriptData::kGreek << 2;
	static const int kLatin3 = MockScriptData::kLatin << 4;
	static const int kHan3 = MockScriptData::kHan << 4;
	static const int kGreek3 = MockScriptData::kGreek << 4;
	const int MockScriptData::kTable[] = {
	0, kLatin, kHan, kGreek,
	kLatin2 + kHan, kLatin2 + kGreek,
	kHan2 + kLatin, kHan2 + kGreek,
	kGreek2 + kLatin, kGreek2 + kHan,
	kLatin3 + kHan2 + kGreek, kLatin3 + kGreek2 + kHan,
	kHan3 + kLatin2 + kGreek, kHan3 + kGreek2 + kLatin,
	kGreek3 + kLatin2 + kHan, kGreek3 + kHan2 + kLatin,
	};

	// Test fixture with helpers that turn a list of (text, script) pairs into one
	// 16-bit string plus the expected run boundaries, and then check that
	// ScriptRunIterator reports exactly those runs.
	class ScriptRunIteratorTest : public testing::Test {
	protected:
	    // Concatenates the UTF-8 text of every entry in |runs| and verifies that
	    // a ScriptRunIterator over the result yields one run per entry. The
	    // expected limit of each run is the string length after appending it.
	    void CheckRuns(const Vector<TestRun>& runs)
	    {
	        String text(String::make16BitFrom8BitSource(0, 0));
	        Vector<ExpectedRun> expect;
	        for (const TestRun& run : runs) {
	            text.append(String::fromUTF8(run.text.c_str()));
	            expect.append(ExpectedRun(text.length(), run.code));
	        }
	        ScriptRunIterator scriptRunIterator(text.characters16(), text.length());
	        VerifyRuns(&scriptRunIterator, expect);
	    }

	    // FIXME crbug.com/527329 - CheckMockRuns should be replaced by finding
	    // suitable equivalent real codepoint sequences instead.
	    //
	    // Same as CheckRuns(), except that each run's text is translated by
	    // MockScriptData::ToTestString() and the iterator is given the
	    // MockScriptData instance as its script data.
	    void CheckMockRuns(const Vector<TestRun>& runs)
	    {
	        String text(String::make16BitFrom8BitSource(0, 0));
	        Vector<ExpectedRun> expect;
	        for (const TestRun& run : runs) {
	            text.append(MockScriptData::ToTestString(run.text));
	            expect.append(ExpectedRun(text.length(), run.code));
	        }

	        ScriptRunIterator scriptRunIterator(text.characters16(), text.length(),
	            MockScriptData::instance());
	        VerifyRuns(&scriptRunIterator, expect);
	    }

	    // Drains |scriptRunIterator| and asserts that the sequence of
	    // (limit, script) pairs it produces equals |expect|, element by element
	    // and in total count.
	    void VerifyRuns(ScriptRunIterator* scriptRunIterator,
	        const Vector<ExpectedRun>& expect)
	    {
	        // Initialised so that no indeterminate value is ever read, even if
	        // consume() were to return true without writing an out-parameter.
	        unsigned limit = 0;
	        UScriptCode code = USCRIPT_INVALID_CODE;
	        unsigned long runCount = 0;
	        while (scriptRunIterator->consume(limit, code)) {
	            // Guard the index before using it: too many runs is a failure.
	            ASSERT_LT(runCount, expect.size());
	            ASSERT_EQ(expect[runCount].limit, limit);
	            ASSERT_EQ(expect[runCount].code, code);
	            ++runCount;
	        }
	        // Only log when the counts disagree; logging an error unconditionally
	        // made every passing test emit a spurious error message.
	        if (expect.size() != runCount) {
	            WTF_LOG_ERROR("Expected %zu runs, got %lu ", expect.size(), runCount);
	        }
	        ASSERT_EQ(expect.size(), runCount);
	    }
	};

	// An empty string must produce no runs: the first consume() call has to
	// return false and leave both out-parameters at their initial values.
	TEST_F(ScriptRunIteratorTest, Empty)
	{
	    String empty(String::make16BitFrom8BitSource(0, 0));
	    ScriptRunIterator scriptRunIterator(empty.characters16(), empty.length());
	    unsigned limit = 0;
	    UScriptCode code = USCRIPT_INVALID_CODE;
	    // Use a gtest assertion rather than WTF's ASSERT. ASSERT is a debug
	    // assertion: when assertions are disabled its argument is not evaluated,
	    // so consume() would never be called, and when enabled a failure aborts
	    // the process instead of being reported as a test failure.
	    ASSERT_FALSE(scriptRunIterator.consume(limit, code));
	    ASSERT_EQ(0u, limit);
	    ASSERT_EQ(USCRIPT_INVALID_CODE, code);
	}

	// Some of our compilers cannot initialize a vector from an array yet.
	// DECLARE_RUNSVECTOR declares, in the current scope, a static array
	// |runsArray| initialised from the braced list passed as the macro arguments
	// and a Vector<TestRun> |runs| holding a copy of its elements.
	#define DECLARE_RUNSVECTOR(...) \
	    static const TestRun runsArray[] = __VA_ARGS__; \
	    Vector<TestRun> runs; \
	    runs.append(runsArray, sizeof(runsArray) / sizeof(*runsArray));

	// Builds the runs vector from the braced list and passes it to CheckRuns().
	// Wrapped in do { } while (0) so that the expansion is a single statement
	// and |runsArray| / |runs| stay local to it; this allows the macro to be used
	// more than once in a test body and under an unbraced if/else.
	#define CHECK_RUNS(...) \
	    do { \
	        DECLARE_RUNSVECTOR(__VA_ARGS__); \
	        CheckRuns(runs); \
	    } while (0)

	// Same as CHECK_RUNS, but passes the runs to CheckMockRuns().
	#define CHECK_MOCK_RUNS(...) \
	    do { \
	        DECLARE_RUNSVECTOR(__VA_ARGS__); \
	        CheckMockRuns(runs); \
	    } while (0)

	// Text consisting only of spaces and a tab is expected to form a single
	// USCRIPT_COMMON run.
	TEST_F(ScriptRunIteratorTest, Whitespace)
	{
	CHECK_RUNS({ { " \t ", USCRIPT_COMMON } });
	}

	// Spaces and ASCII punctuation without any letters are expected to form a
	// single USCRIPT_COMMON run.
	TEST_F(ScriptRunIteratorTest, Common)
	{
	CHECK_RUNS({ { " ... !?", USCRIPT_COMMON } });
	}

	// Plain ASCII letters are expected to form a single USCRIPT_LATIN run.
	TEST_F(ScriptRunIteratorTest, Latin)
	{
	CHECK_RUNS({ { "latin", USCRIPT_LATIN } });
	}

	// Han ideographs are expected to form a single USCRIPT_HAN run.
	TEST_F(ScriptRunIteratorTest, Chinese)
	{
	CHECK_RUNS({ { "萬國碼", USCRIPT_HAN } });
	}

	// Close bracket without matching open is ignored: the stray ']' stays in the
	// LATIN run it follows, while ')' is expected to take the HAN script of the
	// '(' it pairs with.
	TEST_F(ScriptRunIteratorTest, UnbalancedParens1)
	{
	CHECK_RUNS({ { "(萬", USCRIPT_HAN },
	{ "a]", USCRIPT_LATIN },
	{ ")", USCRIPT_HAN } });
	}

	// Open bracket without matching close is popped when inside
	// matching close brackets, so doesn't match later close.
	// Here the unclosed '[' sits between '(' and ')'; the trailing ']' is
	// expected to stay in the HAN run begun by ')' instead of taking the LATIN
	// script that '[' had.
	TEST_F(ScriptRunIteratorTest, UnbalancedParens2)
	{
	CHECK_RUNS({ { "(萬", USCRIPT_HAN },
	{ "a[", USCRIPT_LATIN },
	{ ")]", USCRIPT_HAN } });
	}

	// The space goes with the leading script: it is expected to end the LATIN
	// run rather than start the HAN run.
	TEST_F(ScriptRunIteratorTest, LatinHan)
	{
	CHECK_RUNS({ { "Unicode ", USCRIPT_LATIN },
	{ "萬國碼", USCRIPT_HAN } });
	}

	// The space goes with the leading script: it is expected to end the HAN run
	// rather than start the LATIN run.
	TEST_F(ScriptRunIteratorTest, HanLatin)
	{
	CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
	{ "Unicode", USCRIPT_LATIN } });
	}

	// A pair of parentheses with no other text is expected to stay a single
	// USCRIPT_COMMON run.
	TEST_F(ScriptRunIteratorTest, ParenEmptyParen)
	{
	CHECK_RUNS({ { "()", USCRIPT_COMMON } });
	}

	// Parentheses that enclose only Han text are expected to join the HAN run,
	// giving one run for the whole string.
	TEST_F(ScriptRunIteratorTest, ParenChineseParen)
	{
	CHECK_RUNS({ { "(萬國碼)", USCRIPT_HAN } });
	}

	// Parentheses that enclose only Latin text are expected to join the LATIN
	// run, giving one run for the whole string.
	TEST_F(ScriptRunIteratorTest, ParenLatinParen)
	{
	CHECK_RUNS({ { "(Unicode)", USCRIPT_LATIN } });
	}

	// The open paren gets the leading script (LATIN), and the matching close
	// paren is expected to return to that same script after the enclosed HAN
	// text.
	TEST_F(ScriptRunIteratorTest, LatinParenChineseParen)
	{
	CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
	{ "萬國碼", USCRIPT_HAN },
	{ ")", USCRIPT_LATIN } });
	}

	// The open paren gets the first trailing script if there is no leading
	// script: "(" joins the HAN run, and so do the matching ")" and the space
	// that follow it.
	TEST_F(ScriptRunIteratorTest, ParenChineseParenLatin)
	{
	CHECK_RUNS({ { "(萬國碼) ", USCRIPT_HAN },
	{ "Unicode", USCRIPT_LATIN } });
	}

	// Leading common characters and the open paren get the first trailing
	// script. The closing quote is not paired with the opening one; it simply
	// stays in the LATIN run it follows.
	// TODO(dougfelt): we don't do quote matching, but probably should figure out
	// something better than doing nothing.
	TEST_F(ScriptRunIteratorTest, QuoteParenChineseParenLatinQuote)
	{
	CHECK_RUNS({ { "\"(萬國碼) ", USCRIPT_HAN },
	{ "Unicode\"", USCRIPT_LATIN } });
	}

	// Emoji are resolved to the leading script: the emoji after the Han text are
	// expected to stay in the HAN run.
	TEST_F(ScriptRunIteratorTest, EmojiCommon)
	{
	CHECK_RUNS({ { "百家姓🌱🌲🌳🌴", USCRIPT_HAN } });
	}

	// An unmatched close brace gets the leading context: ']' has no matching '['
	// and stays in the HAN run, while ')' pairs with the earlier '(' and is
	// expected to return to LATIN.
	TEST_F(ScriptRunIteratorTest, UnmatchedClose)
	{
	CHECK_RUNS({ { "Unicode (", USCRIPT_LATIN },
	{ "萬國碼] ", USCRIPT_HAN },
	{ ") Unicode\"", USCRIPT_LATIN } });
	}

	// Match up to 32 bracket pairs. The outer '[' ... ']' pair and all nested
	// parentheses are expected to be matched: every ')' stays LATIN like its '(',
	// and the final ']' returns to the HAN script of the opening '['.
	TEST_F(ScriptRunIteratorTest, Match32Brackets)
	{
	CHECK_RUNS({ { "[萬國碼 ", USCRIPT_HAN },
	{ "Unicode (((((((((((((((((((((((((((((((!"
	")))))))))))))))))))))))))))))))",
	USCRIPT_LATIN },
	{ "]", USCRIPT_HAN } });
	}

	// Matches 32 most recent bracket pairs. More than that, and we revert to
	// surrounding script. The three leading '(' are the oldest openers, so the
	// final ")))" is expected to take the surrounding LATIN script instead of
	// the HAN script those openers had.
	TEST_F(ScriptRunIteratorTest, Match32MostRecentBrackets)
	{
	CHECK_RUNS({ { "((([萬國碼 ", USCRIPT_HAN },
	{ "Unicode (((((((((((((((((((((((((((((((", USCRIPT_LATIN },
	{ "萬國碼!", USCRIPT_HAN },
	{ ")))))))))))))))))))))))))))))))", USCRIPT_LATIN },
	{ "]", USCRIPT_HAN },
	{ "But )))", USCRIPT_LATIN } });
	}

	// A char with multiple scripts that match both leading and trailing context
	// gets the leading context. Here the mock char <lh> sits between a Han and a
	// Latin char and is expected to stay in the HAN run.
	TEST_F(ScriptRunIteratorTest, ExtensionsPreferLeadingContext)
	{
	CHECK_MOCK_RUNS({ { "h<lh>", USCRIPT_HAN },
	{ "l", USCRIPT_LATIN } });
	}

	// A char with multiple scripts that only match trailing context gets the
	// trailing context. Here the mock char <gl> follows a Han char and is
	// expected to start the LATIN run completed by the following Latin char.
	TEST_F(ScriptRunIteratorTest, ExtensionsMatchTrailingContext)
	{
	CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
	{ "<gl>l", USCRIPT_LATIN } });
	}

	// Retain the first established priority script. <lhg><gh> produce the script
	// set <gh> with g as priority, because of the two priority scripts l and g,
	// only g remains. Then <gh><hgl> retains g as priority, because of the two
	// priority scripts g and h that remain, g was encountered first. The whole
	// string is therefore expected to be one GREEK run.
	TEST_F(ScriptRunIteratorTest, ExtensionsRetainFirstPriorityScript)
	{
	CHECK_MOCK_RUNS({ { "<lhg><gh><hgl>", USCRIPT_GREEK } });
	}

	// Parens can have scripts that break script runs. Each mock bracket below
	// carries its own script list; the closing brackets are expected to resume
	// the script of their matching opener (HAN for ']', GREEK for ')').
	TEST_F(ScriptRunIteratorTest, ExtensionsParens)
	{
	CHECK_MOCK_RUNS({ { "<gl><(lg>", USCRIPT_GREEK },
	{ "h<[hl>", USCRIPT_HAN },
	{ "l", USCRIPT_LATIN },
	{ "<]hl>", USCRIPT_HAN },
	{ "<)lg>", USCRIPT_GREEK } });
	}

	// The close paren might be encountered before we've established the open
	// paren's script, but when this is the case the current set is still valid, so
	// this doesn't affect it nor break the run. The whole string is expected to
	// be one GREEK run.
	TEST_F(ScriptRunIteratorTest, ExtensionsParens2)
	{
	CHECK_MOCK_RUNS({ { "<(lhg><gh><)lhg>", USCRIPT_GREEK } });
	}

	// A common script with a single extension should be treated as common, but
	// with the extended script as a default. If we encounter anything other than
	// common, that takes priority. If we encounter other common scripts with a
	// single extension, the current priority remains.
	// Base case: a lone common char whose single extension is Han is expected to
	// resolve to HAN.
	TEST_F(ScriptRunIteratorTest, CommonWithPriority)
	{
	CHECK_MOCK_RUNS({ { "<ch>", USCRIPT_HAN } });
	}

	// A non-common char takes priority over the default script of a preceding
	// common char: <ch> followed by <lh> is expected to give one LATIN run.
	// NOTE(review): <ch> / <lh> are presumably "common with Han extension" and
	// "Latin and Han" in MockScriptData's test-string notation - confirm against
	// MockScriptData::ToTestString().
	TEST_F(ScriptRunIteratorTest, CommonWithPriority2)
	{
	CHECK_MOCK_RUNS({ { "<ch><lh>", USCRIPT_LATIN } });
	}

	// Further common chars with a single extension do not displace the script
	// established by the first one: the run is expected to stay HAN.
	// NOTE(review): <ch>, <cl>, <cg> are presumably common chars extended with
	// Han, Latin and Greek respectively - confirm against
	// MockScriptData::ToTestString().
	TEST_F(ScriptRunIteratorTest, CommonWithPriority3)
	{
	CHECK_MOCK_RUNS({ { "<ch><cl><cg>", USCRIPT_HAN } });
	}

	// Udatta (\xE0\xA5\x91) is inherited with LATIN and DEVANAGARI extensions.
	// Since it has LATIN, and the dotted circle (\xE2\x97\x8C) is COMMON and has
	// adopted the preceding LATIN, it gets the LATIN. This is standard. The whole
	// string is expected to be one LATIN run.
	TEST_F(ScriptRunIteratorTest, LatinDottedCircleUdatta)
	{
	CHECK_RUNS({ { "Latin \xE2\x97\x8C\xE0\xA5\x91", USCRIPT_LATIN } });
	}

	// In this situation, Udatta (\xE0\xA5\x91) doesn't share a script with the
	// value inherited by the dotted circle (\xE2\x97\x8C). It captures the
	// preceding dotted circle and breaks it from the run it would normally have
	// been in: the circle and the mark are expected to form their own DEVANAGARI
	// run after the HAN run.
	TEST_F(ScriptRunIteratorTest, HanDottedCircleUdatta)
	{
	CHECK_RUNS({ { "萬國碼 ", USCRIPT_HAN },
	{ "\xE2\x97\x8C\xE0\xA5\x91", USCRIPT_DEVANAGARI } });
	}

	// Tatweel is \xD9\x80 Lm, Fathatan is \xD9\x8B Mn. The script of tatweel is
	// common, that of Fathatan is inherited. The script extensions for Fathatan
	// are Arabic and Syriac. The Syriac script is 34 in ICU, Arabic is 2. So the
	// preferred script for Fathatan is Arabic, according to Behdad's
	// heuristic. This is exactly analogous to the Udatta tests above, except
	// Tatweel is Lm. But we don't take properties into account, only scripts.
	// Expected result: the tatweel is pulled out of the LATIN run and forms an
	// ARABIC run together with the fathatan.
	TEST_F(ScriptRunIteratorTest, LatinTatweelFathatan)
	{
	CHECK_RUNS({ { "Latin ", USCRIPT_LATIN },
	{ "\xD9\x80\xD9\x8B", USCRIPT_ARABIC } });
	}

	// Another case where if the mark accepts a script that was inherited by the
	// preceding common-script character, they both continue in that script.
	// SYRIAC LETTER NUN \xDC\xA2
	// ARABIC TATWEEL \xD9\x80
	// ARABIC FATHATAN \xD9\x8B
	TEST_F(ScriptRunIteratorTest, SyriacTatweelFathatan)
	{
	CHECK_RUNS({ { "\xDC\xA2\xD9\x80\xD9\x8B", USCRIPT_SYRIAC } });
	}

	// The Udatta (\xE0\xA5\x91) is inherited, so will share runs with anything that
	// is not common. Directly after Han text it is expected to stay in the HAN
	// run.
	TEST_F(ScriptRunIteratorTest, HanUdatta)
	{
	CHECK_RUNS({ { "萬國碼\xE0\xA5\x91", USCRIPT_HAN } });
	}

	// The Udatta (\xE0\xA5\x91) is inherited, and will capture the space and turn
	// it into Devanagari: the space is expected to leave the HAN run and start a
	// DEVANAGARI run together with the mark.
	TEST_F(ScriptRunIteratorTest, HanSpaceUdatta)
	{
	CHECK_RUNS({ { "萬國碼", USCRIPT_HAN },
	{ " \xE0\xA5\x91", USCRIPT_DEVANAGARI } });
	}

	// Corresponds to one test in RunSegmenter, where orientation of the
	// space character is sideways in vertical. For script purposes the space is
	// expected to stay inside a single HANGUL run.
	TEST_F(ScriptRunIteratorTest, Hangul)
	{
	CHECK_RUNS({ { "키스의 고유조건은", USCRIPT_HANGUL } });
	}

	// Corresponds to one test in RunSegmenter, which tests that the punctuation
	// characters mixed in are actually sideways in vertical. The
	// ScriptRunIterator should report one run, but the RunSegmenter should report
	// three, with the middle one rotated sideways.
	TEST_F(ScriptRunIteratorTest, HiraganaMixedPunctuation)
	{
	CHECK_RUNS({ { "いろはに.…¡ほへと", USCRIPT_HIRAGANA } });
	}

	// Make sure Mock code works too: an inherited mock char listing Greek and
	// Latin that directly follows a Han char is expected to stay in the HAN run
	// (the mock counterpart of HanUdatta).
	TEST_F(ScriptRunIteratorTest, MockHanInheritedGL)
	{
	CHECK_MOCK_RUNS({ { "h<igl>", USCRIPT_HAN } });
	}

	// Mock counterpart of HanSpaceUdatta: the inherited char <igl> is expected
	// to capture the preceding common char, splitting it from the HAN run into a
	// new GREEK run.
	TEST_F(ScriptRunIteratorTest, MockHanCommonInheritedGL)
	{
	CHECK_MOCK_RUNS({ { "h", USCRIPT_HAN },
	{ "c<igl>", USCRIPT_GREEK } });
	}

	// A leading inherited char just acts like common, except there's no
	// preferred script: a lone <igl> is expected to give a COMMON run.
	TEST_F(ScriptRunIteratorTest, MockLeadingInherited)
	{
	CHECK_MOCK_RUNS({ { "<igl>", USCRIPT_COMMON } });
	}

	// Leading inherited chars just act like common, except there's no preferred
	// script: two inherited chars in a row are still expected to give a single
	// COMMON run.
	TEST_F(ScriptRunIteratorTest, MockLeadingInherited2)
	{
	CHECK_MOCK_RUNS({ { "<igl><ih>", USCRIPT_COMMON } });
	}

	// A leading inherited mark is expected to join the Han text that follows it,
	// giving one HAN run.
	TEST_F(ScriptRunIteratorTest, LeadingInheritedHan)
	{
	// DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91
	CHECK_RUNS({ { "\xE0\xA5\x91萬國碼", USCRIPT_HAN } });
	}

	// Two leading inherited marks in a row are likewise expected to join the Han
	// text that follows them, giving one HAN run.
	TEST_F(ScriptRunIteratorTest, LeadingInheritedHan2)
	{
	// DEVANAGARI STRESS SIGN UDATTA \xE0\xA5\x91
	// ARABIC FATHATAN \xD9\x8B
	CHECK_RUNS({ { "\xE0\xA5\x91\xD9\x8B萬國碼", USCRIPT_HAN } });
	}

	// A Latin letter carrying diacritics is expected to form a single LATIN run.
	// NOTE(review): the literal appears to contain combining mark(s) - the exact
	// code point sequence matters, so keep the bytes as they are.
	TEST_F(ScriptRunIteratorTest, OddLatinString)
	{
	CHECK_RUNS({ { "ç̈", USCRIPT_LATIN } });
	}

	// Leading digits and a hyphen followed by Malayalam characters are expected
	// to be absorbed into one MALAYALAM run.
	TEST_F(ScriptRunIteratorTest, CommonMalayalam)
	{
	CHECK_RUNS({ { "100-ാം", USCRIPT_MALAYALAM } });
	}


	// Fixture that scans ICU's script-extension data when it is constructed and
	// remembers the largest extension count found, together with the first code
	// point that has that count.
	class ScriptRunIteratorICUDataTest : public testing::Test {

	public:
	    ScriptRunIteratorICUDataTest()
	        : m_maxExtensions(0)
	        , m_maxExtensionsCodepoint(0xffff)
	    {
	        int maxExtensions = 0;
	        UChar32 maxExtensionsCodepoint = 0;
	        // Walk the whole Unicode code space, U+0000..U+10FFFF. The previous
	        // limit of 0x11000 was missing a zero and skipped almost all
	        // supplementary code points.
	        for (UChar32 cp = 0; cp < 0x110000; ++cp) {
	            // Called with no buffer and zero capacity: only the returned
	            // count is used and |status| is deliberately not inspected.
	            UErrorCode status = U_ZERO_ERROR;
	            int count = uscript_getScriptExtensions(cp, 0, 0, &status);
	            // Strict comparison keeps the first code point with the maximum.
	            if (count > maxExtensions) {
	                maxExtensions = count;
	                maxExtensionsCodepoint = cp;
	            }
	        }
	        m_maxExtensions = maxExtensions;
	        m_maxExtensionsCodepoint = maxExtensionsCodepoint;
	    }

	protected:
	    // Returns the code point found by the constructor's scan. If
	    // |numExtensions| is non-null, it receives that code point's extension
	    // count as reported by uscript_getScriptExtensions().
	    UChar32 GetACharWithMaxExtensions(int* numExtensions)
	    {
	        if (numExtensions) {
	            *numExtensions = m_maxExtensions;
	        }
	        return m_maxExtensionsCodepoint;
	    }

	private:
	    int m_maxExtensions;
	    UChar32 m_maxExtensionsCodepoint;
	};

	// Guards the assumption that the largest script-extension count found in
	// ICU's data does not exceed ScriptData::kMaxScriptCount. On failure the
	// offending code point is printed in hex.
	TEST_F(ScriptRunIteratorICUDataTest, ValidateICUMaxScriptExtensions)
	{
	    int maxExtensions = 0;
	    const UChar32 codepoint = GetACharWithMaxExtensions(&maxExtensions);

	    ASSERT_LE(maxExtensions, ScriptData::kMaxScriptCount)
	        << "char " << std::hex << codepoint << std::dec;
	}

	// Checks that ICUScriptData hands back every script of a character. Only the
	// character picked by the fixture (the one with the most extensions) is
	// examined, so this is a spot check rather than exhaustive coverage.
	TEST_F(ScriptRunIteratorICUDataTest, ICUDataGetScriptsReturnsAllExtensions)
	{
	    int maxExtensions = 0;
	    const UChar32 codepoint = GetACharWithMaxExtensions(&maxExtensions);

	    Vector<UScriptCode> extensions;
	    ICUScriptData::instance()->getScripts(codepoint, extensions);

	    // getScripts() may put the primary script in the list as well, which
	    // yields one entry more than the raw extension count; hence ">=" rather
	    // than "==".
	    ASSERT_GE(static_cast<int>(extensions.size()), maxExtensions)
	        << "char " << std::hex << codepoint << std::dec;
	}

	// For every code point whose first reported script is COMMON, checks that
	// getScripts() returns at most two entries, i.e. COMMON plus no more than one
	// further script.
	TEST_F(ScriptRunIteratorICUDataTest, CommonHaveNoMoreThanOneExtension)
	{
	    Vector<UScriptCode> extensions;
	    for (UChar32 cp = 0; cp < 0x110000; ++cp) {
	        ICUScriptData::instance()->getScripts(cp, extensions);
	        // Only characters led by COMMON are of interest here.
	        if (extensions.at(0) != USCRIPT_COMMON)
	            continue;
	        ASSERT_LE(extensions.size(), 2ul)
	            << "cp: " << std::hex << cp << std::dec;
	    }
	}

	// ZWJ is \u200D Cf (Format, other) and its script is inherited. I'm going to
	// ignore this for now, as I think it shouldn't matter which run it ends up
	// in. HarfBuzz needs to be able to use it as context and shape each
	// neighboring character appropriately no matter what run it got assigned to.

	} // namespace blink