| /* |
| * Copyright (C) 2014 Google Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are |
| * met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following disclaimer |
| * in the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of Google Inc. nor the names of its |
| * contributors may be used to endorse or promote products derived from |
| * this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "third_party/blink/renderer/platform/text/character.h" |
| |
| #include <unicode/uchar.h> |
| #include <unicode/ucptrie.h> |
| #include <unicode/uobject.h> |
| #include <unicode/uscript.h> |
| |
| #include <algorithm> |
| |
| #include "base/synchronization/lock.h" |
| #include "third_party/abseil-cpp/absl/strings/ascii.h" |
| #include "third_party/blink/renderer/platform/runtime_enabled_features.h" |
| #include "third_party/blink/renderer/platform/text/character_break_iterator.h" |
| #include "third_party/blink/renderer/platform/text/character_property_data.h" |
| #include "third_party/blink/renderer/platform/text/icu_error.h" |
| #include "third_party/blink/renderer/platform/wtf/std_lib_extras.h" |
| #include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h" |
| #include "third_party/blink/renderer/platform/wtf/text/string_builder.h" |
| #include "third_party/blink/renderer/platform/wtf/text/unicode.h" |
| #include "third_party/blink/renderer/platform/wtf/text/wtf_uchar.h" |
| |
| namespace blink { |
| |
| namespace { |
| |
| UCPTrie* CreateTrie() { |
| // Create a Trie from the value array. |
| ICUError error; |
| UCPTrie* trie = ucptrie_openFromBinary( |
| UCPTrieType::UCPTRIE_TYPE_FAST, UCPTrieValueWidth::UCPTRIE_VALUE_BITS_16, |
| kSerializedCharacterData, kSerializedCharacterDataSize, nullptr, &error); |
| DCHECK_EQ(error, U_ZERO_ERROR); |
| return trie; |
| } |
| |
| inline CharacterProperty GetProperty(UChar32 c) { |
| static const UCPTrie* trie = CreateTrie(); |
| static_assert(sizeof(CharacterProperty) == 2); |
| const auto value = UNSAFE_TODO(UCPTRIE_FAST_GET(trie, UCPTRIE_16, c)); |
| return CharacterProperty(value); |
| } |
| |
| base::Lock& GetFreezePatternLock() { |
| DEFINE_THREAD_SAFE_STATIC_LOCAL(base::Lock, lock, ()); |
| return lock; |
| } |
| |
| } // namespace |
| |
| void Character::ApplyPatternAndFreezeIfEmpty(icu::UnicodeSet* unicodeSet, |
| const char* pattern) { |
| base::AutoLock locker(GetFreezePatternLock()); |
| if (!unicodeSet->isEmpty()) { |
| return; |
| } |
| blink::ICUError err; |
| // Use ICU's invariant-character initialization method. |
| unicodeSet->applyPattern(icu::UnicodeString(pattern, -1, US_INV), err); |
| unicodeSet->freeze(); |
| DCHECK_EQ(err, U_ZERO_ERROR); |
| } |
| |
| bool Character::IsUprightInMixedVertical(UChar32 character) { |
| return u_getIntPropertyValue(character, |
| UProperty::UCHAR_VERTICAL_ORIENTATION) != |
| UVerticalOrientation::U_VO_ROTATED; |
| } |
| |
| bool Character::IsCJKIdeographOrSymbolSlow(UChar32 c) { |
| return GetProperty(c).is_cjk_ideograph_or_symbol; |
| } |
| |
| bool Character::IsPotentialCustomElementNameChar(UChar32 character) { |
| return GetProperty(character).is_potential_custom_element_name_char; |
| } |
| |
| bool Character::IsBidiControl(UChar32 character) { |
| return GetProperty(character).is_bidi_control; |
| } |
| |
| bool Character::IsHangulSlow(UChar32 character) { |
| return GetProperty(character).is_hangul; |
| } |
| |
| // static |
| HanKerningCharType Character::GetHanKerningCharType(UChar32 character) { |
| return GetProperty(character).han_kerning; |
| } |
| |
| // static |
| EastAsianSpacingType Character::GetEastAsianSpacingType(UChar32 character) { |
| return GetProperty(character).east_asian_spacing; |
| } |
| |
| bool Character::MaybeHanKerningOpenSlow(UChar32 ch) { |
| // See `HanKerning::GetCharType`. |
| const HanKerningCharType type = Character::GetHanKerningCharType(ch); |
| return type == HanKerningCharType::kOpen || |
| type == HanKerningCharType::kOpenQuote; |
| } |
| |
| bool Character::MaybeHanKerningCloseSlow(UChar32 ch) { |
| // See `HanKerning::GetCharType`. |
| const HanKerningCharType type = Character::GetHanKerningCharType(ch); |
| return type == HanKerningCharType::kClose || |
| type == HanKerningCharType::kCloseQuote; |
| } |
| |
| unsigned Character::ExpansionOpportunityCount( |
| base::span<const LChar> characters, |
| TextDirection direction, |
| bool& is_after_expansion) { |
| unsigned count = 0; |
| if (direction == TextDirection::kLtr) { |
| for (size_t i = 0; i < characters.size(); ++i) { |
| if (TreatAsSpace(characters[i])) { |
| count++; |
| is_after_expansion = true; |
| } else { |
| is_after_expansion = false; |
| } |
| } |
| } else { |
| for (size_t i = characters.size(); i > 0; --i) { |
| if (TreatAsSpace(characters[i - 1])) { |
| count++; |
| is_after_expansion = true; |
| } else { |
| is_after_expansion = false; |
| } |
| } |
| } |
| |
| return count; |
| } |
| |
| unsigned Character::ExpansionOpportunityCount( |
| base::span<const UChar> characters, |
| TextDirection direction, |
| bool& is_after_expansion) { |
| if (characters.size() == 0) { |
| return 0; |
| } |
| unsigned count = 0; |
| |
| if (!RuntimeEnabledFeatures::EmojiJustificationEnabled()) { |
| if (direction == TextDirection::kLtr) { |
| for (size_t i = 0; i < characters.size(); ++i) { |
| UChar32 character = characters[i]; |
| if (TreatAsSpace(character)) { |
| count++; |
| is_after_expansion = true; |
| continue; |
| } |
| if (U16_IS_LEAD(character) && i + 1 < characters.size() && |
| U16_IS_TRAIL(characters[i + 1])) { |
| character = U16_GET_SUPPLEMENTARY(character, characters[i + 1]); |
| i++; |
| } |
| if (IsCJKIdeographOrSymbol(character)) { |
| if (!is_after_expansion) { |
| count++; |
| } |
| count++; |
| is_after_expansion = true; |
| continue; |
| } else if (!IsDefaultIgnorable(character)) { |
| is_after_expansion = false; |
| } |
| } |
| } else { |
| for (size_t i = characters.size(); i > 0; --i) { |
| UChar32 character = characters[i - 1]; |
| if (TreatAsSpace(character)) { |
| count++; |
| is_after_expansion = true; |
| continue; |
| } |
| if (U16_IS_TRAIL(character) && i > 1 && |
| U16_IS_LEAD(characters[i - 2])) { |
| character = U16_GET_SUPPLEMENTARY(characters[i - 2], character); |
| i--; |
| } |
| if (IsCJKIdeographOrSymbol(character)) { |
| if (!is_after_expansion) { |
| count++; |
| } |
| count++; |
| is_after_expansion = true; |
| continue; |
| } else if (!IsDefaultIgnorable(character)) { |
| is_after_expansion = false; |
| } |
| } |
| } |
| return count; |
| } |
| CharacterBreakIterator iter(characters); |
| if (direction == TextDirection::kLtr) { |
| for (int i = 0; static_cast<size_t>(i) < characters.size(); |
| i = iter.Next()) { |
| UChar32 character = characters[i]; |
| if (TreatAsSpace(character)) { |
| count++; |
| is_after_expansion = true; |
| continue; |
| } |
| if (U16_IS_LEAD(character) && |
| static_cast<size_t>(i + 1) < characters.size() && |
| U16_IS_TRAIL(characters[i + 1])) { |
| character = U16_GET_SUPPLEMENTARY(character, characters[i + 1]); |
| } |
| if (IsCJKIdeographOrSymbol(character)) { |
| if (!is_after_expansion) |
| count++; |
| count++; |
| is_after_expansion = true; |
| continue; |
| } else if (!IsDefaultIgnorable(character)) { |
| is_after_expansion = false; |
| } |
| } |
| } else { |
| for (int i = iter.Preceding(characters.size()); i != kTextBreakDone; |
| i = iter.Preceding(i)) { |
| UChar32 character = characters[i]; |
| if (TreatAsSpace(character)) { |
| count++; |
| is_after_expansion = true; |
| continue; |
| } |
| if (U16_IS_LEAD(character) && |
| static_cast<size_t>(i + 1) < characters.size() && |
| U16_IS_TRAIL(characters[i + 1])) { |
| character = U16_GET_SUPPLEMENTARY(character, characters[i + 1]); |
| } |
| if (IsCJKIdeographOrSymbol(character)) { |
| if (!is_after_expansion) |
| count++; |
| count++; |
| is_after_expansion = true; |
| continue; |
| } else if (!IsDefaultIgnorable(character)) { |
| is_after_expansion = false; |
| } |
| } |
| } |
| return count; |
| } |
| |
| bool Character::CanTextDecorationSkipInk(UChar32 codepoint) { |
| if (codepoint == uchar::kSolidus || codepoint == uchar::kReverseSolidus || |
| codepoint == uchar::kLowLine) { |
| return false; |
| } |
| |
| if (Character::IsCJKIdeographOrSymbol(codepoint)) |
| return false; |
| |
| UBlockCode block = ublock_getCode(codepoint); |
| switch (block) { |
| // These blocks contain CJK characters we don't want to skip ink, but are |
| // not ideograph that IsCJKIdeographOrSymbol() does not cover. |
| case UBLOCK_HANGUL_JAMO: |
| case UBLOCK_HANGUL_COMPATIBILITY_JAMO: |
| case UBLOCK_HANGUL_SYLLABLES: |
| case UBLOCK_HANGUL_JAMO_EXTENDED_A: |
| case UBLOCK_HANGUL_JAMO_EXTENDED_B: |
| case UBLOCK_LINEAR_B_IDEOGRAMS: |
| return false; |
| default: |
| return true; |
| } |
| } |
| |
| bool Character::CanReceiveTextEmphasis(UChar32 c) { |
| unicode::CharCategory category = unicode::Category(c); |
| if (category & (unicode::kSeparator_Space | unicode::kSeparator_Line | |
| unicode::kSeparator_Paragraph | unicode::kOther_NotAssigned | |
| unicode::kOther_Control | unicode::kOther_Format)) { |
| return false; |
| } |
| |
| // Additional word-separator characters listed in CSS Text Level 3 Editor's |
| // Draft 3 November 2010. |
| // https://www.w3.org/TR/css-text-3/#word-separator |
| if (c == uchar::kEthiopicWordspace || c == uchar::kAegeanWordSeparatorLine || |
| c == uchar::kAegeanWordSeparatorDot || c == uchar::kUgariticWordDivider || |
| c == uchar::kTibetanMarkIntersyllabicTsheg || |
| c == uchar::kTibetanMarkDelimiterTshegBstar) { |
| return false; |
| } |
| |
| if (RuntimeEnabledFeatures::TextEmphasisPunctuationExceptionsEnabled()) { |
| // A set of exceptions for punctuation. |
| switch (c) { |
| // List from |
| // https://drafts.csswg.org/css-text-decor/#text-emphasis-style-property |
| case uchar::kNumberSign: |
| case uchar::kPercentSign: |
| case uchar::kAmpersand: |
| case uchar::kCommercialAt: |
| case uchar::kSectionSign: |
| case uchar::kPilcrowSign: |
| case uchar::kArabicIndicPerMilleSign: |
| case uchar::kArabicIndicPerTenThousandSign: |
| case uchar::kArabicPercentSign: |
| case uchar::kPerMilleSign: |
| case uchar::kPerTenThousandSign: |
| case uchar::kTironianSignEt: |
| case uchar::kReversedPilcrowSign: |
| case uchar::kSwungDash: |
| case uchar::kPartAlternationMark: |
| // Characters with NFKD equivalence to the above. |
| case uchar::kSmallNumberSign: |
| case uchar::kSmallAmpersand: |
| case uchar::kSmallPercentSign: |
| case uchar::kSmallCommercialAt: |
| case uchar::kFullwidthNumberSign: |
| case uchar::kFullwidthPercentSign: |
| case uchar::kFullwidthAmpersand: |
| case uchar::kFullwidthCommercialAt: |
| return true; |
| default: |
| break; |
| } |
| } |
| |
| // Punctuation |
| if (category & |
| (unicode::kPunctuation_Dash | unicode::kPunctuation_Open | |
| unicode::kPunctuation_Close | unicode::kPunctuation_Connector | |
| unicode::kPunctuation_Other | unicode::kPunctuation_InitialQuote | |
| unicode::kPunctuation_FinalQuote)) { |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool Character::IsEmojiTagSequence(UChar32 c) { |
| // http://www.unicode.org/reports/tr51/proposed.html#valid-emoji-tag-sequences |
| return (c >= uchar::kTagDigitZero && c <= uchar::kTagDigitNine) || |
| (c >= uchar::kTagLatinSmallLetterA && |
| c <= uchar::kTagLatinSmallLetterZ); |
| } |
| |
| bool Character::IsExtendedPictographic(UChar32 c) { |
| return u_hasBinaryProperty(c, UCHAR_EXTENDED_PICTOGRAPHIC); |
| } |
| |
| bool Character::IsEmojiComponent(UChar32 c) { |
| return u_hasBinaryProperty(c, UCHAR_EMOJI_COMPONENT); |
| } |
| |
| namespace { |
| |
| consteval bool MaybeEmojiPresentationForAscii(unsigned char ch) { |
| constexpr auto kCopyRightSign = 0xA9; |
| constexpr auto kRegisteredSign = 0xAE; |
| return ch == kCopyRightSign || ch == kRegisteredSign || |
| Character::IsEmojiKeycapBase(ch); |
| } |
| |
| template <std::size_t N, typename Function> |
| consteval auto GenerateTable(Function&& f) { |
| std::array<bool, N> arr; |
| for (unsigned char i = 0; i < N; ++i) { |
| arr[i] = f(i); |
| } |
| return arr; |
| } |
| |
| static const auto maybe_emoji_presentation_ascii = |
| GenerateTable<128>([](int i) { return MaybeEmojiPresentationForAscii(i); }); |
| |
| } // namespace |
| |
| bool Character::MaybeEmojiPresentation(UChar32 c) { |
| if (IsASCII(c)) [[likely]] { |
| return maybe_emoji_presentation_ascii[c]; |
| } |
| // Non-ascii characters. |
| return c == uchar::kZeroWidthJoiner || IsInRange(c, 0x203C, 0x2B55) || |
| c == uchar::kVariationSelector15 || c == 0x3030 || c == 0x303D || |
| c == 0x3297 || c == 0x3299 || c == uchar::kVariationSelector16 || |
| c >= 65536; |
| } |
| |
| bool Character::IsCommonOrInheritedScript(UChar32 character) { |
| ICUError status; |
| UScriptCode script = uscript_getScript(character, &status); |
| return U_SUCCESS(status) && |
| (script == USCRIPT_COMMON || script == USCRIPT_INHERITED); |
| } |
| |
| bool Character::IsPrivateUse(UChar32 character) { |
| return unicode::Category(character) & unicode::kOther_PrivateUse; |
| } |
| |
| bool Character::IsNonCharacter(UChar32 character) { |
| return U_IS_UNICODE_NONCHAR(character); |
| } |
| |
| bool Character::HasDefiniteScript(UChar32 character) { |
| ICUError err; |
| UScriptCode hint_char_script = uscript_getScript(character, &err); |
| if (!U_SUCCESS(err)) |
| return false; |
| return hint_char_script != USCRIPT_INHERITED && |
| hint_char_script != USCRIPT_COMMON; |
| } |
| |
| // https://w3c.github.io/mathml-core/#stretchy-operator-axis |
| static const UChar stretchy_operator_with_inline_axis[]{ |
| 0x003D, 0x005E, 0x005F, 0x007E, 0x00AF, 0x02C6, 0x02C7, 0x02C9, 0x02CD, |
| 0x02DC, 0x02F7, 0x0302, 0x0332, 0x203E, 0x20D0, 0x20D1, 0x20D6, 0x20D7, |
| 0x20E1, 0x2190, 0x2192, 0x2194, 0x2198, 0x2199, 0x219C, 0x219D, 0x219E, |
| 0x21A0, 0x21A2, 0x21A3, 0x21A4, 0x21A6, 0x21A9, 0x21AA, 0x21AB, 0x21AC, |
| 0x21AD, 0x21B4, 0x21B9, 0x21BC, 0x21BD, 0x21C0, 0x21C1, 0x21C4, 0x21C6, |
| 0x21C7, 0x21C9, 0x21CB, 0x21CC, 0x21D0, 0x21D2, 0x21D4, 0x21DA, 0x21DB, |
| 0x21DC, 0x21DD, 0x21E0, 0x21E2, 0x21E4, 0x21E5, 0x21E6, 0x21E8, 0x21F0, |
| 0x21F6, 0x21FD, 0x21FE, 0x21FF, 0x23B4, 0x23B5, 0x23DC, 0x23DD, 0x23DE, |
| 0x23DF, 0x23E0, 0x23E1, 0x2500, 0x27F5, 0x27F6, 0x27F7, 0x27F8, 0x27F9, |
| 0x27FA, 0x27FB, 0x27FC, 0x27FD, 0x27FE, 0x27FF, 0x290C, 0x290D, 0x290E, |
| 0x290F, 0x2910, 0x294E, 0x2950, 0x2952, 0x2953, 0x2956, 0x2957, 0x295A, |
| 0x295B, 0x295E, 0x295F, 0x2B45, 0x2B46, 0xFE35, 0xFE36, 0xFE37, 0xFE38}; |
| |
| bool Character::IsVerticalMathCharacter(UChar32 text_content) { |
| return text_content != |
| uchar::kArabicMathematicalOperatorMeemWithHahWithTatweel && |
| text_content != uchar::kArabicMathematicalOperatorHahWithDal && |
| !std::binary_search( |
| stretchy_operator_with_inline_axis, |
| UNSAFE_TODO(stretchy_operator_with_inline_axis + |
| std::size(stretchy_operator_with_inline_axis)), |
| text_content); |
| } |
| |
| } // namespace blink |