blob: 6be80a54410aae17d13bdf909c9f1c4f02264333 [file] [log] [blame]
/*
* Copyright (C) 2014 Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_CHARACTER_H_
#define THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_CHARACTER_H_
#include <unicode/uchar.h>
#include <unicode/uniset.h>
#include "base/containers/span.h"
#include "base/gtest_prod_util.h"
#include "third_party/blink/renderer/platform/platform_export.h"
#include "third_party/blink/renderer/platform/text/character_property.h"
#include "third_party/blink/renderer/platform/text/east_asian_spacing_type.h"
#include "third_party/blink/renderer/platform/text/han_kerning_char_type.h"
#include "third_party/blink/renderer/platform/text/text_direction.h"
#include "third_party/blink/renderer/platform/wtf/allocator/allocator.h"
#include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
namespace blink {
class PLATFORM_EXPORT Character {
STATIC_ONLY(Character);
public:
static inline bool IsInRange(UChar32 character,
UChar32 lower_bound,
UChar32 upper_bound) {
return character >= lower_bound && character <= upper_bound;
}
// Commonly used Unicode Blocks in CSS specs.
// https://www.unicode.org/Public/UNIDATA/Blocks.txt
static bool IsBlockCjkSymbolsAndPunctuation(UChar32 ch) {
return IsInRange(ch, 0x3000, 0x303F);
}
static bool IsBlockHalfwidthAndFullwidthForms(UChar32 ch) {
return IsInRange(ch, 0xFF00, 0xFFEF);
}
// Commonly used Unicode General Categories.
// https://unicode.org/reports/tr44/#General_Category_Values
static bool IsGcMark(UChar32 ch) { return U_GET_GC_MASK(ch) & U_GC_M_MASK; }
// East Asian Width: https://unicode.org/reports/tr11/
static UEastAsianWidth EastAsianWidth(UChar32 ch) {
return static_cast<UEastAsianWidth>(
u_getIntPropertyValue(ch, UCHAR_EAST_ASIAN_WIDTH));
}
static bool IsEastAsianWidthFullwidth(UChar32 ch);
static inline bool IsUnicodeVariationSelector(UChar32 character) {
// http://www.unicode.org/Public/UCD/latest/ucd/StandardizedVariants.html
return IsInRange(character, 0x180B,
0x180D) // MONGOLIAN FREE VARIATION SELECTOR ONE to THREE
||
IsInRange(character, 0xFE00, 0xFE0F) // VARIATION SELECTOR-1 to 16
|| IsInRange(character, 0xE0100,
0xE01EF); // VARIATION SELECTOR-17 to 256
}
static inline bool IsUnicodeEmojiVariationSelector(UChar32 character) {
// https://www.unicode.org/Public/emoji/5.0/emoji-variation-sequences.txt
return character == 0xFE0E ||
character == 0xFE0F; // VARIATION SELECTOR-15 to 16
}
static bool IsCJKIdeographOrSymbol(UChar32 c) {
// Below U+02C7 is likely a common case.
return c < 0x2C7 ? false : IsCJKIdeographOrSymbolSlow(c);
}
static bool IsCJKIdeographOrSymbolBase(UChar32 c) {
return IsCJKIdeographOrSymbol(c) &&
!(U_GET_GC_MASK(c) & (U_GC_M_MASK | U_GC_LM_MASK | U_GC_SK_MASK));
}
static bool IsIdeographic(UChar32 c) {
return u_hasBinaryProperty(c, UCHAR_IDEOGRAPHIC);
}
static bool IsHangul(UChar32 c) {
// Below U+1100 is likely a common case.
return c < 0x1100 ? false : IsHangulSlow(c);
}
static unsigned ExpansionOpportunityCount(base::span<const LChar>,
TextDirection,
bool& is_after_expansion);
static unsigned ExpansionOpportunityCount(base::span<const UChar>,
TextDirection,
bool& is_after_expansion);
static bool IsUprightInMixedVertical(UChar32 character);
// https://html.spec.whatwg.org/C/#prod-potentialcustomelementname
static bool IsPotentialCustomElementName8BitChar(LChar ch) {
return IsASCIILower(ch) || IsASCIIDigit(ch) || ch == '-' || ch == '.' ||
ch == '_' || ch == 0xb7 || (0xc0 <= ch && ch != 0xd7 && ch != 0xf7);
}
static bool IsPotentialCustomElementNameChar(UChar32 character);
// http://unicode.org/reports/tr9/#Directional_Formatting_Characters
static bool IsBidiControl(UChar32 character);
static bool MaybeBidiRtlUtf16(base::StrictNumeric<UChar> ch);
static bool MaybeBidiRtl(UChar32 ch);
static bool MaybeBidiRtl(const String&);
static HanKerningCharType GetHanKerningCharType(UChar32 character);
static EastAsianSpacingType GetEastAsianSpacingType(UChar32 character);
// Check the `HanKerningCharType` of a character without knowing the font.
// It depends on fonts, so it may not be `kOpen` or `kClose` even when this
// function returns `true`. See `HanKerning::GetCharType`.
static bool MaybeHanKerningOpen(UChar32 ch) {
return MaybeHanKerningOpenOrCloseFast(ch) && MaybeHanKerningOpenSlow(ch);
}
static bool MaybeHanKerningClose(UChar32 ch) {
return MaybeHanKerningOpenOrCloseFast(ch) && MaybeHanKerningCloseSlow(ch);
}
// Check if the character may be `kOpen` or `kClose` only by ranges, without
// getting the Unicode property. Faster than `MaybeHanKerningOpen` and
// `MaybeHanKerningClose` but has more cases where it returns `true` for other
// characters.
static bool MaybeHanKerningOpenOrCloseFast(UChar32 character) {
return IsInRange(character, uchar::kLeftSingleQuotationMark, 0x301F) ||
IsInRange(character, 0xFF08, 0xFF60);
}
static bool MayNeedEastAsianSpacing(UChar32);
// Collapsible white space characters defined in CSS:
// https://drafts.csswg.org/css-text-3/#collapsible-white-space
static bool IsCollapsibleSpace(UChar c) {
return c == uchar::kSpace || c == uchar::kLineFeed || c == uchar::kTab ||
c == uchar::kCarriageReturn;
}
static bool IsLineFeed(UChar c) { return c == uchar::kLineFeed; }
template <typename CharacterType>
static bool IsOtherSpaceSeparator(CharacterType c) {
return c == uchar::kIdeographicSpace;
}
static bool TreatAsSpace(UChar32 c) {
return c == uchar::kSpace || c == uchar::kTab || c == uchar::kLineFeed ||
c == uchar::kNoBreakSpace;
}
static bool TreatAsZeroWidthSpace(UChar32 c) {
return TreatAsZeroWidthSpaceInComplexScript(c) ||
c == uchar::kZeroWidthNonJoiner || c == uchar::kZeroWidthJoiner;
}
static bool TreatAsZeroWidthSpaceInComplexScriptLegacy(UChar32 c) {
return c == uchar::kFormFeed || c == uchar::kCarriageReturn ||
c == uchar::kSoftHyphen || c == uchar::kZeroWidthSpace ||
(c >= uchar::kLeftToRightMark && c <= uchar::kRightToLeftMark) ||
(c >= uchar::kLeftToRightEmbedding &&
c <= uchar::kRightToLeftOverride) ||
c == uchar::kZeroWidthNoBreakSpace ||
c == uchar::kObjectReplacementCharacter;
}
static bool TreatAsZeroWidthSpaceInComplexScript(UChar32 c) {
if (c == uchar::kFormFeed || c == uchar::kCarriageReturn ||
c == uchar::kObjectReplacementCharacter) {
return true;
}
return IsDefaultIgnorable(c);
}
// https://unicode.org/reports/tr44/#Default_Ignorable_Code_Point
static bool IsDefaultIgnorable(UChar32 c) {
if (c < 0x0100) {
return c == uchar::kSoftHyphen;
}
return u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT);
}
static bool CanTextDecorationSkipInk(UChar32);
static bool CanReceiveTextEmphasis(UChar32);
static bool IsGraphemeExtended(UChar32 c) {
// http://unicode.org/reports/tr29/#Extend
return u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND);
}
// Returns true if the character has a Emoji property.
// See http://www.unicode.org/Public/emoji/3.0/emoji-data.txt
static bool IsEmoji(UChar32);
// Reserved ranges in blocks largely associated with emoji characters. This
// allows handling future Emoji code points.
static bool IsEmojiReserved(UChar32);
static bool IsEmojiIncludingReserved(UChar32);
// Default presentation style according to:
// http://www.unicode.org/reports/tr51/#Presentation_Style
static bool IsEmojiTextDefault(UChar32);
static bool IsEmojiEmojiDefault(UChar32);
static bool IsEmojiModifierBase(UChar32);
static constexpr bool IsEmojiKeycapBase(UChar32 ch) {
return (ch >= '0' && ch <= '9') || ch == '#' || ch == '*';
}
static bool IsRegionalIndicator(UChar32);
static bool IsModifier(UChar32 c) { return c >= 0x1F3FB && c <= 0x1F3FF; }
// http://www.unicode.org/reports/tr51/proposed.html#flag-emoji-tag-sequences
static bool IsEmojiTagSequence(UChar32);
static bool IsEmojiComponent(UChar32);
static bool IsExtendedPictographic(UChar32);
static bool MaybeEmojiPresentation(UChar32);
static bool IsStandardizedVariationSequence(UChar32, UChar32);
static bool IsEmojiVariationSequence(UChar32, UChar32);
static bool IsIdeographicVariationSequence(UChar32 ch, UChar32 vs);
static bool IsVariationSequence(UChar32, UChar32);
static inline bool IsNormalizedCanvasSpaceCharacter(UChar32 c) {
// According to specification all space characters should be replaced with
// 0x0020 space character.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-canvas-element.html#text-preparation-algorithm
// The space characters according to specification are : U+0020, U+0009,
// U+000A, U+000C, and U+000D.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/common-microsyntaxes.html#space-character
// This function returns true for 0x000B also, so that this is backward
// compatible. Otherwise, the test
// web_tests/canvas/philip/tests/2d.text.draw.space.collapse.space.html
// will fail
return c == 0x0009 || (c >= 0x000A && c <= 0x000D);
}
static bool IsCommonOrInheritedScript(UChar32);
static bool IsPrivateUse(UChar32);
static bool IsNonCharacter(UChar32);
// Returns whether a script code could be determined for the given character
// and that script code is not USCRIPT_COMMON or USCRIPT_INHERITED.
static bool HasDefiniteScript(UChar32);
static bool IsModernGeorgianUppercase(UChar32 c) {
return IsInRange(c, 0x1C90, 0x1CBF);
}
// Map modern secular Georgian uppercase letters added in Unicode
// 11.0 to their corresponding lowercase letters.
// https://www.unicode.org/charts/PDF/U10A0.pdf
// https://www.unicode.org/charts/PDF/U1C90.pdf
static UChar32 LowercaseModernGeorgianUppercase(UChar32 c) {
return (c - (0x1C90 - 0x10D0));
}
static bool IsVerticalMathCharacter(UChar32);
private:
FRIEND_TEST_ALL_PREFIXES(CharacterTest, Derived);
static bool IsCJKIdeographOrSymbolSlow(UChar32);
static bool IsHangulSlow(UChar32);
static bool MaybeHanKerningOpenSlow(UChar32);
static bool MaybeHanKerningCloseSlow(UChar32);
static void ApplyPatternAndFreezeIfEmpty(icu::UnicodeSet* unicodeSet,
const char* pattern);
};
// Conservative pre-check for Bidi reordering on a single UTF-16 code unit.
// A `false` result means the `Bidi_Class` of `ch` is neither `R` nor `AL`, and
// `ch` is not a Bidi control; `true` only means it may be one of those.
// https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5B%3Abc%3DR%3A%5D%5B%3Abc%3DAL%3A%5D%5D&g=bc
// https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Bidi_C:]
//
// This function assumes all non-BMP characters may be Bidi: no range below
// excludes U+D800..U+FEFF, which contains the surrogate code units.
inline bool Character::MaybeBidiRtlUtf16(base::StrictNumeric<UChar> ch) {
  // Everything before U+0590 is rejected outright.
  if (ch < 0x0590) {
    return false;
  }
  // `InlineItemsBuilder` may emit U+200B Zero Width Space.
  if (ch == uchar::kZeroWidthSpace) {
    return false;
  }
  // General Punctuation such as curly quotes.
  if (IsInRange(ch, 0x2010, 0x2029)) {
    return false;
  }
  // CJK etc., up to Surrogate Pairs.
  if (IsInRange(ch, 0x206A, 0xD7FF)) {
    return false;
  }
  // Common in CJK.
  return !IsInRange(ch, 0xFF00, 0xFFFF);
}
// Code point flavor of the Bidi pre-check: returns `false` only for code
// points in the ranges rejected below, and `true` for everything else.
inline bool Character::MaybeBidiRtl(UChar32 ch) {
  // Everything before U+0590 is rejected outright.
  if (ch < 0x0590) {
    return false;
  }
  // `InlineItemsBuilder` may emit U+200B Zero Width Space.
  if (ch == uchar::kZeroWidthSpace) {
    return false;
  }
  // General Punctuation such as curly quotes.
  if (IsInRange(ch, 0x2010, 0x2029)) {
    return false;
  }
  // CJK etc., up to Surrogate Pairs.
  if (IsInRange(ch, 0x206A, 0xD7FF)) {
    return false;
  }
  // Common in CJK.
  if (IsInRange(ch, 0xFF00, 0xFFFF)) {
    return false;
  }
  // Kana Extended-B, Kana Supplement, Kana Extended-A, Small Kana Extension
  if (IsInRange(ch, 0x1AFF0, 0x1B16F)) {
    return false;
  }
  // CJK Ideographs Extensions
  return !IsInRange(ch, 0x20000, 0x323AF);
}
// String flavor of the Bidi pre-check. 8-bit strings always yield `false`;
// otherwise the result is `true` unless every code unit is rejected by
// `MaybeBidiRtlUtf16`.
inline bool Character::MaybeBidiRtl(const String& text) {
  if (text.Is8Bit()) {
    return false;
  }
  const bool all_code_units_rejected =
      text.IsAllSpecialCharacters<[](UChar code_unit) {
        return !MaybeBidiRtlUtf16(code_unit);
      }>();
  return !all_code_units_rejected;
}
// True when `ch` is U+3000 IDEOGRAPHIC SPACE, or is in the "Halfwidth and
// Fullwidth Forms" block and has East Asian Width `F`.
inline bool Character::IsEastAsianWidthFullwidth(UChar32 ch) {
  // All EAW=F characters are in the "Halfwidth and Fullwidth forms" block,
  // except U+3000 IDEOGRAPHIC SPACE.
  // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:ea=F:]
  if (ch == uchar::kIdeographicSpace) {
    return true;
  }
  // Outside the block the property lookup is skipped entirely.
  if (!IsBlockHalfwidthAndFullwidthForms(ch)) {
    return false;
  }
  return EastAsianWidth(ch) == UEastAsianWidth::U_EA_FULLWIDTH;
}
// Range-only pre-check: a `false` result is returned for code points below
// U+02C7, for the object replacement character, and for U+1200..U+3004.
inline bool Character::MayNeedEastAsianSpacing(UChar32 ch) {
  // `EastAsianSpacingType::kWide` may need the spacing. U+02C7 is the minimum
  // code point of `kWide`.
  if (ch < 0x02C7) {
    return false;
  }
  if (ch == uchar::kObjectReplacementCharacter) {
    return false;
  }
  // U+2000-206F General Punctuation has rather popular characters, such as
  // ZWSP and curly quotation marks. Exclude the largest range of non-`kWide`
  // that includes them.
  return !IsInRange(ch, 0x1200, 0x3004);
}
} // namespace blink
#endif // THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_CHARACTER_H_