blob: 93ccadf5088ed126ae24e0fe67c448293d603dbc [file] [log] [blame]
#!/usr/bin/env python3
# Copyright 2022 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import argparse
import dataclasses
import json
import logging
import os
import sys
import unicodedata
from typing import Any, Dict, Generator, List, Optional, Sequence, Set, Tuple
# Add extra dependencies to the python path. `build_utils` is imported from
# the `build/android/gyp` directory under `_CHROME_SOURCE`, which is resolved
# as six directory levels above this script's directory (presumably the
# Chromium checkout root -- confirm if the script is ever moved).
_SCRIPT_DIR = os.path.realpath(os.path.dirname(__file__))
_CHROME_SOURCE = os.path.realpath(
    os.path.join(_SCRIPT_DIR, *[os.path.pardir] * 6))
# The path must be extended before the import below can succeed.
sys.path.append(os.path.join(_CHROME_SOURCE, 'build/android/gyp'))
from util import build_utils
# Initialize logger. Records are written to stdout and only errors are
# emitted by default; main() switches LOGGER to DEBUG when --verbose is set.
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)
LOGGER = logging.getLogger(__name__)
# List of unicode ranges for each symbol group (ranges are inclusive).
# Keys are the group names. main() builds these groups with
# search_only=False, and groups are produced in the order listed here.
SYMBOLS_GROUPS = {
    'Arrows': [
        # Arrows Unicode Block.
        (0x2190, 0x21ff),
        # Supplemental Arrows-C Unicode Block.
        # Note: There are unassigned code points in the block which are
        # automatically skipped by the script.
        (0x1f800, 0x1f8ff),
    ],
    'Bullet/Stars': [
        # Some rows from Miscellaneous Symbols and Arrows Unicode block.
        (0x2b20, 0x2b2f),
        (0x2b50, 0x2b5f),
        (0x2b90, 0x2b9f),
        (0x2bb0, 0x2bbf),
        (0x2bc0, 0x2bcf),
    ],
    'Currency': [
        # First two rows (U+20A0..U+20BF) of the Currency Symbols Unicode
        # Block.
        (0x20a0, 0x20bf),
    ],
    'Letterlike': [
        # First row of the Letterlike Symbols Unicode Block. The rest of the
        # block is listed in SEARCH_ONLY_SYMBOLS_GROUPS.
        (0x2100, 0x210f),
    ],
    'Math': [
        # Plain Greek letters. These code points belong to the Greek and
        # Coptic Unicode Block (not Mathematical Alphanumeric Symbols).
        # NOTE(review): ranges are inclusive, so `start + 25` spans 26 code
        # points; Alpha..Omega is 25, which means U+03AA / U+03CA (iota with
        # dialytika) are also covered -- confirm this is intended.
        # Normal Capital Letters.
        (0x0391, 0x0391 + 25),
        # Normal Small Letters.
        (0x03b1, 0x03b1 + 25),
        # Parts of the Mathematical Operators Unicode Block; the gaps between
        # these ranges are listed in SEARCH_ONLY_SYMBOLS_GROUPS.
        (0x2200, 0x2235),
        (0x2260, 0x228b),
    ],
    'Miscellaneous': [
        # Miscellaneous Symbols Unicode Block.
        (0x2600, 0x26ff)
    ],
}
# List of unicode ranges (inclusive) for each search only symbol group.
# main() builds these groups with search_only=True.
SEARCH_ONLY_SYMBOLS_GROUPS = {
    'Letterlike': [
        # Remainder of the Letterlike Symbols Unicode Block (the first row,
        # U+2100..U+210F, is in SYMBOLS_GROUPS).
        (0x2110, 0x214f),
    ],
    'Math': [
        # Greek Letters and Symbols from Mathematical and Alphanumeric
        # Symbols Unicode Block.
        # NOTE(review): ranges are inclusive, so `start + 25` spans 26 code
        # points; this appears to take in the nabla symbol that follows
        # capital Omega in each style -- confirm this is intended.
        # Bold Capital Letters.
        (0x1D6A8, 0x1D6A8 + 25),
        # Italic Capital Letters.
        (0x1D6E2, 0x1D6E2 + 25),
        # Bold-Italic Capital Letters.
        (0x1D71C, 0x1D71C + 25),
        # Mathematical Operators that are not already covered by the 'Math'
        # ranges in SYMBOLS_GROUPS.
        (0x2236, 0x225f),
        (0x228c, 0x22df),
    ],
    'Miscellaneous': [
        # Part of the Miscellaneous Technical Unicode Block (U+2300..U+23FF).
        (0x2300, 0x23cf)
    ],
}
# Individual symbols that fall inside the configured ranges but do not render
# with the fonts available on ChromeOS; main() adds them to the filter set so
# they are excluded from the generated groups.
INVALID_SYMBOLS = {
    '\u2BBA',
    '\u2BBB',
    '\u2BBC',
    '\u2B97',
    '\u2BC9',
    '\U0001F8B0',
    '\U0001F8B1',
}
@dataclasses.dataclass
class EmojiPickerChar:
    """A single character entry of EmojiPicker.

    Attributes:
        string: The unicode character itself.
        name: Name of the unicode character.
        keywords: Search keywords related to the unicode character. Each
            instance gets its own empty list unless one is supplied.
    """
    string: str
    name: str
    keywords: List[str] = dataclasses.field(default_factory=list)
@dataclasses.dataclass
class EmojiPickerEmoji:
    """An emoji/emoticon/symbol entry of EmojiPicker.

    Attributes:
        base: The base emoji.
        alternates: Variants and alternative emojis of the base emoji. Each
            instance gets its own empty list unless one is supplied.
    """
    base: EmojiPickerChar
    alternates: List[EmojiPickerChar] = dataclasses.field(default_factory=list)
@dataclasses.dataclass
class EmojiPickerGroup:
    """A named group of emoji/emoticon/symbol entries.

    Attributes:
        group: Name of the group.
        emoji: The emojis that belong to the group.
        search_only: Whether the group is search-only. Defaults to False.
    """
    group: str
    emoji: List[EmojiPickerEmoji]
    search_only: bool = False
def _convert_snake_case_to_camel_case(snake_case_input: str) -> str:
"""Converts an snake-case string to camel-case.
Args:
snake_case_input: String that is snake case.
Returns:
An string that is camel-case version of input.
"""
words = snake_case_input.split('_')
return words[0] + ''.join(word.title() for word in words[1:])
def _emoji_data_dict_factory(
        data: Sequence[Tuple[str, Any]]) -> Dict[str, Any]:
    """Builds a dictionary for emoji data from (key, value) pairs.

    Intended to be passed as `dict_factory` to `dataclasses.asdict`. Pairs
    whose value is an empty list are dropped, and the remaining keys are
    converted from snake-case to camel-case.

    Args:
        data: A sequence of (key, value) pairs.

    Returns:
        A dictionary created from the input sequence where keys with an empty
        list value are ignored and keys are converted to camel-case.
    """
    converted = {}
    for field_name, field_value in data:
        # Only empty *lists* are skipped; other falsy values (False, '', 0)
        # are kept in the output.
        if isinstance(field_value, list) and not field_value:
            continue
        converted[_convert_snake_case_to_camel_case(field_name)] = field_value
    return converted
def _load_emoji_characters_from_files(data_paths: List[str]) -> Set[str]:
"""Loads a set of emoji characters from a list of data file paths.
Args:
data_paths: A list of emoji data files.
Returns:
The set of emoji unicode characters read from the data.
"""
emoji_character_set = set()
for data_path in data_paths:
with open(data_path, 'r') as data_file:
emoji_groups = json.load(data_file)
file_character_set = {
emoji['base']['string']
for emoji_group in emoji_groups
for emoji in emoji_group['emoji']
}
emoji_character_set.update(file_character_set)
return emoji_character_set
def _convert_unicode_ranges_to_emoji_chars(
        unicode_ranges: List[Tuple[int, int]],
        ignore_errors: bool = True) -> Generator[EmojiPickerChar, None, None]:
    """Converts unicode ranges to `EmojiPickerChar` instances.

    Given a list of unicode ranges, it iterates over all characters in all the
    ranges and creates and yields an instance of `EmojiPickerChar` for each
    one. The name of each instance is the lower-cased `unicodedata` name of
    the character.

    Args:
        unicode_ranges: A list of (start, end) code point pairs; both ends
            are inclusive.
        ignore_errors: If True, code points that cannot be converted (for
            example, code points without a name in `unicodedata`) are logged
            as warnings and skipped.

    Raises:
        ValueError: If a unicode character does not exist in the data source
            and `ignore_errors` is False.

    Yields:
        The converted version of each unicode character in the input ranges.
    """
    LOGGER.info(
        'generating EmojiPickerChar instances for ranges: [%s].',
        ', '.join(
            '(U+{:02x}, U+{:02x})'.format(*rng)
            for rng in unicode_ranges))
    num_returned = 0
    num_ignored = 0
    # Iterate over the input unicode ranges.
    for (start_code_point, end_code_point) in unicode_ranges:
        LOGGER.debug(
            'generating EmojiPickerChar instances '
            'for range (U+%02x to U+%02x).',
            start_code_point,
            end_code_point)
        # Iterate over all code points in the range.
        for code_point in range(start_code_point, end_code_point + 1):
            # Only the lookups that can fail are guarded; the yield is kept
            # outside of the try block.
            try:
                unicode_character = chr(code_point)
                character_name = unicodedata.name(unicode_character)
            except ValueError:
                # If ignore_errors is False, raise the exception.
                if not ignore_errors:
                    raise
                num_ignored += 1
                LOGGER.warning(
                    'invalid code point U+%02x.', code_point)
                continue
            # Count only what is actually yielded so that the final stats
            # do not include the ignored code points.
            num_returned += 1
            yield EmojiPickerChar(
                string=unicode_character,
                name=character_name.lower())
    LOGGER.info(
        'stats: #returned instances: %d, #ignored code points: %d',
        num_returned,
        num_ignored)
def get_symbols_groups(
        group_unicode_ranges: Dict[str, List[Tuple[int, int]]],
        search_only: bool = False, ignore_errors: bool = True,
        filter_set: Optional[Set[str]] = None) -> List[EmojiPickerGroup]:
    """Creates symbols data from predefined groups and their unicode ranges.

    Args:
        group_unicode_ranges: A base mapping of group names to unicode ranges.
        search_only: If True, every created group is marked search-only.
        ignore_errors: Forwarded to the unicode range conversion. If True,
            characters that fail to convert are skipped instead of raising.
        filter_set: If not None, the characters that exist in this set are
            excluded from output symbol groups.

    Returns:
        One `EmojiPickerGroup` per entry of `group_unicode_ranges`, in the
        iteration order of the mapping.

    Raises:
        ValueError: If a unicode character does not exist in the data source
            and `ignore_errors` is False.
    """
    groups = []
    for name, ranges in group_unicode_ranges.items():
        LOGGER.info('generating symbols for group %s.', name)
        symbols = []
        for symbol_char in _convert_unicode_ranges_to_emoji_chars(
                ranges, ignore_errors=ignore_errors):
            # Drop characters that the caller asked to exclude.
            if filter_set is not None and symbol_char.string in filter_set:
                continue
            symbols.append(EmojiPickerEmoji(base=symbol_char))
        groups.append(
            EmojiPickerGroup(
                group=name, emoji=symbols, search_only=search_only))
    return groups
def main(argv: List[str]) -> None:
    """Generates the symbols data and writes it to a JSON file.

    Builds the regular and the search-only symbol groups, leaves out the
    characters found in the filter data files as well as INVALID_SYMBOLS,
    and writes the groups as compact UTF-8 encoded JSON to the output path.

    Args:
        argv: Command-line arguments without the program name. Supports
            `--output` (required), `--verbose` and `--filter-data-paths`
            (repeatable, one or more paths per occurrence).
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--output', required=True, type=str,
        help='Path to write the output JSON file.')
    arg_parser.add_argument(
        '--verbose', required=False, default=False,
        action='store_true',
        help="Set the logging level to Debug.")
    arg_parser.add_argument(
        '--filter-data-paths', action='append', nargs='+')
    options = arg_parser.parse_args(argv)

    if options.verbose:
        LOGGER.setLevel(level=logging.DEBUG)

    # `append` combined with `nargs='+'` produces a list of lists (or None
    # when the flag is absent); flatten it into a single list of paths.
    nested_paths = options.filter_data_paths
    if nested_paths is None:
        nested_paths = []
    filter_data_paths = [
        data_path
        for path_list in nested_paths
        for data_path in path_list]

    # Characters that already exist in other emoji data must be excluded
    # from symbols, together with the symbols that don't render on ChromeOS.
    excluded_chars = _load_emoji_characters_from_files(
        data_paths=filter_data_paths)
    excluded_chars |= INVALID_SYMBOLS

    # Regular symbol groups come first, followed by the search-only ones.
    all_groups = get_symbols_groups(
        group_unicode_ranges=SYMBOLS_GROUPS,
        filter_set=excluded_chars,
        search_only=False)
    search_only_groups = get_symbols_groups(
        group_unicode_ranges=SEARCH_ONLY_SYMBOLS_GROUPS,
        filter_set=excluded_chars,
        search_only=True)
    all_groups.extend(search_only_groups)

    # Convert the dataclass instances to plain dictionaries.
    group_dicts = [
        dataclasses.asdict(group, dict_factory=_emoji_data_dict_factory)
        for group in all_groups]

    # Write the result to output path as json file.
    with build_utils.AtomicOutput(options.output) as tmp_file:
        serialized = json.dumps(
            group_dicts,
            separators=(',', ':'),
            ensure_ascii=False)
        tmp_file.write(serialized.encode('utf-8'))
# Script entry point: forward the command-line arguments (without the program
# name) to main() only when the file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])