| #!/usr/bin/env python |
| # Copyright 2015 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Generate a compact character composition table. |
| |
| Normal use: |
| ./generate_character_composer_data.py \ |
| --output character_composer_data.h \ |
| character_composer_sequences.txt |
| |
| Run this script with --help for a description of arguments. |
| |
| Input file format: |
| |
| Comments begin with '#' and extend to the end of the line. |
| |
| Each non-comment line is a sequence of two or more keys, separated by |
| space, the last of which is the result of composing the initial part. |
| A sequence must begin with a dead key or Compose, and the result must |
| be a character key. |
| |
| Each key can either be a character key, a dead key, or the compose key. |
| |
| A character key is written as one of the following inside matched delimiters: |
| - a single printable ASCII character; |
| - a Unicode character name; |
| - 'U+' followed by one or more hexadecimal digits. |
| Delimiter pairs are any of '' "" () <> [] or {}. |
| |
| A dead key is written as the word 'Dead' followed (immediately, without space) |
| by the combining character written in the same form as a character key. |
| |
| A compose key is written as the word 'Compose'. |
| |
| Output file format: |
| |
| The output file is a C++ header containing a small header structure |
| |ui::TreeComposeChecker::CompositionData| and a tree of composition sequences. |
| |
| For space efficiency, the tree is stored in a single array of 16-bit values, |
| which represent either characters (printable or dead-key) or subtree array |
| indices. |
| |
| Each tree level consists for four tables: two key kinds (character or dead) |
| by two node types (internal or leaf), in the order: |
| - character internal |
| - character leaf |
| - dead-key internal |
| - dead-key leaf |
| This order is used because character key entries are more common than dead-key |
| entries, and internal entries are more common than leaf entries. |
| |
| Each table consists of a word containing the number of table entries |n|, |
| followed immediately by |n| key-value pairs of words, ordered by key. |
| For internal edges, the value is the array index of the resulting subtree. |
| For leaf edges, the value is the unicode character code of the composition |
| result. |
| """ |
| |
| import argparse |
| import codecs |
| import collections |
| import sys |
| import unicodedata |
| |
| # Global error counter. |
| g_fail = 0 |
| |
| class Key(str): |
| """Represents an element of a composition sequence. |
| |
| Supports only Compose, dead keys, and BMP unicode characters. |
| Based on |str| for easy comparison and sorting. |
| The representation is 'C' (for unicode characters) or 'D' (for dead keys) |
| followed by 4 hex digits for the character value. The Compose key is |
| represented as dead key with combining character 0. |
| """ |
| _kinds = ['character', 'dead_key'] |
| |
| def __new__(cls, key, character, location=None): |
| """Construct a Key. |
| Call as: |
| - Key(None, character_code) |
| - Key('Dead', combining_character_code) |
| - Key('Compose', 0) |
| """ |
| global g_fail |
| if character > 0xFFFF: |
| print '{}: unsupported non-BMP character {}'.format(location, character) |
| g_fail += 1 |
| s = 'ERROR' |
| elif key is None: |
| s = 'C{:04X}'.format(character) |
| elif key.lower() == 'dead': |
| s = 'D{:04X}'.format(character) |
| elif key.lower() == 'compose': |
| s = 'D0000' |
| else: |
| print '{}: unexpected combination {}<{}>'.format(location, key, character) |
| g_fail += 1 |
| s = 'ERROR' |
| return str.__new__(cls, s) |
| |
| def Kind(self): |
| return {'C': 'character', 'D': 'dead_key'}[self[0]] |
| |
| def CharacterCode(self): |
| return int(self[1:], 16) |
| |
| def UnicodeName(self): |
| v = self.CharacterCode() |
| try: |
| return unicodedata.name(unichr(v)).lower() |
| except ValueError: |
| return 'U+{:04X}'.format(v) |
| |
| def ShorterUnicodeName(self): |
| s = self.UnicodeName() |
| if s.startswith('latin ') or s.startswith('greek '): |
| s = s[6:] |
| if s.startswith('small '): |
| s = s[6:] |
| return s.replace(' letter ', ' ') |
| |
| def Pretty(self): |
| if self == 'D0000': |
| return 'Compose' |
| return ('Dead' if self[0] == 'D' else '') + '<' + self.UnicodeName() + '>' |
| |
| |
| class Input: |
| """ |
| Takes a sequences of file names and presents them as a single input stream, |
| with location reporting for error messages. |
| """ |
| def __init__(self, filenames): |
| self._pending = filenames |
| self._filename = None |
| self._file = None |
| self._line = None |
| self._lineno = 0 |
| self._column = 0 |
| |
| def Where(self): |
| """Report the current input location, for error messages.""" |
| if self._file: |
| return '{}:{}:{}'.format(self._filename, self._lineno, self._column) |
| if self._pending: |
| return '<before>' |
| return '<eof>' |
| |
| def Get(self): |
| """Return the next input character, or None when inputs are exhausted.""" |
| if self._line is None: |
| if self._file is None: |
| if not self._pending: |
| return None |
| self._filename = self._pending[0] |
| self._pending = self._pending[1:] |
| self._file = codecs.open(self._filename, mode='rb', encoding='utf-8') |
| self._lineno = 0 |
| self._lineno += 1 |
| self._line = self._file.readline() |
| if not self._line: |
| self._file = None |
| self._filename = None |
| return self.Get() |
| self._column = 0 |
| if self._column >= len(self._line): |
| self._line = None |
| return self.Get() |
| c = self._line[self._column] |
| self._column += 1 |
| return c |
| |
| |
| class Lexer: |
| """ |
| Breaks the input stream into a sequence of tokens, each of which is either |
| a Key or the string 'EOL'. |
| """ |
| def __init__(self, compose_input): |
| self._input = compose_input |
| |
| _delimiters = { |
| '"': '"', |
| "'": "'", |
| '<': '>', |
| '(': ')', |
| '[': ']', |
| '{': '}', |
| } |
| |
| def GetUntilDelimiter(self, e): |
| text = '' |
| c = self._input.Get() |
| while c and c != e: |
| text += c |
| c = self._input.Get() |
| return text |
| |
| def Get(self): |
| global g_fail |
| c = ' ' |
| while c and c.isspace() and c != '\n': |
| c = self._input.Get() |
| if not c: |
| return None |
| location = self._input.Where() |
| if c == '\n': |
| return 'EOL' |
| if c == '#': |
| self.GetUntilDelimiter('\n') |
| return 'EOL' |
| if c == '\\': |
| self.GetUntilDelimiter('\n') |
| return self.Get() |
| key = None |
| character = None |
| if c.isalnum(): |
| key = '' |
| while c and c.isalnum(): |
| key += c |
| c = self._input.Get() |
| if c in Lexer._delimiters: |
| s = self.GetUntilDelimiter(Lexer._delimiters[c]) |
| if len(s) == 1: |
| character = ord(s) |
| elif s.startswith('U+'): |
| character = int(s[2:], 16) |
| else: |
| try: |
| character = ord(unicodedata.lookup(s.upper())) |
| except KeyError: |
| g_fail += 1 |
| character = None |
| print '{}: unknown character name "{}"'.format(location, |
| s.encode('utf-8')) |
| return Key(key, character, location) |
| |
| def Where(self): |
| return self._input.Where() |
| |
| |
| class Parser: |
| """ |
| Takes a sequence of tokens from a Lexer and returns a tree of |
| composition sequences, represented as nested dictionaries where each |
| composition source element key is a dictionary key, and the final |
| composition result has a dictionary key of |None|. |
| """ |
| def __init__(self, lexer): |
| self._lexer = lexer |
| self._trie = {} |
| |
| def Parse(self): |
| global g_fail |
| self._trie = {} |
| while True: |
| seq = [] |
| t = self._lexer.Get() |
| if not t: |
| break |
| if t and t != 'EOL' and t.Kind() != 'dead_key': |
| g_fail += 1 |
| print ('{}: sequence does not begin with Compose or Dead key' |
| .format(self._lexer.Where())) |
| break |
| while t and t != 'EOL': |
| seq.append(t) |
| t = self._lexer.Get() |
| if not seq: |
| continue |
| self.AddToSimpleTree(self._trie, seq) |
| return self._trie |
| |
| def AddToSimpleTree(self, tree, seq): |
| first = seq[0] |
| rest = seq[1:] |
| if first not in tree: |
| tree[first] = {} |
| if len(rest) == 1: |
| # Leaf |
| tree[first][None] = rest[0] |
| else: |
| self.AddToSimpleTree(tree[first], rest) |
| |
| |
| class GroupedTree: |
| """ |
| Represents composition sequences in a manner close to the output format. |
| |
| The core of the representation is the |_tables| dictionary, which has |
| an entry for each kind of |Key|, each of which is a dictionary with |
| two entries, 'internal' and 'leaf', for the output tables, each being |
| a dictionary indexed by a composition sequence |Key|. For 'internal' |
| tables the dictionary values are |GroupedTree|s at the next level; |
| for 'leaf' tables the dictionary values are |Key| composition results. |
| """ |
| _key_kinds = Key._kinds |
| _sub_parts = ['internal', 'leaf'] |
| |
| def __init__(self, simple_trie, path=None): |
| if path is None: |
| path = [] |
| self.path = path |
| self.depth = len(path) |
| self.height = 0 |
| self.empty = True |
| self.location = -1 |
| |
| # Initialize |_tables|. |
| self._tables = {} |
| for k in self._key_kinds: |
| self._tables[k] = {} |
| for p in self._sub_parts: |
| self._tables[k][p] = {} |
| |
| # Build the tables. |
| for key in simple_trie: |
| if key is not None: |
| # Leaf table entry. |
| if None in simple_trie[key]: |
| self.empty = False |
| self._tables[key.Kind()]['leaf'][key] = simple_trie[key][None] |
| # Internal subtree entries. |
| v = GroupedTree(simple_trie[key], path + [key]) |
| if not v.empty: |
| self.empty = False |
| self._tables[key.Kind()]['internal'][key] = v |
| if self.height < v.height: |
| self.height = v.height |
| self.height += 1 |
| |
| def SubTrees(self): |
| """Returns a list of all sub-|GroupedTree|s of the current GroupTree.""" |
| r = [] |
| for k in self._key_kinds: |
| for key in sorted(self._tables[k]['internal']): |
| r.append(self._tables[k]['internal'][key]) |
| return r |
| |
| |
| class Assembler: |
| """Convert a parse tree via a GroupedTree to a C++ header.""" |
| |
| def __init__(self, args, dtree): |
| self._name = args.data_name |
| self._type = args.type_name |
| self._gtree = GroupedTree(dtree) |
| |
| def Write(self, out): |
| # First pass: determine table sizes and locations. |
| self.Pass(None, self._gtree) |
| |
| # Second pass: write the array. |
| out.write('\nstatic const uint16_t {}Tree[] = {{\n'.format(self._name)) |
| end = self.Pass(out, self._gtree) |
| out.write('};\n\n') |
| |
| # Write the description structure. |
| out.write('static const {} {} = {{\n' |
| .format(self._type, self._name)) |
| out.write(' {}, // maximum sequence length\n'.format(self._gtree.height)) |
| out.write(' {}, // tree array entries\n'.format(end)) |
| out.write(' {}Tree\n'.format(self._name)) |
| out.write('};\n\n') |
| |
| def Pass(self, out, gtree, location=0): |
| gtree.location = location |
| |
| # Write tree header. |
| if out: |
| out.write('\n // offset 0x{:04X}:\n'.format(location)) |
| if gtree.path: |
| out.write(' // prefix:\n') |
| for key in gtree.path: |
| out.write(' // {}\n'.format(key.Pretty())) |
| |
| # Write tables. |
| for k in gtree._key_kinds: |
| for p in gtree._sub_parts: |
| # Write table size. |
| location += 1 |
| if out: |
| out.write(' // {} {} table\n'.format(p, k)) |
| out.write(' 0x{:04X}, // number of entries\n' |
| .format(len(gtree._tables[k][p]))) |
| # Write table body. |
| for key in sorted(gtree._tables[k][p]): |
| location += 2 |
| if out: |
| out.write(' 0x{:04X}, // {}\n' |
| .format(key.CharacterCode(), key.ShorterUnicodeName())) |
| result = gtree._tables[k][p][key] |
| if p == 'internal': |
| out.write(' 0x{:04X},\n'.format(result.location)) |
| else: |
| out.write(' 0x{:04X}, // -> {}\n' |
| .format(result.CharacterCode(), |
| result.ShorterUnicodeName())) |
| |
| # Assemble subtrees of the current tree. |
| for t in gtree.SubTrees(): |
| location = self.Pass(out, t, location) |
| |
| return location |
| |
| |
| def main(argv): |
| parser = argparse.ArgumentParser() |
| parser.add_argument('--type_name', |
| default='ui::TreeComposeChecker::CompositionData') |
| parser.add_argument('--data_name', default='kCompositions') |
| parser.add_argument('--output', default='character_composer_data.h') |
| parser.add_argument('--guard', default=None) |
| parser.add_argument('inputs', nargs='*') |
| args = parser.parse_args(argv[1:]) |
| |
| parse_tree = Parser(Lexer(Input(args.inputs))).Parse() |
| |
| with (sys.stdout if args.output == '-' else open(args.output, 'wb')) as out: |
| out.write('// Copyright 2015 The Chromium Authors. All rights reserved.\n') |
| out.write('// Use of this source code is governed by a BSD-style license') |
| out.write(' that can be\n// found in the LICENSE file.\n\n') |
| out.write('// DO NOT EDIT.\n') |
| out.write('// GENERATED BY {}\n'.format(sys.argv[0])) |
| out.write('// FROM {}\n\n'.format(' '.join(args.inputs))) |
| guard = args.guard if args.guard else args.output |
| guard = ''.join([c.upper() if c.isalpha() else '_' for c in guard]) |
| out.write('#ifndef {0}_\n#define {0}_\n'.format(guard)) |
| Assembler(args, parse_tree).Write(out) |
| out.write('#endif // {}_\n'.format(guard)) |
| |
| return g_fail |
| |
| if __name__ == '__main__': |
| sys.exit(main(sys.argv)) |