#!/usr/bin/env python
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Generate a compact character composition table.
Normal use:
./generate_character_composer_data.py \
--output character_composer_data.h \
character_composer_sequences.txt
Run this script with --help for a description of arguments.
Input file format:
Comments begin with '#' and extend to the end of the line.
Each non-comment line is a sequence of two or more keys, separated by
space, the last of which is the result of composing the initial part.
A sequence must begin with a dead key or Compose, and the result must
be a character key.
Each key can either be a character key, a dead key, or the compose key.
A character key is written as one of the following inside matched delimiters:
- a single printable ASCII character;
- a Unicode character name;
- 'U+' followed by one or more hexadecimal digits.
Delimiter pairs are any of '' "" () <> [] or {}.
A dead key is written as the word 'Dead' followed (immediately, without space)
by the combining character written in the same form as a character key.
A compose key is written as the word 'Compose'.
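For example (illustrative lines only, not necessarily present in the real
sequence data), the following are well-formed:
  Dead<U+0301> 'e'  <latin small letter e with acute>  # U+0301, 'e' -> U+00E9
  Compose 'o' 'c'  <copyright sign>                    # Compose, 'o', 'c' -> U+00A9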
Output file format:
The output file is a C++ header containing a small header structure
|ui::TreeComposeChecker::CompositionData| and a tree of composition sequences.
For space efficiency, the tree is stored in a single array of 16-bit values,
which represent either characters (printable or dead-key) or subtree array
indices.
Each tree level consists of four tables: two key kinds (character or dead)
by two node types (internal or leaf), in the order:
- character internal
- character leaf
- dead-key internal
- dead-key leaf
This order is used because character key entries are more common than dead-key
entries, and internal entries are more common than leaf entries.
Each table consists of a word containing the number of table entries |n|,
followed immediately by |n| key-value pairs of words, ordered by key.
For internal edges, the value is the array index of the resulting subtree.
For leaf edges, the value is the Unicode character code of the composition
result.
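For example (an illustrative sketch, not drawn from the real data), a tree
containing only the single sequence Dead<U+0301> 'e' -> U+00E9 occupies
12 words:
  0x0000, 0x0000,          root: empty character internal and leaf tables
  0x0001, 0x0301, 0x0006,  root: one dead-key internal entry -> offset 6
  0x0000,                  root: empty dead-key leaf table
  0x0000,                  offset 6: empty character internal table
  0x0001, 0x0065, 0x00E9,  offset 6: one character leaf entry, 'e' -> U+00E9
  0x0000, 0x0000           offset 6: empty dead-key tables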
"""
import argparse
import codecs
import collections
import sys
import unicodedata
# Global error counter.
g_fail = 0
class Key(str):
"""Represents an element of a composition sequence.
Supports only Compose, dead keys, and BMP unicode characters.
Based on |str| for easy comparison and sorting.
The representation is 'C' (for unicode characters) or 'D' (for dead keys)
followed by 4 hex digits for the character value. The Compose key is
represented as a dead key with combining character 0.
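For example, Key(None, 0x61) is 'C0061', Key('Dead', 0x301) is 'D0301', and
Key('Compose', 0) is 'D0000'.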
"""
_kinds = ['character', 'dead_key']
def __new__(cls, key, character, location=None):
"""Construct a Key.
Call as:
- Key(None, character_code)
- Key('Dead', combining_character_code)
- Key('Compose', 0)
"""
global g_fail
if character > 0xFFFF:
print '{}: unsupported non-BMP character {}'.format(location, character)
g_fail += 1
s = 'ERROR'
elif key is None:
s = 'C{:04X}'.format(character)
elif key.lower() == 'dead':
s = 'D{:04X}'.format(character)
elif key.lower() == 'compose':
s = 'D0000'
else:
print '{}: unexpected combination {}<{}>'.format(location, key, character)
g_fail += 1
s = 'ERROR'
return str.__new__(cls, s)
def Kind(self):
return {'C': 'character', 'D': 'dead_key'}[self[0]]
def CharacterCode(self):
return int(self[1:], 16)
def UnicodeName(self):
v = self.CharacterCode()
try:
return unicodedata.name(unichr(v)).lower()
except ValueError:
return 'U+{:04X}'.format(v)
def ShorterUnicodeName(self):
s = self.UnicodeName()
if s.startswith('latin ') or s.startswith('greek '):
s = s[6:]
if s.startswith('small '):
s = s[6:]
return s.replace(' letter ', ' ')
def Pretty(self):
if self == 'D0000':
return 'Compose'
return ('Dead' if self[0] == 'D' else '') + '<' + self.UnicodeName() + '>'
class Input:
"""
Takes a sequence of file names and presents them as a single input stream,
with location reporting for error messages.
"""
def __init__(self, filenames):
self._pending = filenames
self._filename = None
self._file = None
self._line = None
self._lineno = 0
self._column = 0
def Where(self):
"""Report the current input location, for error messages."""
if self._file:
return '{}:{}:{}'.format(self._filename, self._lineno, self._column)
if self._pending:
return '<before>'
return '<eof>'
def Get(self):
"""Return the next input character, or None when inputs are exhausted."""
if self._line is None:
if self._file is None:
if not self._pending:
return None
self._filename = self._pending[0]
self._pending = self._pending[1:]
self._file = codecs.open(self._filename, mode='rb', encoding='utf-8')
self._lineno = 0
self._lineno += 1
self._line = self._file.readline()
if not self._line:
self._file = None
self._filename = None
return self.Get()
self._column = 0
if self._column >= len(self._line):
self._line = None
return self.Get()
c = self._line[self._column]
self._column += 1
return c
class Lexer:
"""
Breaks the input stream into a sequence of tokens, each of which is either
a Key or the string 'EOL'. Get() returns None once the input is exhausted.
"""
def __init__(self, compose_input):
self._input = compose_input
_delimiters = {
'"': '"',
"'": "'",
'<': '>',
'(': ')',
'[': ']',
'{': '}',
}
def GetUntilDelimiter(self, e):
text = ''
c = self._input.Get()
while c and c != e:
text += c
c = self._input.Get()
return text
def Get(self):
global g_fail
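# Skip whitespace, except that a newline is significant and becomes an EOL
# token below; |c| starts as a space so the loop runs at least once.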
c = ' '
while c and c.isspace() and c != '\n':
c = self._input.Get()
if not c:
return None
location = self._input.Where()
if c == '\n':
return 'EOL'
if c == '#':
self.GetUntilDelimiter('\n')
return 'EOL'
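# A backslash discards the rest of the line and keeps scanning on the next
# line without emitting an EOL, so it effectively acts as a line continuation.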
if c == '\\':
self.GetUntilDelimiter('\n')
return self.Get()
key = None
character = None
if c.isalnum():
key = ''
while c and c.isalnum():
key += c
c = self._input.Get()
if c in Lexer._delimiters:
s = self.GetUntilDelimiter(Lexer._delimiters[c])
if len(s) == 1:
character = ord(s)
elif s.startswith('U+'):
character = int(s[2:], 16)
else:
try:
character = ord(unicodedata.lookup(s.upper()))
except KeyError:
g_fail += 1
character = None
print '{}: unknown character name "{}"'.format(location,
s.encode('utf-8'))
return Key(key, character, location)
def Where(self):
return self._input.Where()
class Parser:
"""
Takes a sequence of tokens from a Lexer and returns a tree of
composition sequences, represented as nested dictionaries where each
composition source element key is a dictionary key, and the final
composition result has a dictionary key of |None|.
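For example, the single sequence Compose 'a' 'e' -> U+00E6 yields (roughly)
{Compose: {'a': {'e': {None: U+00E6}}}}, with every element stored as a |Key|.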
"""
def __init__(self, lexer):
self._lexer = lexer
self._trie = {}
def Parse(self):
global g_fail
self._trie = {}
while True:
seq = []
t = self._lexer.Get()
if not t:
break
if t and t != 'EOL' and t.Kind() != 'dead_key':
g_fail += 1
print ('{}: sequence does not begin with Compose or Dead key'
.format(self._lexer.Where()))
break
while t and t != 'EOL':
seq.append(t)
t = self._lexer.Get()
if not seq:
continue
self.AddToSimpleTree(self._trie, seq)
return self._trie
def AddToSimpleTree(self, tree, seq):
first = seq[0]
rest = seq[1:]
if first not in tree:
tree[first] = {}
if len(rest) == 1:
# Leaf
tree[first][None] = rest[0]
else:
self.AddToSimpleTree(tree[first], rest)
class GroupedTree:
"""
Represents composition sequences in a manner close to the output format.
The core of the representation is the |_tables| dictionary, which has
an entry for each kind of |Key|, each of which is a dictionary with
two entries, 'internal' and 'leaf', for the output tables, each being
a dictionary indexed by a composition sequence |Key|. For 'internal'
tables the dictionary values are |GroupedTree|s at the next level;
for 'leaf' tables the dictionary values are |Key| composition results.
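For example, if the only sequences continuing the current prefix are
'e' -> U+00E9 and Dead<U+0301> 'a' -> U+00E1, then _tables['character']['leaf']
maps 'e' to U+00E9, _tables['dead_key']['internal'] maps U+0301 to a child
|GroupedTree|, and the other two tables are empty.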
"""
_key_kinds = Key._kinds
_sub_parts = ['internal', 'leaf']
def __init__(self, simple_trie, path=None):
if path is None:
path = []
self.path = path
self.depth = len(path)
self.height = 0
self.empty = True
self.location = -1
# Initialize |_tables|.
self._tables = {}
for k in self._key_kinds:
self._tables[k] = {}
for p in self._sub_parts:
self._tables[k][p] = {}
# Build the tables.
for key in simple_trie:
if key is not None:
# Leaf table entry.
if None in simple_trie[key]:
self.empty = False
self._tables[key.Kind()]['leaf'][key] = simple_trie[key][None]
# Internal subtree entries.
v = GroupedTree(simple_trie[key], path + [key])
if not v.empty:
self.empty = False
self._tables[key.Kind()]['internal'][key] = v
if self.height < v.height:
self.height = v.height
self.height += 1
def SubTrees(self):
"""Returns a list of all sub-|GroupedTree|s of the current GroupTree."""
r = []
for k in self._key_kinds:
for key in sorted(self._tables[k]['internal']):
r.append(self._tables[k]['internal'][key])
return r
class Assembler:
"""Convert a parse tree via a GroupedTree to a C++ header."""
def __init__(self, args, dtree):
self._name = args.data_name
self._type = args.type_name
self._gtree = GroupedTree(dtree)
def Write(self, out):
# First pass: determine table sizes and locations.
self.Pass(None, self._gtree)
# Second pass: write the array.
out.write('\nstatic const uint16_t {}Tree[] = {{\n'.format(self._name))
end = self.Pass(out, self._gtree)
out.write('};\n\n')
# Write the description structure.
out.write('static const {} {} = {{\n'
.format(self._type, self._name))
out.write(' {}, // maximum sequence length\n'.format(self._gtree.height))
out.write(' {}, // tree array entries\n'.format(end))
out.write(' {}Tree\n'.format(self._name))
out.write('};\n\n')
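# Pass() is used for both passes: with |out| == None it only records each
# subtree's array offset in |location|; with an output stream it emits the
# array contents. It returns the offset just past the tree it visited.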
def Pass(self, out, gtree, location=0):
gtree.location = location
# Write tree header.
if out:
out.write('\n // offset 0x{:04X}:\n'.format(location))
if gtree.path:
out.write(' // prefix:\n')
for key in gtree.path:
out.write(' // {}\n'.format(key.Pretty()))
# Write tables.
for k in gtree._key_kinds:
for p in gtree._sub_parts:
# Write table size.
location += 1
if out:
out.write(' // {} {} table\n'.format(p, k))
out.write(' 0x{:04X}, // number of entries\n'
.format(len(gtree._tables[k][p])))
# Write table body.
for key in sorted(gtree._tables[k][p]):
location += 2
if out:
out.write(' 0x{:04X}, // {}\n'
.format(key.CharacterCode(), key.ShorterUnicodeName()))
result = gtree._tables[k][p][key]
if p == 'internal':
out.write(' 0x{:04X},\n'.format(result.location))
else:
out.write(' 0x{:04X}, // -> {}\n'
.format(result.CharacterCode(),
result.ShorterUnicodeName()))
# Assemble subtrees of the current tree.
for t in gtree.SubTrees():
location = self.Pass(out, t, location)
return location
def main(argv):
parser = argparse.ArgumentParser()
parser.add_argument('--type_name',
default='ui::TreeComposeChecker::CompositionData')
parser.add_argument('--data_name', default='kCompositions')
parser.add_argument('--output', default='character_composer_data.h')
parser.add_argument('--guard', default=None)
parser.add_argument('inputs', nargs='*')
args = parser.parse_args(argv[1:])
parse_tree = Parser(Lexer(Input(args.inputs))).Parse()
with (sys.stdout if args.output == '-' else open(args.output, 'wb')) as out:
out.write('// Copyright 2015 The Chromium Authors. All rights reserved.\n')
out.write('// Use of this source code is governed by a BSD-style license')
out.write(' that can be\n// found in the LICENSE file.\n\n')
out.write('// DO NOT EDIT.\n')
out.write('// GENERATED BY {}\n'.format(sys.argv[0]))
out.write('// FROM {}\n\n'.format(' '.join(args.inputs)))
guard = args.guard if args.guard else args.output
guard = ''.join([c.upper() if c.isalpha() else '_' for c in guard])
out.write('#ifndef {0}_\n#define {0}_\n'.format(guard))
Assembler(args, parse_tree).Write(out)
out.write('#endif // {}_\n'.format(guard))
return g_fail
if __name__ == '__main__':
sys.exit(main(sys.argv))