#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2017 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Helper for extracting ranges for wcwidth.
In print mode, we'll display the new tables. Useful for comparing against
older releases and debugging this script.
In update mode, we'll update lib_wc.js directly. Useful for lazy devs.
You'll need to provide the relevant Unicode database files.
The latest version can be found here:
https://www.unicode.org/Public/UNIDATA/UCD.zip
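
Example usage (the script name and paths here are illustrative only):
  ./ranges.py print
  ./ranges.py update --js ../js/lib_wc.js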
"""

from __future__ import print_function

import argparse
import os
import re
import sys


def load_proplist():
  """Return codepoints based on their various properties."""
  db = {}
  with open('PropList.txt', encoding='utf-8') as fp:
    data = fp.read()

  for line in data.splitlines():
    line = line.split('#', 1)[0].strip()
    if not line:
      continue

    codepoint, prop = line.split(';')
    if '..' in codepoint:
      first, last = codepoint.split('..')
    else:
      first = last = codepoint
    first = int(first, 16)
    last = int(last, 16)
    prop = prop.strip()
    db.setdefault(prop, set())
    db[prop].update(range(first, last + 1))

  return db


def load_unicode_data():
  """Return codepoints based on their General Category.

  See these docs for details on the UnicodeData.txt format.
  https://unicode.org/reports/tr44/#UnicodeData.txt
  https://unicode.org/reports/tr44/#General_Category_Values
  """
  db = {
      'Cc': set(),
      # Format Character: https://unicode.org/glossary/#format_character
      'Cf': set(),
      'Co': set(),
      'Cs': set(),
      'Ll': set(),
      'Lm': set(),
      'Lo': set(),
      'Lt': set(),
      'Lu': set(),
      'Mc': set(),
      # Enclosing Mark: https://unicode.org/glossary/#enclosing_mark
      'Me': set(),
      # Nonspacing Mark: https://unicode.org/glossary/#nonspacing_mark
      'Mn': set(),
      'Nd': set(),
      'Nl': set(),
      'No': set(),
      'Pc': set(),
      'Pd': set(),
      'Pe': set(),
      'Pf': set(),
      'Pi': set(),
      'Po': set(),
      'Ps': set(),
      'Sc': set(),
      'Sk': set(),
      'Sm': set(),
      'So': set(),
      'Zl': set(),
      'Zp': set(),
      'Zs': set(),
  }
  with open('UnicodeData.txt', encoding='utf-8') as fp:
    data = fp.read()

  for line in data.splitlines():
    line = line.split('#', 1)[0].strip()
    if not line:
      continue

    eles = line.split(';')
    codepoint = eles[0]
    cat = eles[2]
    assert cat in db, line
    db[cat].add(int(codepoint, 16))

  return db


def load_east_asian():
  """Return codepoints based on their East Asian width.

  See these docs for details on the EastAsianWidth.txt format.
  https://unicode.org/reports/tr44/#EastAsianWidth.txt
  https://www.unicode.org/reports/tr11/
  """
  db = {
      'A': set(),   # Ambiguous.
      'F': set(),   # Full-width.
      'H': set(),   # Half-width.
      'N': set(),   # Neutral.
      'Na': set(),  # Narrow.
      'W': set(),   # Wide.
  }
  with open('EastAsianWidth.txt', encoding='utf-8') as fp:
    data = fp.read()

  for line in data.splitlines():
    line = line.split('#', 1)[0].strip()
    if not line:
      continue

    codepoint, width = line.split(';')
    assert width in db, 'Missing classification: %s' % width
    if '..' in codepoint:
      first, last = codepoint.split('..')
    else:
      first = last = codepoint
    first = int(first, 16)
    last = int(last, 16)
    db[width].update(range(first, last + 1))

  # The Unicode database only lists allocated codepoints. While not a problem
  # by itself, it causes holes to appear in our output tables that we could
  # otherwise optimize away. Specifically, if a block of codepoints has been
  # preallocated for certain character classes, then we can pretty safely
  # assume that future codepoints in those blocks will have the same width.
  # Here we add entire blocks to fill in unallocated holes, which in turn
  # collapses adjacent ranges and shrinks the table. It also makes us a bit
  # more future-proof for when those codepoints are allocated but we haven't
  # yet updated. It's pretty uncommon to mix codepoints of different widths
  # in a single block, so this shouldn't bite us. If it does happen, the
  # add_block check below will catch it!
  def add_block(start, end):
    block = set(range(start, end + 1))
    db['W'].update(block)

    # Make sure none of these codepoints were allocated with a different
    # width already. This is future proofing to avoid silent corruption.
    non_wide = db['A'] | db['H'] | db['N'] | db['Na']
    overlap = ['U+%04X' % x for x in sorted(non_wide & block)]
    assert not overlap, 'duplicates found: %s' % overlap

  # CJK Radicals Supplement.
  add_block(0x2e80, 0x2eff)
  # Kangxi Radicals.
  add_block(0x2f00, 0x2fdf)
  # Ideographic Description Characters.
  add_block(0x2ff0, 0x2fff)
  # Hiragana.
  add_block(0x3040, 0x309f)
  # Katakana.
  add_block(0x30a0, 0x30ff)
  # Bopomofo.
  add_block(0x3100, 0x312f)
  # Hangul Compatibility Jamo.
  add_block(0x3130, 0x318f)
  # Bopomofo Extended.
  add_block(0x31a0, 0x31bf)
  # CJK Strokes.
  add_block(0x31c0, 0x31ef)
  # Enclosed CJK Letters and Months.
  # This block has a few narrow chars in the middle.
  add_block(0x3200, 0x3247)
  add_block(0x3250, 0x32ff)
  # CJK Compatibility.
  add_block(0x3300, 0x33ff)
  # CJK Unified Ideographs Extension A.
  add_block(0x3400, 0x4dbf)
  # CJK Unified Ideographs.
  add_block(0x4e00, 0x9fff)
  # Yi Syllables.
  add_block(0xa000, 0xa48f)
  # Yi Radicals.
  add_block(0xa490, 0xa4cf)
  # Hangul Jamo Extended-A.
  add_block(0xa960, 0xa97f)
  # CJK Compatibility Ideographs.
  add_block(0xf900, 0xfaff)
  # Small Form Variants.
  add_block(0xfe50, 0xfe6f)
  # Tangut.
  add_block(0x17000, 0x187ff)
  # Tangut Components.
  add_block(0x18800, 0x18aff)
  # Kana Supplement.
  add_block(0x1b000, 0x1b0ff)
  # Kana Extended-A.
  add_block(0x1b100, 0x1b12f)
  # Nushu.
  add_block(0x1b170, 0x1b2ff)

  return db


def gen_table(codepoints):
  """Generate a binary search table using |codepoints|."""
  codepoints = sorted(codepoints)

  ranges = []
  start = last = codepoints.pop(0)
  for codepoint in codepoints:
    if codepoint != last + 1:
      ranges.append([start, last])
      start = last = codepoint
    else:
      last = codepoint
  ranges.append([start, last])

  return ranges


def js_dumps(ranges):
  """Dump a binary search table |ranges| as a JavaScript array literal.

  This is currently ad-hoc code, but it could easily use the json module.
  We do this to have better control over the output format.
  """
  ret = '[\n'

  i = 0
  for r in ranges:
    if i == 0:
      # Indent this new line.
      ret += '  '
    else:
      # Add a space after the previous element.
      ret += ' '
    ret += '[%#06x, %#06x],' % (r[0], r[1])

    i += 1
    if i == 3:
      ret += '\n'
      i = 0
  if i:
    ret += '\n'

  ret += '];\n'
  return ret


def gen_combining(db, prop_db):
  """Generate the table of all zero-width/combining characters."""
  # The classes that are explicitly zero width.
  combining_chars = db['Me'] | db['Mn'] | db['Cf']

  # A note on the soft hyphen (U+00AD): previous versions that used the
  # tables from Markus Kuhn's code marked it with a wcwidth of 1. Unicode
  # has since changed it to a wcwidth of 0, and Chrome and Firefox treat it
  # as invisible. We now treat it as 0 to keep aligned with those platforms.
  # If they change, then we can adapt as well.
  # https://www.cs.tut.fi/~jkorpela/shy.html
  # https://sourceware.org/bugzilla/show_bug.cgi?id=22073
  # https://github.com/jquast/wcwidth/issues/8

  # Remove the Arabic signs spanning numbers (U+0600..U+0605).
  # https://www.unicode.org/versions/Unicode10.0.0/ch09.pdf
  # Unicode 10.0.0 chapter 9 section 2 page 377 states:
  #   Signs Spanning Numbers. Several other special signs are written in
  #   association with numbers in the Arabic script. All of these signs can
  #   span multiple-digit numbers, rather than just a single digit. They are
  #   not formally considered combining marks in the sense used by the
  #   Unicode Standard, although they clearly interact graphically with
  #   their associated sequence of digits. In the text representation they
  #   precede the sequence of digits that they span, rather than follow a
  #   base character, as would be the case for a combining mark. Their
  #   General_Category value is Cf (format character). Unlike most other
  #   format characters, however, they should be rendered with a visible
  #   glyph, even in circumstances where no suitable digit or sequence of
  #   digits follows them in logical order.
  #
  #   A few similar signs spanning numbers or letters are associated with
  #   scripts other than Arabic. See the discussion of U+070F syriac
  #   abbreviation mark in Section 9.3, Syriac, and the discussion of
  #   U+110BD kaithi number sign in Section 15.2, Kaithi. All of these
  #   prefixed format controls, including the non-Arabic ones, are given the
  #   property value Prepended_Concatenation_Mark=True, to identify them as
  #   a class. They also have special behavior in text segmentation. (See
  #   Unicode Standard Annex #29, "Unicode Text Segmentation.")
  combining_chars -= set(prop_db['Prepended_Concatenation_Mark'])
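  # For reference (informational only; the data file is authoritative), in
  # Unicode 10 that property covers U+0600..U+0605, U+06DD, U+070F, U+08E2,
  # and U+110BD.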

  # Add the Hangul Jungseong and Jongseong block (U+1160..U+11FF).
  # While they are not marked as combining characters, they are used
  # with the Hangul Choseong block to form complete characters.
  # TODO: This is actually more nuanced than "always 1" or "always 0".
  # https://sourceware.org/bugzilla/show_bug.cgi?id=22074
  combining_chars |= set(range(0x1160, 0x11FF + 1))

  return gen_table(combining_chars)


def gen_east_asian(db):
  """Generate the table of all explicitly wide East Asian characters."""
  return gen_table(db['W'] | db['F'])


def gen_east_asian_ambiguous(db):
  """Generate the table of explicit & ambiguous wide East Asian characters."""
  return gen_table(db['W'] | db['F'] | db['A'])


def find_js(js):
  """Locate the JavaScript file to update."""
  if js is None:
    js = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      'lib_wc.js')
  return js


def get_parser():
  """Return an argparse parser for the CLI."""
  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument('--js',
                      help='JavaScript file to update')
  parser.add_argument('action', choices=('print', 'update'),
                      help='Operating mode')
  return parser


def main(argv):
  """The main entry point!"""
  parser = get_parser()
  opts = parser.parse_args(argv)

  prop_db = load_proplist()
  uni_db = load_unicode_data()
  cjk_db = load_east_asian()

  tables = (
      ('lib.wc.combining', js_dumps(gen_combining(uni_db, prop_db))),
      ('lib.wc.unambiguous', js_dumps(gen_east_asian(cjk_db))),
      ('lib.wc.ambiguous', js_dumps(gen_east_asian_ambiguous(cjk_db))),
  )

  if opts.action == 'print':
    for name, text in tables:
      print(name + ' = ' + text)
  else:
    js = find_js(opts.js)
    with open(js, encoding='utf-8') as fp:
      data = fp.read()
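
    # Swap each table out in place. The pattern runs from the "<name> = ["
    # line through the closing "];" line: re.M anchors ^/$ at line breaks,
    # re.S lets .*? span them, and re.escape() keeps the dots in names like
    # "lib.wc.combining" literal.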
    for name, text in tables:
      data = re.sub(r'^%s = .*?^\];\n$' % re.escape(name),
                    name + ' = ' + text, data, flags=re.M | re.S)

    with open(js, 'w', encoding='utf-8') as fp:
      fp.write(data)


if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))