#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2017 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Helper for extracting ranges for wcwidth.
In print mode, we'll display the new tables. Useful for comparing against
older releases and debugging this script.
In update mode, we'll update lib_wc.js directly. Useful for lazy devs.
You'll need to provide the relevant Unicode database files.
The latest version can be found here:
https://www.unicode.org/Public/UNIDATA/UCD.zip
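
Example usage (the script name and paths here are illustrative only):
  ./ranges.py print
  ./ranges.py update --js ../js/lib_wc.js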
"""

from __future__ import print_function

import argparse
import os
import re
import sys


def load_proplist():
  """Return codepoints based on their various properties."""
  db = {}
  with open('PropList.txt', encoding='utf-8') as fp:
    data = fp.read()

  for line in data.splitlines():
    line = line.split('#', 1)[0].strip()
    if not line:
      continue

    codepoint, prop = line.split(';')
    if '..' in codepoint:
      first, last = codepoint.split('..')
    else:
      first = last = codepoint
    first = int(first, 16)
    last = int(last, 16)
    prop = prop.strip()
    db.setdefault(prop, set())
    db[prop].update(range(first, last + 1))

  return db


def load_unicode_data():
  """Return codepoints based on their General Category.

  See these docs for details on the UnicodeData.txt format.
  https://unicode.org/reports/tr44/#UnicodeData.txt
  https://unicode.org/reports/tr44/#General_Category_Values
  """
  db = {
      'Cc': set(),
      # Format Character: https://unicode.org/glossary/#format_character
      'Cf': set(),
      'Co': set(),
      'Cs': set(),
      'Ll': set(),
      'Lm': set(),
      'Lo': set(),
      'Lt': set(),
      'Lu': set(),
      'Mc': set(),
      # Enclosing Mark: https://unicode.org/glossary/#enclosing_mark
      'Me': set(),
      # Nonspacing Mark: https://unicode.org/glossary/#nonspacing_mark
      'Mn': set(),
      'Nd': set(),
      'Nl': set(),
      'No': set(),
      'Pc': set(),
      'Pd': set(),
      'Pe': set(),
      'Pf': set(),
      'Pi': set(),
      'Po': set(),
      'Ps': set(),
      'Sc': set(),
      'Sk': set(),
      'Sm': set(),
      'So': set(),
      'Zl': set(),
      'Zp': set(),
      'Zs': set(),
  }
  with open('UnicodeData.txt', encoding='utf-8') as fp:
    data = fp.read()

  for line in data.splitlines():
    line = line.split('#', 1)[0].strip()
    if not line:
      continue

    eles = line.split(';')
    codepoint = eles[0]
    cat = eles[2]
    assert cat in db, line
    db[cat].add(int(codepoint, 16))

  return db


def load_east_asian():
  """Return codepoints based on their East Asian width.

  See these docs for details on the EastAsianWidth.txt format.
  https://unicode.org/reports/tr44/#EastAsianWidth.txt
  https://www.unicode.org/reports/tr11/
  """
  db = {
      'A': set(),   # Ambiguous.
      'F': set(),   # Full-width.
      'H': set(),   # Half-width.
      'N': set(),   # Neutral.
      'Na': set(),  # Narrow.
      'W': set(),   # Wide.
  }
  with open('EastAsianWidth.txt', encoding='utf-8') as fp:
    data = fp.read()

  for line in data.splitlines():
    line = line.split('#', 1)[0].strip()
    if not line:
      continue

    codepoint, width = line.split(';')
    assert width in db, 'Missing classification: %s' % width
    if '..' in codepoint:
      first, last = codepoint.split('..')
    else:
      first = last = codepoint
    first = int(first, 16)
    last = int(last, 16)
    db[width].update(range(first, last + 1))

  # The Unicode database only lists allocated codepoints. While not a problem
  # by itself, it causes holes to appear in our output tables that we could
  # otherwise optimize away. Specifically, if a block of codepoints has been
  # preallocated for certain character classes, then we can pretty safely
  # assume that future codepoints in those blocks will have the same width.
  # Here we add entire blocks to fill in unallocated holes, which in turn
  # collapses adjacent ranges and shrinks the table. It also makes us a bit
  # more future-proof for when those codepoints are allocated but we haven't
  # yet updated. It's pretty uncommon to mix codepoints of different widths
  # in a single block, so this shouldn't bite us. If it does happen, the
  # add_block check below will catch it!
  def add_block(start, end):
    block = set(range(start, end + 1))
    db['W'].update(block)

    # Make sure none of these codepoints were allocated with a different
    # width already. This is future proofing to avoid silent corruption.
    non_wide = db['A'] | db['H'] | db['N'] | db['Na']
    overlap = ['U+%04X' % x for x in sorted(non_wide & block)]
    assert not overlap, 'duplicates found: %s' % overlap

  # CJK Radicals Supplement.
  add_block(0x2e80, 0x2eff)
  # Kangxi Radicals.
  add_block(0x2f00, 0x2fdf)
  # Ideographic Description Characters.
  add_block(0x2ff0, 0x2fff)
  # Hiragana.
  add_block(0x3040, 0x309f)
  # Katakana.
  add_block(0x30a0, 0x30ff)
  # Bopomofo.
  add_block(0x3100, 0x312f)
  # Hangul Compatibility Jamo.
  add_block(0x3130, 0x318f)
  # Bopomofo Extended.
  add_block(0x31a0, 0x31bf)
  # CJK Strokes.
  add_block(0x31c0, 0x31ef)
  # Enclosed CJK Letters and Months.
  # This block has a few narrow chars in the middle.
  add_block(0x3200, 0x3247)
  add_block(0x3250, 0x32ff)
  # CJK Compatibility.
  add_block(0x3300, 0x33ff)
  # CJK Unified Ideographs Extension A.
  add_block(0x3400, 0x4dbf)
  # CJK Unified Ideographs.
  add_block(0x4e00, 0x9fff)
  # Yi Syllables.
  add_block(0xa000, 0xa48f)
  # Yi Radicals.
  add_block(0xa490, 0xa4cf)
  # Hangul Jamo Extended-A.
  add_block(0xa960, 0xa97f)
  # CJK Compatibility Ideographs.
  add_block(0xf900, 0xfaff)
  # Small Form Variants.
  add_block(0xfe50, 0xfe6f)
  # Tangut.
  add_block(0x17000, 0x187ff)
  # Tangut Components.
  add_block(0x18800, 0x18aff)
  # Kana Supplement.
  add_block(0x1b000, 0x1b0ff)
  # Kana Extended-A.
  add_block(0x1b100, 0x1b12f)
  # Nushu.
  add_block(0x1b170, 0x1b2ff)

  return db


def gen_table(codepoints):
  """Generate a binary search table using |codepoints|."""
  codepoints = sorted(codepoints)

  ranges = []
  start = last = codepoints.pop(0)
  for codepoint in codepoints:
    if codepoint != last + 1:
      ranges.append([start, last])
      start = last = codepoint
    else:
      last = codepoint
  ranges.append([start, last])

  return ranges


def js_dumps(ranges):
  """Dump a binary search table |ranges| as a JavaScript array literal.

  This is currently ad-hoc code, but it could easily use the json module.
  We do this to have better control over the output format.
  """
  ret = '[\n'

  i = 0
  for r in ranges:
    if i == 0:
      # Indent this new line.
      ret += '  '
    else:
      # Add a space after the previous element.
      ret += ' '
    ret += '[%#06x, %#06x],' % (r[0], r[1])

    i += 1
    if i == 3:
      ret += '\n'
      i = 0
  if i:
    ret += '\n'

  ret += '];\n'
  return ret


def gen_combining(db, prop_db):
  """Generate the table of all zero-width/combining characters."""
  # The classes that are explicitly zero width.
  combining_chars = db['Me'] | db['Mn'] | db['Cf']

  # A note on the soft hyphen (U+00AD): previous versions that used the
  # tables from Markus Kuhn's code marked it with a wcwidth of 1. Unicode
  # has since changed it to a wcwidth of 0, and Chrome and Firefox treat it
  # as invisible. We now treat it as 0 to keep aligned with those platforms.
  # If they change, then we can adapt as well.
  # https://www.cs.tut.fi/~jkorpela/shy.html
  # https://sourceware.org/bugzilla/show_bug.cgi?id=22073
  # https://github.com/jquast/wcwidth/issues/8

  # Remove the Arabic signs spanning numbers (U+0600..U+0605).
  # https://www.unicode.org/versions/Unicode10.0.0/ch09.pdf
  # Unicode 10.0.0 chapter 9 section 2 page 377 states:
  #   Signs Spanning Numbers. Several other special signs are written in
  #   association with numbers in the Arabic script. All of these signs can
  #   span multiple-digit numbers, rather than just a single digit. They are
  #   not formally considered combining marks in the sense used by the
  #   Unicode Standard, although they clearly interact graphically with
  #   their associated sequence of digits. In the text representation they
  #   precede the sequence of digits that they span, rather than follow a
  #   base character, as would be the case for a combining mark. Their
  #   General_Category value is Cf (format character). Unlike most other
  #   format characters, however, they should be rendered with a visible
  #   glyph, even in circumstances where no suitable digit or sequence of
  #   digits follows them in logical order.
  #
  #   A few similar signs spanning numbers or letters are associated with
  #   scripts other than Arabic. See the discussion of U+070F syriac
  #   abbreviation mark in Section 9.3, Syriac, and the discussion of
  #   U+110BD kaithi number sign in Section 15.2, Kaithi. All of these
  #   prefixed format controls, including the non-Arabic ones, are given the
  #   property value Prepended_Concatenation_Mark=True, to identify them as
  #   a class. They also have special behavior in text segmentation. (See
  #   Unicode Standard Annex #29, "Unicode Text Segmentation.")
  combining_chars -= set(prop_db['Prepended_Concatenation_Mark'])
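  # For reference (informational only; the data file is authoritative), in
  # Unicode 10 that property covers U+0600..U+0605, U+06DD, U+070F, U+08E2,
  # and U+110BD.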

  # Add the Hangul Jungseong and Jongseong block (U+1160..U+11FF).
  # While they are not marked as combining characters, they are used
  # with the Hangul Choseong block to form complete characters.
  # TODO: This is actually more nuanced than "always 1" or "always 0".
  # https://sourceware.org/bugzilla/show_bug.cgi?id=22074
  combining_chars |= set(range(0x1160, 0x11FF + 1))

  return gen_table(combining_chars)


def gen_east_asian(db):
  """Generate the table of all explicitly wide East Asian characters."""
  return gen_table(db['W'] | db['F'])


def gen_east_asian_ambiguous(db):
  """Generate the table of explicit & ambiguous wide East Asian characters."""
  return gen_table(db['W'] | db['F'] | db['A'])


def find_js(js):
  """Locate the JavaScript file to update."""
  if js is None:
    js = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      'lib_wc.js')
  return js


def get_parser():
  """Return an argparse parser for the CLI."""
  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument('--js',
                      help='JavaScript file to update')
  parser.add_argument('action', choices=('print', 'update'),
                      help='Operating mode')
  return parser


def main(argv):
  """The main entry point!"""
  parser = get_parser()
  opts = parser.parse_args(argv)

  prop_db = load_proplist()
  uni_db = load_unicode_data()
  cjk_db = load_east_asian()

  tables = (
      ('lib.wc.combining', js_dumps(gen_combining(uni_db, prop_db))),
      ('lib.wc.unambiguous', js_dumps(gen_east_asian(cjk_db))),
      ('lib.wc.ambiguous', js_dumps(gen_east_asian_ambiguous(cjk_db))),
  )

  if opts.action == 'print':
    for name, text in tables:
      print(name + ' = ' + text)
  else:
    js = find_js(opts.js)
    with open(js, encoding='utf-8') as fp:
      data = fp.read()
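
    # Swap each table out in place. The pattern runs from the "<name> = ["
    # line through the closing "];" line: re.M anchors ^/$ at line breaks,
    # re.S lets .*? span them, and re.escape() keeps the dots in names like
    # "lib.wc.combining" literal.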
    for name, text in tables:
      data = re.sub(r'^%s = .*?^\];\n$' % re.escape(name),
                    name + ' = ' + text, data, flags=re.M | re.S)

    with open(js, 'w', encoding='utf-8') as fp:
      fp.write(data)


if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))