blob: 5e4a05792ad713d5692441e314ddc379a96fb79d [file] [log] [blame] [edit]
#!/usr/bin/env python3
import argparse, collections, datetime, os, re, sys, unicodedata
from urllib.request import urlopen
# Use intranges.intranges_from_list() from the sibling idna directory
sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "idna"))
from intranges import intranges_from_list
# Refuse to run on anything older than Python 3; exit status 2 signals the failure.
if sys.version_info < (3,):
    print("Only Python 3 supported.")
    raise SystemExit(2)
# Unicode release used when --version is left at its "preferred" default.
PREFERRED_VERSION = "16.0.0"
# URL templates; UnicodeData._ucdfile() fills in {version} and {filename}.
UCD_URL = "http://www.unicode.org/Public/{version}/ucd/{filename}"
UTS46_URL = "http://www.unicode.org/Public/idna/{version}/{filename}"
# Cache directory used when --cache is not given ("~" is expanded when the path is built).
DEFAULT_CACHE_DIR = "~/.cache/unidata"
# Scripts affected by IDNA contextual rules
SCRIPT_WHITELIST = sorted(["Greek", "Han", "Hebrew", "Hiragana", "Katakana"])
# Used to piece apart UTS#46 data for Jython compatibility
UTS46_SEGMENT_SIZE = 100
# Maps the status keyword found in IdnaMappingTable.txt to a pair of
# (status code written to the generated table, whether a mapping column is read).
UTS46_STATUSES = {
    "valid": ("V", False),
    "ignored": ("I", False),
    "mapped": ("M", True),
    "deviation": ("D", True),
    "disallowed": ("X", False),
    "disallowed_STD3_valid": ("3", False),
    "disallowed_STD3_mapped": ("3", True),
}
# Exceptions are manually assigned in Section 2.6 of RFC 5892.
# Keys are code points; values are the IDNA 2008 status that CodePoint.idna2008_status
# returns for them before any other rule is considered.
exceptions = {
    0x00DF: "PVALID",  # LATIN SMALL LETTER SHARP S
    0x03C2: "PVALID",  # GREEK SMALL LETTER FINAL SIGMA
    0x06FD: "PVALID",  # ARABIC SIGN SINDHI AMPERSAND
    0x06FE: "PVALID",  # ARABIC SIGN SINDHI POSTPOSITION MEN
    0x0F0B: "PVALID",  # TIBETAN MARK INTERSYLLABIC TSHEG
    0x3007: "PVALID",  # IDEOGRAPHIC NUMBER ZERO
    0x00B7: "CONTEXTO",  # MIDDLE DOT
    0x0375: "CONTEXTO",  # GREEK LOWER NUMERAL SIGN (KERAIA)
    0x05F3: "CONTEXTO",  # HEBREW PUNCTUATION GERESH
    0x05F4: "CONTEXTO",  # HEBREW PUNCTUATION GERSHAYIM
    0x30FB: "CONTEXTO",  # KATAKANA MIDDLE DOT
    0x0660: "CONTEXTO",  # ARABIC-INDIC DIGIT ZERO
    0x0661: "CONTEXTO",  # ARABIC-INDIC DIGIT ONE
    0x0662: "CONTEXTO",  # ARABIC-INDIC DIGIT TWO
    0x0663: "CONTEXTO",  # ARABIC-INDIC DIGIT THREE
    0x0664: "CONTEXTO",  # ARABIC-INDIC DIGIT FOUR
    0x0665: "CONTEXTO",  # ARABIC-INDIC DIGIT FIVE
    0x0666: "CONTEXTO",  # ARABIC-INDIC DIGIT SIX
    0x0667: "CONTEXTO",  # ARABIC-INDIC DIGIT SEVEN
    0x0668: "CONTEXTO",  # ARABIC-INDIC DIGIT EIGHT
    0x0669: "CONTEXTO",  # ARABIC-INDIC DIGIT NINE
    0x06F0: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT ZERO
    0x06F1: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT ONE
    0x06F2: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT TWO
    0x06F3: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT THREE
    0x06F4: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT FOUR
    0x06F5: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT FIVE
    0x06F6: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT SIX
    0x06F7: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT SEVEN
    0x06F8: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT EIGHT
    0x06F9: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT NINE
    0x0640: "DISALLOWED",  # ARABIC TATWEEL
    0x07FA: "DISALLOWED",  # NKO LAJANYALAN
    0x302E: "DISALLOWED",  # HANGUL SINGLE DOT TONE MARK
    0x302F: "DISALLOWED",  # HANGUL DOUBLE DOT TONE MARK
    0x3031: "DISALLOWED",  # VERTICAL KANA REPEAT MARK
    0x3032: "DISALLOWED",  # VERTICAL KANA REPEAT WITH VOICED SOUND MARK
    0x3033: "DISALLOWED",  # VERTICAL KANA REPEAT MARK UPPER HALF
    0x3034: "DISALLOWED",  # VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF
    0x3035: "DISALLOWED",  # VERTICAL KANA REPEAT MARK LOWER HALF
    0x303B: "DISALLOWED",  # VERTICAL IDEOGRAPHIC ITERATION MARK
}
# Per-code-point overrides consulted right after `exceptions` (see CodePoint.compat_value);
# currently empty. Presumably the BackwardCompatible set of RFC 5892 Section 2.7 -- confirm.
backwardscompatible = {}
def hexrange(start, end):
    """Return the inclusive integer range spanned by two hexadecimal strings."""
    first = int(start, 16)
    last = int(end, 16)
    return range(first, last + 1)
def hexvalue(value):
    """Parse a hexadecimal string into an integer."""
    return int(value, base=16)
# A literal backslash, "u", and exactly four hexadecimal digits.
_RE_UNICODE = re.compile("\\\\u([0-9a-fA-F]{4})")
# A high surrogate immediately followed by a low surrogate.
_RE_SURROGATE = re.compile("[\ud800-\udbff][\udc00-\udfff]")


def unicode_fixup(string):
    """Replace backslash-u-XXXX with appropriate unicode characters.

    Escapes are expanded first; any surrogate pair that results is then
    combined into the single supplementary-plane character it encodes.
    """

    def _expand_escape(match):
        return chr(int(match.group(1), 16))

    def _combine_pair(match):
        high, low = match.group(0)
        return chr((ord(high) - 0xD800) * 0x400 + ord(low) - 0xDC00 + 0x10000)

    expanded = _RE_UNICODE.sub(_expand_escape, string)
    return _RE_SURROGATE.sub(_combine_pair, expanded)
class UnicodeVersion(object):
    """A Unicode version number ("x.y.z") or the moving target "latest".

    "latest" orders above every numbered version and never compares equal to
    anything, itself included. Instances are not hashable because ``__eq__``
    is defined without ``__hash__``.

    Attributes:
        latest: True when constructed from the string "latest".
        major, minor, patch: integer components (numbered versions only).
        numerical: legacy packed integer ``(major << 8) + (minor << 4) + patch``
            (numbered versions only). Kept for existing readers; comparisons
            no longer use it because it collides once minor or patch reach 16.
    """

    def __init__(self, version):
        """Parse *version*; raise ValueError if it is neither "x.y.z" nor "latest"."""
        result = re.match(r"^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)$", version)
        if result:
            self.major = int(result.group("major"))
            self.minor = int(result.group("minor"))
            self.patch = int(result.group("patch"))
            self.numerical = (self.major << 8) + (self.minor << 4) + self.patch
            self.latest = False
        elif version == "latest":
            self.latest = True
        else:
            raise ValueError("Unrecognized Unicode version")

    def _key(self):
        """Return the (major, minor, patch) tuple used for ordering numbered versions."""
        return (self.major, self.minor, self.patch)

    def __repr__(self, with_date=True):
        """Return "x.y.z", or "latest" optionally suffixed with today's date."""
        if self.latest:
            if with_date:
                return "latest@{}".format(datetime.datetime.now().strftime("%Y-%m-%d"))
            return "latest"
        return "{}.{}.{}".format(self.major, self.minor, self.patch)

    @property
    def tag(self):
        """The version string without any date suffix."""
        return self.__repr__(with_date=False)

    def __gt__(self, other):
        if not isinstance(other, UnicodeVersion):
            return NotImplemented
        if self.latest:
            return True
        if other.latest:
            # A numbered version is never newer than "latest".
            return False
        return self._key() > other._key()

    def __lt__(self, other):
        # Mirror of __gt__, so "a < b" does not depend on operator reflection.
        if not isinstance(other, UnicodeVersion):
            return NotImplemented
        return other.__gt__(self)

    def __eq__(self, other):
        if not isinstance(other, UnicodeVersion):
            return NotImplemented
        if self.latest or other.latest:
            return False
        return self._key() == other._key()
class UnicodeData(object):
    """Parsed Unicode data files for one Unicode version.

    Constructing an instance loads every file the generators need. Each file
    is taken from ``args.source`` (a local directory) when given, otherwise
    from the on-disk cache, otherwise downloaded (and then cached).

    Attributes filled in by the loaders:
        ucd_data: code point -> UnicodeData.txt fields following the code point.
        ucd_props: code point -> list of property names (PropList.txt and
            DerivedCoreProperties.txt).
        ucd_block: code point -> block name.
        ucd_cf: code point -> case-folded string (rows of type C or F only).
        ucd_hst: code point -> Hangul syllable type.
        ucd_as: code point -> joining type.
        ucd_s: script name -> set of code points.
        ucd_idnamt: code point -> list of IdnaMappingTable.txt fields.
        ucd_uts46tests: list of (line number, tuple of fields) from IdnaTestV2.txt.
        max: highest code point named by Blocks.txt.
    """

    # "XXXX" or "XXXX..YYYY", ";", one whitespace-free value, optional "#" comment.
    _RE_RANGE_VALUE = re.compile(
        r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<value>\S+)\s*(|\#.*)$"
    )
    _RE_RANGE_FIRST = re.compile("^<(?P<name>.*?), First>$")
    _RE_RANGE_LAST = re.compile("^<(?P<name>.*?), Last>$")
    _RE_BLOCK = re.compile(r"^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<block>.*)\s*$")
    _RE_CASEFOLD = re.compile(r"^(?P<cp>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*;\s*(?P<subst>[0-9A-F\s]+)\s*")
    _RE_IDNA_MAPPING = re.compile(r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<fields>[^#]+)")

    def __init__(self, version, cache, args):
        """Load all data files.

        Args:
            version: "x.y.z" or "latest"; parsed by UnicodeVersion.
            cache: cache directory, or a false value to disable caching.
            args: parsed command-line namespace; only ``args.source`` is read.
        """
        self.version = UnicodeVersion(version)
        self.system_version = UnicodeVersion(unicodedata.unidata_version)
        self.source = args.source
        self.cache = cache
        self.max = 0
        if self.system_version < self.version:
            print(
                "Warning: Character stability not guaranteed as Python Unicode data {} older than requested {}".format(
                    self.system_version, self.version
                )
            )
        self._load_unicodedata()
        self._load_proplist()
        self._load_derivedcoreprops()
        self._load_blocks()
        self._load_casefolding()
        self._load_hangulst()
        self._load_arabicshaping()
        self._load_scripts()
        self._load_uts46mapping()
        self._load_uts46testvectors()

    @staticmethod
    def _span(result):
        """Return the inclusive code point range described by a regex match."""
        first = int(result.group("start"), 16)
        end = result.group("end")
        last = int(end, 16) if end else first
        return range(first, last + 1)

    @classmethod
    def _parse_ranges(cls, text):
        """Yield (code point range, value) for each "range ; value" row in *text*.

        Rows that do not match (comments, blanks, rows with extra fields) are skipped.
        """
        for line in text.splitlines():
            result = cls._RE_RANGE_VALUE.match(line)
            if result:
                yield cls._span(result), result.group("value")

    def _add_props(self, text):
        """Append every property named in *text* to ``ucd_props``."""
        for span, prop in self._parse_ranges(text):
            for i in span:
                self.ucd_props[i].append(prop)

    def _load_unicodedata(self):
        """Load UnicodeData.txt, expanding "<Name, First>".."<Name, Last>" pairs."""
        self.ucd_data = {}
        range_begin = None
        for line in self._ucdfile("UnicodeData.txt").splitlines():
            if not line.strip():
                continue
            fields = line.split(";")
            value = int(fields[0], 16)
            start_marker = self._RE_RANGE_FIRST.match(fields[1])
            end_marker = self._RE_RANGE_LAST.match(fields[1])
            if start_marker:
                range_begin = value
            elif end_marker:
                # Every code point in the range gets the name "<Name>" and
                # shares one record, which is only ever read.
                fields[1] = "<{}>".format(end_marker.group("name"))
                record = fields[1:]
                for i in range(range_begin, value + 1):
                    self.ucd_data[i] = record
                range_begin = None
            else:
                self.ucd_data[value] = fields[1:]

    def _load_proplist(self):
        """Start ``ucd_props`` from PropList.txt."""
        self.ucd_props = collections.defaultdict(list)
        self._add_props(self._ucdfile("PropList.txt"))

    def _load_derivedcoreprops(self):
        """Extend ``ucd_props`` with DerivedCoreProperties.txt."""
        self._add_props(self._ucdfile("DerivedCoreProperties.txt"))

    def _load_blocks(self):
        """Load Blocks.txt and track the highest code point it mentions."""
        self.ucd_block = {}
        for line in self._ucdfile("Blocks.txt").splitlines():
            result = self._RE_BLOCK.match(line)
            if not result:
                continue
            span = self._span(result)
            block = result.group("block")
            for i in span:
                self.ucd_block[i] = block
            if span:
                self.max = max(self.max, span[-1])

    def _load_casefolding(self):
        """Load the C and F rows of CaseFolding.txt."""
        self.ucd_cf = {}
        for line in self._ucdfile("CaseFolding.txt").splitlines():
            result = self._RE_CASEFOLD.match(line)
            if result and result.group("type") in ("C", "F"):
                # split() without an argument tolerates repeated or trailing whitespace.
                self.ucd_cf[int(result.group("cp"), 16)] = "".join(
                    chr(int(x, 16)) for x in result.group("subst").split()
                )

    def _load_hangulst(self):
        """Load HangulSyllableType.txt, including single-code-point rows."""
        self.ucd_hst = {}
        for span, syllable_type in self._parse_ranges(self._ucdfile("HangulSyllableType.txt")):
            for i in span:
                self.ucd_hst[i] = syllable_type

    def _load_arabicshaping(self):
        """Load joining types from extracted/DerivedJoiningType.txt."""
        self.ucd_as = {}
        for span, joining_type in self._parse_ranges(self._ucdfile("extracted/DerivedJoiningType.txt")):
            for i in span:
                self.ucd_as[i] = joining_type

    def _load_scripts(self):
        """Load Scripts.txt as script name -> set of code points."""
        self.ucd_s = {}
        for span, script in self._parse_ranges(self._ucdfile("Scripts.txt")):
            self.ucd_s.setdefault(script, set()).update(span)

    def _load_uts46mapping(self):
        """Load IdnaMappingTable.txt; each code point maps to its list of fields."""
        self.ucd_idnamt = {}
        text = self._ucdfile("IdnaMappingTable.txt", urlbase=UTS46_URL)
        for line in text.splitlines():
            result = self._RE_IDNA_MAPPING.match(line)
            if not result:
                continue
            fields = [x.strip() for x in result.group("fields").split(";")]
            for i in self._span(result):
                self.ucd_idnamt[i] = fields

    def _load_uts46testvectors(self):
        """Load IdnaTestV2.txt as (1-based line number, tuple of fields) entries."""
        self.ucd_uts46tests = []
        text = self._ucdfile("IdnaTestV2.txt", urlbase=UTS46_URL)
        for lineno, line in enumerate(text.splitlines(), start=1):
            line = line.split("#", 1)[0]
            if not line.strip():
                continue
            fields = tuple(field.strip() for field in unicode_fixup(line).split(";"))
            self.ucd_uts46tests.append((lineno, fields))

    def _ucdfile(self, filename, urlbase=None):
        """Return the text of data file *filename*.

        Args:
            filename: path of the file relative to the source, cache, or URL base.
            urlbase: URL template with {version} and {filename} placeholders;
                None (the default) selects UCD_URL.

        All files are read and written as UTF-8, and every handle is closed
        before returning.
        """
        if self.source:
            with open("{}/{}".format(self.source, filename), encoding="utf-8") as f:
                return f.read()
        cache_file = None
        if self.cache:
            cache_file = os.path.expanduser("{}/{}/{}".format(self.cache, self.version.tag, filename))
            if os.path.isfile(cache_file):
                with open(cache_file, encoding="utf-8") as f:
                    return f.read()
        if urlbase is None:
            urlbase = UCD_URL
        version_path = self.version.tag
        if version_path == "latest":
            # NOTE(review): this path is substituted into every URL template,
            # UTS46_URL included -- confirm ".../idna/UCD/latest/" is intended.
            version_path = "UCD/latest"
        url = urlbase.format(version=version_path, filename=filename)
        with urlopen(url) as response:
            content = response.read().decode("utf-8")
        if cache_file:
            os.makedirs(os.path.dirname(cache_file), exist_ok=True)
            with open(cache_file, "wb") as f:
                f.write(content.encode("utf-8"))
        return content

    def codepoints(self):
        """Yield a CodePoint for every value from 0 through ``self.max``."""
        for i in range(0, self.max + 1):
            yield CodePoint(i, ucdata=self)
class CodePoint:
    """One code point together with the Unicode data needed to classify it.

    Every property is computed on demand from ``ucdata`` (an object exposing
    the ``ucd_*`` mappings built by UnicodeData). Reading a property never
    modifies ``ucdata``.
    """

    def __init__(self, value=None, ucdata=None):
        self.value = value
        self.ucdata = ucdata

    def _props(self):
        """Return this code point's property names without touching ucd_props.

        ``ucd_props`` is a defaultdict; indexing it would insert an empty
        entry for every code point queried, so ``get`` is used instead.
        """
        return self.ucdata.ucd_props.get(self.value, ())

    def _casefold(self, s):
        """Case-fold *s* using the loaded CaseFolding data."""
        folds = self.ucdata.ucd_cf
        return "".join(folds.get(ord(c), c) for c in s)

    @property
    def exception_value(self):
        """Status from the module-level ``exceptions`` table, or False."""
        return exceptions.get(self.value, False)

    @property
    def compat_value(self):
        """Status from the module-level ``backwardscompatible`` table, or False."""
        return backwardscompatible.get(self.value, False)

    @property
    def name(self):
        """Character name, or "<noncharacter>" / "<reserved>" when it has none."""
        if self.value in self.ucdata.ucd_data:
            return self.ucdata.ucd_data[self.value][0]
        if "Noncharacter_Code_Point" in self._props():
            return "<noncharacter>"
        return "<reserved>"

    @property
    def general_category(self):
        """General category from UnicodeData.txt, or None when absent."""
        return self.ucdata.ucd_data.get(self.value, [None, None])[1]

    @property
    def unassigned(self):
        """True when the code point is neither listed in UnicodeData.txt nor a noncharacter."""
        return not ("Noncharacter_Code_Point" in self._props() or self.value in self.ucdata.ucd_data)

    @property
    def ldh(self):
        """True for "-", ASCII digits and lowercase ASCII letters."""
        if self.value == 0x002D or self.value in range(0x0030, 0x0039 + 1) or self.value in range(0x0061, 0x007A + 1):
            return True
        return False

    @property
    def join_control(self):
        return "Join_Control" in self._props()

    @property
    def joining_type(self):
        """Joining type, or None when the code point has none recorded."""
        return self.ucdata.ucd_as.get(self.value, None)

    @property
    def char(self):
        return chr(self.value)

    @property
    def nfkc_cf(self):
        """NFKC(casefold(NFKC(char))), using Python's unicodedata for normalization."""
        return unicodedata.normalize("NFKC", self._casefold(unicodedata.normalize("NFKC", self.char)))

    @property
    def unstable(self):
        """True when the character changes under ``nfkc_cf``."""
        return self.char != self.nfkc_cf

    @property
    def in_ignorableproperties(self):
        props = self._props()
        return any(
            prop in props
            for prop in ("Default_Ignorable_Code_Point", "White_Space", "Noncharacter_Code_Point")
        )

    @property
    def in_ignorableblocks(self):
        return self.ucdata.ucd_block.get(self.value) in (
            "Combining Diacritical Marks for Symbols",
            "Musical Symbols",
            "Ancient Greek Musical Notation",
        )

    @property
    def oldhanguljamo(self):
        return self.ucdata.ucd_hst.get(self.value) in ("L", "V", "T")

    @property
    def in_lettersdigits(self):
        return self.general_category in ("Ll", "Lu", "Lo", "Nd", "Lm", "Mn", "Mc")

    @property
    def idna2008_status(self):
        """Return the IDNA 2008 status string for this code point.

        Rules are tried in a fixed order and the first one that applies wins.
        """
        if self.exception_value:
            return self.exception_value
        if self.compat_value:
            return self.compat_value
        if self.unassigned:
            return "UNASSIGNED"
        if self.ldh:
            return "PVALID"
        if self.join_control:
            return "CONTEXTJ"
        if self.unstable or self.in_ignorableproperties or self.in_ignorableblocks or self.oldhanguljamo:
            return "DISALLOWED"
        if self.in_lettersdigits:
            return "PVALID"
        return "DISALLOWED"

    @property
    def uts46_data(self):
        """Fields from the IDNA mapping table, or None when the code point is not listed."""
        return self.ucdata.ucd_idnamt.get(self.value, None)

    @property
    def uts46_status(self):
        """Mapping-table fields joined with spaces; empty when there are none."""
        return " ".join(self.uts46_data or ())
def diagnose_codepoint(codepoint, args, ucdata):
    """Print, step by step, how the IDNA 2008 and UTS 46 status of *codepoint* is derived.

    Each value is computed immediately before its line is printed, so the
    report is produced strictly top to bottom.
    """
    cp = CodePoint(codepoint, ucdata=ucdata)
    report = (
        ("U+{:04X}:", lambda: codepoint),
        (" Name: {}", lambda: cp.name),
        ("1 Exceptions: {}", lambda: exceptions.get(codepoint, False)),
        ("2 Backwards Compat: {}", lambda: backwardscompatible.get(codepoint, False)),
        ("3 Unassigned: {}", lambda: cp.unassigned),
        ("4 LDH: {}", lambda: cp.ldh),
        (" Properties: {}", lambda: " ".join(sorted(ucdata.ucd_props.get(codepoint, ["None"])))),
        ("5 .Join Control: {}", lambda: cp.join_control),
        (" NFKC CF: {}", lambda: " ".join(["U+{:04X}".format(ord(x)) for x in cp.nfkc_cf])),
        ("6 .Unstable: {}", lambda: cp.unstable),
        ("7 .Ignorable Prop: {}", lambda: cp.in_ignorableproperties),
        (" Block: {}", lambda: ucdata.ucd_block.get(codepoint, None)),
        ("8 .Ignorable Block: {}", lambda: cp.in_ignorableblocks),
        (" Hangul Syll Type: {}", lambda: ucdata.ucd_hst.get(codepoint, None)),
        ("9 .Old Hangul Jamo: {}", lambda: cp.oldhanguljamo),
        (" General Category: {}", lambda: cp.general_category),
        ("10 .Letters Digits: {}", lambda: cp.in_lettersdigits),
        ("== IDNA 2008: {}", lambda: cp.idna2008_status),
        ("== UTS 46: {}", lambda: cp.uts46_status),
    )
    for template, compute in report:
        print(template.format(compute()))
    print("(Unicode {} [sys:{}])".format(ucdata.version, ucdata.system_version))
def ucdrange(start, end):
    """Return (code point text, name text) describing the run from *start* to *end*.

    A run of one code point is rendered as a single value; otherwise both
    parts take the form "first..last".
    """
    if start == end:
        values = "{:04X}".format(start.value)
        description = start.name
    else:
        values = "{:04X}..{:04X}".format(start.value, end.value)
        description = "{}..{}".format(start.name, end.name)
    return (values, description)
def upper_hex(value):
    """Format *value* like hex(), but with everything after the first two characters upper-cased."""
    text = hex(value)
    prefix, digits = text[:2], text[2:]
    return prefix + digits.upper()
def optimised_list(d):
    """Yield source lines for a tuple literal holding the packed ranges of *d*.

    A single range is emitted as a one-line tuple; several ranges are spread
    over one line each between an opening and a closing line.
    """
    values = intranges_from_list(d)
    if len(values) != 1:
        yield "("
        yield from (" {},".format(upper_hex(value)) for value in values)
        yield " ),"
        return
    # Respect ruff format style
    for value in values:
        yield "({},),".format(upper_hex(value))
def make_table(args, ucdata):
    """Write an IANA-style table of IDNA 2008 statuses.

    Consecutive code points sharing a status are collapsed into one row.
    Rows go to ``<args.dir>/idna-table-<version>.txt`` when ``args.dir`` is
    set and to standard output otherwise.

    Args:
        args: parsed command-line namespace; only ``args.dir`` is read.
        ucdata: loaded Unicode data providing ``codepoints()`` and ``version``.
    """
    last_status = None
    cps = []
    table_data = []
    for cp in ucdata.codepoints():
        status = cp.idna2008_status
        if last_status and last_status != status:
            # The status changed: close the run collected so far.
            (values, description) = ucdrange(cps[0], cps[-1])
            table_data.append([values, last_status, description])
            cps = []
        last_status = status
        cps.append(cp)
    # Close the final run.
    (values, description) = ucdrange(cps[0], cps[-1])
    table_data.append([values, last_status, description])

    if args.dir:
        # "with" ensures the file is closed even if encoding or writing a row fails.
        with open("{}/idna-table-{}.txt".format(args.dir, ucdata.version), "wb") as f:
            for row in table_data:
                f.write("{:12}; {:12}# {:.44}\n".format(*row).encode("ascii"))
    else:
        for row in table_data:
            print("{:12}; {:12}# {:.44}".format(*row))
def idna_libdata(ucdata):
    """Yield the lines of the generated idnadata module.

    The output defines ``__version__``, ``scripts``, ``joining_types`` and
    ``codepoint_classes``, in that order.
    """

    def entry(label, members):
        # Only the first line produced for a key carries the '"key": ' prefix.
        lead = ' "{}": '.format(label)
        for line in optimised_list(members):
            yield lead + line
            lead = ""

    yield "# This file is automatically generated by tools/idna-data\n"
    yield '__version__ = "{}"\n'.format(ucdata.version)

    # Script membership, needed by some CONTEXTO rules.
    yield "scripts = {"
    for script in SCRIPT_WHITELIST:
        yield from entry(script, ucdata.ucd_s[script])
    yield "}"

    # Joining types, needed by the CONTEXTJ rule; stored as the ordinal of the type letter.
    yield "joining_types = {"
    for cp in ucdata.codepoints():
        joining_type = cp.joining_type
        if joining_type:
            yield " 0x{:X}: {},".format(cp.value, ord(joining_type))
    yield "}"

    # Code points grouped by status; UNASSIGNED and DISALLOWED are left out.
    yield "codepoint_classes = {"
    classes = {}
    for cp in ucdata.codepoints():
        status = cp.idna2008_status
        if status not in ("UNASSIGNED", "DISALLOWED"):
            classes.setdefault(status, set()).add(cp.value)
    for status in ["PVALID", "CONTEXTJ", "CONTEXTO"]:
        yield from entry(status, classes[status])
    yield "}"
def uts46_ranges(ucdata):
    """Yield tuple-literal source text for each row of the UTS 46 table.

    Above code point 255, a row is emitted only when its (status, mapping)
    differs from the previously emitted row; code points 0-255 always get a row.
    """
    previous = (None, None)
    for cp in ucdata.codepoints():
        fields = cp.uts46_data
        if not fields:
            continue
        status, has_mapping = UTS46_STATUSES[fields[0]]
        mapping = None
        if has_mapping:
            decoded = "".join(chr(int(codepoint, 16)) for codepoint in fields[1].split())
            # Backslashes are doubled so they survive inside the emitted string literal.
            mapping = decoded.replace("\\", "\\\\")
        current = (status, mapping)
        if cp.value > 255 and current == previous:
            continue
        previous = current
        if mapping is None:
            yield '(0x{:X}, "{}")'.format(cp.value, status)
        elif '"' in mapping:
            # Use single quotes when the mapping itself contains a double quote.
            yield "(0x{:X}, \"{}\", '{}')".format(cp.value, status, mapping)
        else:
            yield '(0x{:X}, "{}", "{}")'.format(cp.value, status, mapping)
def uts46_libdata(ucdata):
    """Yield the lines of the generated uts46data module.

    Rows are split across ``_seg_N()`` functions of UTS46_SEGMENT_SIZE rows
    each, and ``uts46data`` is the concatenation of all segments.
    """
    yield "# This file is automatically generated by tools/idna-data"
    yield "# vim: set fileencoding=utf-8 :\n"
    yield "from typing import List, Tuple, Union\n"
    yield '"""IDNA Mapping Table from UTS46."""\n\n'
    yield '__version__ = "{}"\n'.format(ucdata.version)

    idx = -1
    for idx, row in enumerate(uts46_ranges(ucdata)):
        segment, offset = divmod(idx, UTS46_SEGMENT_SIZE)
        if offset == 0:
            # Close the previous segment (if any) before opening a new one.
            if idx != 0:
                yield " ]\n"
            yield "\ndef _seg_{}() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]:\n return [".format(
                segment
            )
        yield " {},".format(row)
    yield " ]\n"

    yield "\nuts46data = tuple("
    yield " _seg_0()"
    last_segment = idx // UTS46_SEGMENT_SIZE
    for i in range(1, last_segment + 1):
        yield " + _seg_{}()".format(i)
    yield ") # type: Tuple[Union[Tuple[int, str], Tuple[int, str, str]], ...]"
def uts46_tests(ucdata):
    """Yield the lines of a generated unittest module built from the UTS 46 test vectors.

    For every entry in ``ucdata.ucd_uts46tests`` a comment line with the
    source line number and raw fields is emitted, followed by a decode()
    assertion when the expected to-Unicode status is "[]".

    NOTE(review): the emitted assertions reference ``idna`` but the only
    import emitted is ``unittest`` -- confirm whether an import is missing.
    """
    yield "# This file is automatically generated by tools/idna-data\n"
    yield "import unittest\n"
    yield "class UTS46Tests(unittest.TestCase):\n"
    yield " def test_uts46(self):\n"
    for lineno, fields in ucdata.ucd_uts46tests:
        # Exactly seven fields are expected; any other count raises ValueError here.
        (
            source,
            to_unicode,
            to_unicode_status,
            to_ascii,
            to_ascii_status,
            to_ascii_t,
            to_ascii_t_status,
        ) = fields
        # Per UTS46 test vectors, if the result is "", expect a blank string, but
        # if the result is blank, expect the input string.
        # Each later column falls back to the (already resolved) column before it,
        # so the order of these blocks matters.
        if to_unicode == '""':
            to_unicode = ""
        elif not to_unicode:
            to_unicode = source
        if not to_unicode_status:
            to_unicode_status = "[]"
        if to_ascii == '""':
            to_ascii = ""
        elif not to_ascii:
            to_ascii = to_unicode
        if not to_ascii_status:
            to_ascii_status = to_unicode_status
        if to_ascii_t == '""':
            to_ascii_t = ""
        elif not to_ascii_t:
            to_ascii_t = to_ascii
        if not to_ascii_t_status:
            to_ascii_t_status = to_ascii_status
        # Is this label IDNA 2008 legal according to UTS46 mapping table?
        # nv8 ends up holding the last code point whose third mapping field is NV8 or XV8.
        nv8 = False
        for codepoint in to_unicode:
            try:
                field = ucdata.ucd_idnamt[ord(codepoint)][2]
            except IndexError:
                # The mapping row has fewer than three fields.
                field = ""
            if field == "NV8" or field == "XV8":
                nv8 = ord(codepoint)
        yield " # line {}: {}".format(lineno, repr(fields))
        if nv8:
            yield " # nv8: U+{:X}".format(nv8)
        if to_unicode_status == "[]":
            yield (
                ' self.assertEqual(idna.decode({}, uts46=True, strict=True), {}, "Expected \\"{}\\"'
                ' as decode() output")'.format(repr(source), repr(to_unicode), repr(to_unicode))
            )
        yield ""
    # self.assertEqual(idna.uts46_remap("A_", std3_rules=False), "a_")
    # self.assertRaises(idna.InvalidCodepoint, idna.uts46_remap, "A_", std3_rules=True)
    # The string below is a bare expression statement: it is never yielded or
    # executed, and only preserves an earlier, fuller version of the checks.
    """
try:
output = idna.decode(source, uts46=True, strict=True)
if to_unicode_status != "[]":
self.fail("decode() did not emit required error {} for {}".format(to_unicode, repr(source)))
self.assertEqual(output, to_unicode, "unexpected decode() output")
except (idna.IDNAError, UnicodeError, ValueError) as exc:
if str(exc).startswith("Unknown"):
raise unittest.SkipTest("Test requires support for a newer version of Unicode than this Python supports")
if to_unicode_status == "[]":
raise
try:
output = idna.encode(source, uts46=True, strict=True).decode("ascii")
if to_ascii_status != "[]":
self.fail("encode() did not emit required error {} for {}".format(to_ascii_status, repr(source)))
self.assertEqual(output, to_ascii, "unexpected encode() output")
except (idna.IDNAError, UnicodeError, ValueError) as exc:
if str(exc).startswith("Unknown"):
raise unittest.SkipTest("Test requires support for a newer version of Unicode than this Python supports")
if to_ascii_status == "[]":
raise
try:
output = idna.encode(source, uts46=True, strict=True, transitional=True).decode("ascii")
if to_ascii_t_status != "[]":
self.fail(
"encode(transitional=True) did not emit required error {} for {}".format(to_ascii_t_status, repr(source))
)
self.assertEqual(output, to_ascii_t, "unexpected encode() output")
except (idna.IDNAError, UnicodeError, ValueError) as exc:
if str(exc).startswith("Unknown"):
raise unittest.SkipTest("Test requires support for a newer version of Unicode than this Python supports")
if to_ascii_t_status == "[]":
raise
"""
def make_libdata(args, ucdata):
    """Write the three generated Python modules into ``args.dir`` (default: current directory).

    Every generated line is written UTF-8 encoded and followed by a newline.
    """
    dest_dir = args.dir or "."
    outputs = (
        ("idnadata.py", idna_libdata),
        ("uts46data.py", uts46_libdata),
        ("test_idna_uts46.py", uts46_tests),
    )
    for filename, generate in outputs:
        target_filename = os.path.join(dest_dir, filename)
        with open(target_filename, "wb") as target:
            for line in generate(ucdata):
                target.write((line + "\n").encode("utf-8"))
def arg_error(message, parser):
    """Print the parser's usage line and an error message, then exit with status 2."""
    parser.print_usage()
    program = sys.argv[0]
    print("{}: error: {}".format(program, message))
    raise SystemExit(2)
def main():
    """Command-line entry point.

    The positional ``action`` is one of "make-table", "make-libdata", or a
    code point such as "U+0061" (the "U+" prefix is optional). Anything else
    is rejected before any Unicode data is loaded or downloaded.
    """
    parser = argparse.ArgumentParser(description="Determine IDNA code-point validity data")
    # No default is given: argparse ignores defaults on a required positional.
    parser.add_argument("action", type=str, help="Task to perform (make-libdata, make-table, <codepoint>)")
    parser.add_argument("--version", type=str, default="preferred", help="Unicode version to use (preferred, latest, <x.y.z>)")
    parser.add_argument("--source", type=str, default=None, help="Where to fetch Unicode data (file path)")
    parser.add_argument("--dir", type=str, default=None, help="Where to export the output")
    parser.add_argument("--cache", type=str, default=None, help="Where to cache Unicode data")
    parser.add_argument("--no-cache", action="store_true", help="Don't cache Unicode data")
    # These groups hold no arguments; they only add descriptive sections to --help.
    parser.add_argument_group("make-libdata", "Make module data for Python IDNA library")
    parser.add_argument_group("make-table", "Make IANA-style reference table")
    parser.add_argument_group("codepoint", "Display related data for given codepoint (e.g. U+0061)")
    args = parser.parse_args()

    if args.version == "preferred":
        target_version = PREFERRED_VERSION
    else:
        target_version = args.version

    if args.cache and args.no_cache:
        arg_error("I can't both --cache and --no-cache", parser)
    cache = None if args.no_cache else (args.cache or DEFAULT_CACHE_DIR)

    # Validate the action up front so a typo does not cost a full data load.
    codepoint_match = None
    if args.action not in ("make-table", "make-libdata"):
        codepoint_match = re.match(r"(?i)^(U\+|)(?P<cp>[0-9A-F]{4,6})$", args.action)
        if not codepoint_match:
            arg_error("Don't recognize action or codepoint value", parser)

    ucdata = UnicodeData(target_version, cache, args)
    if args.action == "make-table":
        make_table(args, ucdata)
    elif args.action == "make-libdata":
        make_libdata(args, ucdata)
    else:
        diagnose_codepoint(int(codepoint_match.group("cp"), 16), args, ucdata)
        sys.exit(0)


if __name__ == "__main__":
    main()