| #!/usr/bin/env python3 |
| |
| import argparse, collections, datetime, os, re, sys, unicodedata |
| from urllib.request import urlopen |
| |
| # Use intranges.intranges_from_list() from the sibling idna directory |
| sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "idna")) |
| from intranges import intranges_from_list |
| |
# Refuse to run on anything older than Python 3.
if sys.version_info < (3,):
    print("Only Python 3 supported.")
    sys.exit(2)
| |
# Unicode version used when --version is left at "preferred".
PREFERRED_VERSION = "16.0.0"

# Download locations. HTTPS is used so that data feeding the generated library
# tables cannot be tampered with in transit.
UCD_URL = "https://www.unicode.org/Public/{version}/ucd/{filename}"
UTS46_URL = "https://www.unicode.org/Public/idna/{version}/{filename}"

# Cache directory used when neither --cache nor --no-cache is given.
DEFAULT_CACHE_DIR = "~/.cache/unidata"

# Scripts affected by IDNA contextual rules
SCRIPT_WHITELIST = sorted(["Greek", "Han", "Hebrew", "Hiragana", "Katakana"])

# Used to piece apart UTS#46 data for Jython compatibility
UTS46_SEGMENT_SIZE = 100

# UTS #46 status keyword -> (one-character code written to the generated table,
# whether a mapping column accompanies the status).
UTS46_STATUSES = {
    "valid": ("V", False),
    "ignored": ("I", False),
    "mapped": ("M", True),
    "deviation": ("D", True),
    "disallowed": ("X", False),
    "disallowed_STD3_valid": ("3", False),
    "disallowed_STD3_mapped": ("3", True),
}
| |
# Exceptions are manually assigned in Section 2.6 of RFC 5892.
# Maps a code point to the IDNA 2008 status that overrides the derived one.
# CodePoint.idna2008_status consults this table (via CodePoint.exception_value)
# before applying any other rule.
exceptions = {
    0x00DF: "PVALID",  # LATIN SMALL LETTER SHARP S
    0x03C2: "PVALID",  # GREEK SMALL LETTER FINAL SIGMA
    0x06FD: "PVALID",  # ARABIC SIGN SINDHI AMPERSAND
    0x06FE: "PVALID",  # ARABIC SIGN SINDHI POSTPOSITION MEN
    0x0F0B: "PVALID",  # TIBETAN MARK INTERSYLLABIC TSHEG
    0x3007: "PVALID",  # IDEOGRAPHIC NUMBER ZERO
    0x00B7: "CONTEXTO",  # MIDDLE DOT
    0x0375: "CONTEXTO",  # GREEK LOWER NUMERAL SIGN (KERAIA)
    0x05F3: "CONTEXTO",  # HEBREW PUNCTUATION GERESH
    0x05F4: "CONTEXTO",  # HEBREW PUNCTUATION GERSHAYIM
    0x30FB: "CONTEXTO",  # KATAKANA MIDDLE DOT
    0x0660: "CONTEXTO",  # ARABIC-INDIC DIGIT ZERO
    0x0661: "CONTEXTO",  # ARABIC-INDIC DIGIT ONE
    0x0662: "CONTEXTO",  # ARABIC-INDIC DIGIT TWO
    0x0663: "CONTEXTO",  # ARABIC-INDIC DIGIT THREE
    0x0664: "CONTEXTO",  # ARABIC-INDIC DIGIT FOUR
    0x0665: "CONTEXTO",  # ARABIC-INDIC DIGIT FIVE
    0x0666: "CONTEXTO",  # ARABIC-INDIC DIGIT SIX
    0x0667: "CONTEXTO",  # ARABIC-INDIC DIGIT SEVEN
    0x0668: "CONTEXTO",  # ARABIC-INDIC DIGIT EIGHT
    0x0669: "CONTEXTO",  # ARABIC-INDIC DIGIT NINE
    0x06F0: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT ZERO
    0x06F1: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT ONE
    0x06F2: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT TWO
    0x06F3: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT THREE
    0x06F4: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT FOUR
    0x06F5: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT FIVE
    0x06F6: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT SIX
    0x06F7: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT SEVEN
    0x06F8: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT EIGHT
    0x06F9: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT NINE
    0x0640: "DISALLOWED",  # ARABIC TATWEEL
    0x07FA: "DISALLOWED",  # NKO LAJANYALAN
    0x302E: "DISALLOWED",  # HANGUL SINGLE DOT TONE MARK
    0x302F: "DISALLOWED",  # HANGUL DOUBLE DOT TONE MARK
    0x3031: "DISALLOWED",  # VERTICAL KANA REPEAT MARK
    0x3032: "DISALLOWED",  # VERTICAL KANA REPEAT WITH VOICED SOUND MARK
    0x3033: "DISALLOWED",  # VERTICAL KANA REPEAT MARK UPPER HALF
    0x3034: "DISALLOWED",  # VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF
    0x3035: "DISALLOWED",  # VERTICAL KANA REPEAT MARK LOWER HALF
    0x303B: "DISALLOWED",  # VERTICAL IDEOGRAPHIC ITERATION MARK
}
# Second override table, looked up by CodePoint.compat_value right after the
# exceptions above. It is currently empty.
# NOTE(review): presumably the BackwardCompatible category of RFC 5892 - confirm.
backwardscompatible = {}
| |
| |
def hexrange(start, end):
    """Return the inclusive integer range spanned by two hexadecimal strings."""
    first = int(start, 16)
    last = int(end, 16)
    return range(first, last + 1)
| |
| |
def hexvalue(value):
    """Parse a hexadecimal string and return it as an integer."""
    number = int(value, base=16)
    return number
| |
| |
_RE_UNICODE = re.compile("\\\\u([0-9a-fA-F]{4})")
_RE_SURROGATE = re.compile("[\ud800-\udbff][\udc00-\udfff]")


def unicode_fixup(string):
    """Expand literal ``\\uXXXX`` escapes found in *string*.

    Escapes are decoded first; any high/low surrogate pair that results is
    then merged into the single supplementary-plane character it encodes.
    """

    def _decode_escape(match):
        # The four hex digits captured after the backslash-u.
        return chr(int(match.group(1), 16))

    def _merge_pair(match):
        high, low = match.group(0)
        return chr(0x10000 + ((ord(high) - 0xD800) << 10) + (ord(low) - 0xDC00))

    unescaped = _RE_UNICODE.sub(_decode_escape, string)
    return _RE_SURROGATE.sub(_merge_pair, unescaped)
| |
| |
class UnicodeVersion(object):
    """A Unicode version: either ``major.minor.patch`` or the floating ``latest``.

    ``latest`` sorts above every concrete version and never compares equal to
    anything, because it does not denote one fixed release.

    Raises:
        ValueError: if *version* is neither ``x.y.z`` nor ``"latest"``.
    """

    def __init__(self, version):
        result = re.match(r"^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)$", version)
        if result:
            self.major = int(result.group("major"))
            self.minor = int(result.group("minor"))
            self.patch = int(result.group("patch"))
            # Kept for backward compatibility only. The packed value collides
            # once minor or patch reaches 16, so comparisons use _key() instead.
            self.numerical = (self.major << 8) + (self.minor << 4) + self.patch
            self.latest = False
        elif version == "latest":
            self.latest = True
        else:
            raise ValueError("Unrecognized Unicode version")

    def _key(self):
        """Return the (major, minor, patch) tuple used for ordering."""
        return (self.major, self.minor, self.patch)

    def __repr__(self, with_date=True):
        if self.latest:
            if with_date:
                return "latest@{}".format(datetime.datetime.now().strftime("%Y-%m-%d"))
            else:
                return "latest"
        else:
            return "{}.{}.{}".format(self.major, self.minor, self.patch)

    @property
    def tag(self):
        """The version as used in paths and URLs (no date suffix for ``latest``)."""
        return self.__repr__(with_date=False)

    def __gt__(self, other):
        if not isinstance(other, UnicodeVersion):
            return NotImplemented
        if self.latest:
            return True
        if other.latest:
            # A concrete version is never newer than "latest".
            return False
        return self._key() > other._key()

    def __lt__(self, other):
        if not isinstance(other, UnicodeVersion):
            return NotImplemented
        return other.__gt__(self)

    def __eq__(self, other):
        if not isinstance(other, UnicodeVersion):
            return NotImplemented
        if self.latest or other.latest:
            return False
        return self._key() == other._key()

    def __hash__(self):
        # Defining __eq__ would otherwise leave the class unhashable.
        return hash("latest") if self.latest else hash(self._key())
| |
| |
class UnicodeData(object):
    """Loads and indexes the Unicode data files used to derive the IDNA tables.

    Each file is read from ``args.source`` when given, otherwise from the
    on-disk cache, otherwise downloaded (and then cached). All files are
    loaded eagerly by the constructor.

    Args:
        version: Unicode version string (``x.y.z`` or ``"latest"``).
        cache: cache directory, or a false value to disable caching.
        args: parsed command-line arguments; only ``args.source`` is read.
    """

    # Line format shared by PropList.txt and DerivedCoreProperties.txt.
    _RE_PROPERTY = re.compile(
        r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$"
    )

    def __init__(self, version, cache, args):
        self.version = UnicodeVersion(version)
        self.system_version = UnicodeVersion(unicodedata.unidata_version)
        self.source = args.source
        self.cache = cache
        # Highest code point covered by Blocks.txt; bounds codepoints().
        self.max = 0

        if self.system_version < self.version:
            print(
                "Warning: Character stability not guaranteed as Python Unicode data {} older than requested {}".format(
                    self.system_version, self.version
                )
            )

        self._load_unicodedata()
        self._load_proplist()
        self._load_derivedcoreprops()
        self._load_blocks()
        self._load_casefolding()
        self._load_hangulst()
        self._load_arabicshaping()
        self._load_scripts()
        self._load_uts46mapping()
        self._load_uts46testvectors()

    @staticmethod
    def _span(result):
        """Return the inclusive code point range described by a regex match.

        The match must define a ``start`` group and an ``end`` group; when
        ``end`` did not participate the range holds the single start value.
        """
        first = int(result.group("start"), 16)
        last = result.group("end")
        return range(first, (int(last, 16) if last else first) + 1)

    def _load_unicodedata(self):
        """Fill ``ucd_data``: code point -> UnicodeData.txt fields after the code point."""
        f_ud = self._ucdfile("UnicodeData.txt")
        self.ucd_data = {}
        range_begin = None
        for line in f_ud.splitlines():
            if not line.strip():
                continue
            fields = line.split(";")
            value = int(fields[0], 16)
            start_marker = re.match("^<(?P<name>.*?), First>$", fields[1])
            end_marker = re.match("^<(?P<name>.*?), Last>$", fields[1])
            if start_marker:
                range_begin = value
            elif end_marker:
                if range_begin is None:
                    raise ValueError("Range end without a matching start in UnicodeData.txt: {}".format(line))
                # Every member of a First/Last range shares the same record,
                # named "<Range Name>".
                fields[1] = "<{}>".format(end_marker.group("name"))
                entry = fields[1:]
                for i in range(range_begin, value + 1):
                    self.ucd_data[i] = entry
                range_begin = None
            else:
                self.ucd_data[value] = fields[1:]

    def _add_properties(self, content):
        """Append every property listed in *content* to ``ucd_props``."""
        for line in content.splitlines():
            result = self._RE_PROPERTY.match(line)
            if result:
                prop = result.group("prop")
                for i in self._span(result):
                    self.ucd_props[i].append(prop)

    def _load_proplist(self):
        """Start ``ucd_props`` (code point -> list of property names) from PropList.txt."""
        f_pl = self._ucdfile("PropList.txt")
        self.ucd_props = collections.defaultdict(list)
        self._add_properties(f_pl)

    def _load_derivedcoreprops(self):
        """Extend ``ucd_props`` with DerivedCoreProperties.txt."""
        self._add_properties(self._ucdfile("DerivedCoreProperties.txt"))

    def _load_blocks(self):
        """Fill ``ucd_block`` (code point -> block name) and raise ``max``."""
        self.ucd_block = {}
        f_b = self._ucdfile("Blocks.txt")
        for line in f_b.splitlines():
            result = re.match(r"^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<block>.*)\s*$", line)
            if result:
                block = result.group("block")
                span = self._span(result)
                for i in span:
                    self.ucd_block[i] = block
                if span:
                    self.max = max(self.max, span[-1])

    def _load_casefolding(self):
        """Fill ``ucd_cf`` (code point -> folded string) from the C and F entries."""
        self.ucd_cf = {}
        f_cf = self._ucdfile("CaseFolding.txt")
        for line in f_cf.splitlines():
            result = re.match(r"^(?P<cp>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*;\s*(?P<subst>[0-9A-F\s]+)\s*", line)
            if result and result.group("type") in ("C", "F"):
                self.ucd_cf[int(result.group("cp"), 16)] = "".join(
                    chr(int(x, 16)) for x in result.group("subst").split(" ")
                )

    def _load_hangulst(self):
        """Fill ``ucd_hst`` (code point -> Hangul syllable type); only range lines match."""
        self.ucd_hst = {}
        f_hst = self._ucdfile("HangulSyllableType.txt")
        for line in f_hst.splitlines():
            result = re.match(r"^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*(|\#.*)$", line)
            if result:
                syllable_type = result.group("type")
                for i in self._span(result):
                    self.ucd_hst[i] = syllable_type

    def _load_arabicshaping(self):
        """Fill ``ucd_as`` (code point -> joining type) from DerivedJoiningType.txt."""
        self.ucd_as = {}
        f_as = self._ucdfile("extracted/DerivedJoiningType.txt")
        for line in f_as.splitlines():
            result = re.match(r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<jt>\S+)\s*(|\#.*)$", line)
            if result:
                joining_type = result.group("jt")
                for i in self._span(result):
                    self.ucd_as[i] = joining_type

    def _load_scripts(self):
        """Fill ``ucd_s`` (script name -> set of code points) from Scripts.txt."""
        self.ucd_s = {}
        f_s = self._ucdfile("Scripts.txt")
        for line in f_s.splitlines():
            result = re.match(r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<script>\S+)\s*(|\#.*)$", line)
            if result:
                self.ucd_s.setdefault(result.group("script"), set()).update(self._span(result))

    def _load_uts46mapping(self):
        """Fill ``ucd_idnamt`` (code point -> stripped fields after the range column)."""
        self.ucd_idnamt = {}
        f_idnamt = self._ucdfile("IdnaMappingTable.txt", urlbase=UTS46_URL)
        for line in f_idnamt.splitlines():
            result = re.match(r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<fields>[^#]+)", line)
            if result:
                fields = [x.strip() for x in result.group("fields").split(";")]
                for i in self._span(result):
                    self.ucd_idnamt[i] = fields

    def _load_uts46testvectors(self):
        """Fill ``ucd_uts46tests`` with ``(1-based line number, fields tuple)`` pairs."""
        self.ucd_uts46tests = []
        f_uts46tests = self._ucdfile("IdnaTestV2.txt", urlbase=UTS46_URL)
        for lineno, line in enumerate(f_uts46tests.splitlines(), start=1):
            # Drop the trailing comment, then skip lines with nothing left.
            line = line.split("#", 1)[0]
            if not line.strip():
                continue
            fields = tuple(field.strip() for field in unicode_fixup(line).split(";"))
            self.ucd_uts46tests.append((lineno, fields))

    def _ucdfile(self, filename, urlbase=UCD_URL):
        """Return the text of *filename*, decoded as UTF-8.

        Lookup order: ``self.source`` directory, then the cache, then a
        download from *urlbase* (which is written to the cache when enabled).
        """
        if self.source:
            with open("{}/{}".format(self.source, filename), encoding="utf-8") as f:
                return f.read()

        cache_file = None
        if self.cache:
            cache_file = os.path.expanduser("{}/{}/{}".format(self.cache, self.version.tag, filename))
            if os.path.isfile(cache_file):
                # The cache is written as UTF-8 below, so read it back the same way.
                with open(cache_file, encoding="utf-8") as f:
                    return f.read()

        version_path = self.version.tag
        if version_path == "latest" and urlbase == UCD_URL:
            # Only the UCD tree keeps its moving alias under Public/UCD/latest/;
            # the idna tree uses Public/idna/latest/ directly.
            version_path = "UCD/latest"
        url = urlbase.format(
            version=version_path,
            filename=filename,
        )
        with urlopen(url) as response:
            content = response.read().decode("utf-8")

        if cache_file:
            os.makedirs(os.path.dirname(cache_file), exist_ok=True)
            with open(cache_file, "wb") as f:
                f.write(content.encode("utf-8"))

        return content

    def codepoints(self):
        """Yield a CodePoint for every value from 0 through ``self.max``."""
        for i in range(0, self.max + 1):
            yield CodePoint(i, ucdata=self)
| |
| |
class CodePoint:
    """One code point plus the Unicode data needed to classify it for IDNA.

    Args:
        value: the integer code point.
        ucdata: object exposing the ``ucd_*`` lookup tables (a UnicodeData).
    """

    def __init__(self, value=None, ucdata=None):
        self.value = value
        self.ucdata = ucdata

    def _props(self):
        """Return this code point's property names without touching the table.

        ``ucd_props`` is a defaultdict, so subscripting it would insert an
        empty list for every code point that is merely looked up.
        """
        return self.ucdata.ucd_props.get(self.value, ())

    def _casefold(self, s):
        """Apply the loaded case-folding map to each character of *s*."""
        return "".join(self.ucdata.ucd_cf.get(ord(c), c) for c in s)

    @property
    def exception_value(self):
        return exceptions.get(self.value, False)

    @property
    def compat_value(self):
        return backwardscompatible.get(self.value, False)

    @property
    def name(self):
        if self.value in self.ucdata.ucd_data:
            return self.ucdata.ucd_data[self.value][0]
        elif "Noncharacter_Code_Point" in self._props():
            return "<noncharacter>"
        else:
            return "<reserved>"

    @property
    def general_category(self):
        return self.ucdata.ucd_data.get(self.value, [None, None])[1]

    @property
    def unassigned(self):
        return not ("Noncharacter_Code_Point" in self._props() or self.value in self.ucdata.ucd_data)

    @property
    def ldh(self):
        # Hyphen, ASCII digits and lowercase ASCII letters.
        if self.value == 0x002D or self.value in range(0x0030, 0x0039 + 1) or self.value in range(0x0061, 0x007A + 1):
            return True
        return False

    @property
    def join_control(self):
        return "Join_Control" in self._props()

    @property
    def joining_type(self):
        return self.ucdata.ucd_as.get(self.value, None)

    @property
    def char(self):
        return chr(self.value)

    @property
    def nfkc_cf(self):
        return unicodedata.normalize("NFKC", self._casefold(unicodedata.normalize("NFKC", self.char)))

    @property
    def unstable(self):
        return self.char != self.nfkc_cf

    @property
    def in_ignorableproperties(self):
        props = self._props()
        return any(
            prop in props
            for prop in ("Default_Ignorable_Code_Point", "White_Space", "Noncharacter_Code_Point")
        )

    @property
    def in_ignorableblocks(self):
        return self.ucdata.ucd_block.get(self.value) in (
            "Combining Diacritical Marks for Symbols",
            "Musical Symbols",
            "Ancient Greek Musical Notation",
        )

    @property
    def oldhanguljamo(self):
        return self.ucdata.ucd_hst.get(self.value) in ("L", "V", "T")

    @property
    def in_lettersdigits(self):
        return self.general_category in ("Ll", "Lu", "Lo", "Nd", "Lm", "Mn", "Mc")

    @property
    def idna2008_status(self):
        """Return the IDNA 2008 status; the first matching rule below wins."""
        if self.exception_value:
            return self.exception_value
        elif self.compat_value:
            return self.compat_value
        elif self.unassigned:
            return "UNASSIGNED"
        elif self.ldh:
            return "PVALID"
        elif self.join_control:
            return "CONTEXTJ"
        elif self.unstable:
            return "DISALLOWED"
        elif self.in_ignorableproperties:
            return "DISALLOWED"
        elif self.in_ignorableblocks:
            return "DISALLOWED"
        elif self.oldhanguljamo:
            return "DISALLOWED"
        elif self.in_lettersdigits:
            return "PVALID"
        else:
            return "DISALLOWED"

    @property
    def uts46_data(self):
        return self.ucdata.ucd_idnamt.get(self.value, None)

    @property
    def uts46_status(self):
        """Return the UTS #46 fields joined by spaces, or "" when there is no entry."""
        data = self.uts46_data
        return " ".join(data) if data else ""
| |
| |
def diagnose_codepoint(codepoint, args, ucdata):
    """Print each input to the IDNA 2008 derivation for one code point.

    Args:
        codepoint: integer code point to describe.
        args: parsed command-line arguments (not used here).
        ucdata: loaded UnicodeData instance.
    """
    cp = CodePoint(codepoint, ucdata=ucdata)

    print("U+{:04X}:".format(codepoint))
    print(" Name: {}".format(cp.name))
    print("1 Exceptions: {}".format(exceptions.get(codepoint, False)))
    print("2 Backwards Compat: {}".format(backwardscompatible.get(codepoint, False)))
    print("3 Unassigned: {}".format(cp.unassigned))
    print("4 LDH: {}".format(cp.ldh))
    # ucd_props is a defaultdict, so the lookups above may already have stored
    # an empty list for this code point; treat empty the same as missing.
    properties = ucdata.ucd_props.get(codepoint) or ["None"]
    print(" Properties: {}".format(" ".join(sorted(properties))))
    print("5 .Join Control: {}".format(cp.join_control))
    print(" NFKC CF: {}".format(" ".join(["U+{:04X}".format(ord(x)) for x in cp.nfkc_cf])))
    print("6 .Unstable: {}".format(cp.unstable))
    print("7 .Ignorable Prop: {}".format(cp.in_ignorableproperties))
    print(" Block: {}".format(ucdata.ucd_block.get(codepoint, None)))
    print("8 .Ignorable Block: {}".format(cp.in_ignorableblocks))
    print(" Hangul Syll Type: {}".format(ucdata.ucd_hst.get(codepoint, None)))
    print("9 .Old Hangul Jamo: {}".format(cp.oldhanguljamo))
    print(" General Category: {}".format(cp.general_category))
    print("10 .Letters Digits: {}".format(cp.in_lettersdigits))
    print("== IDNA 2008: {}".format(cp.idna2008_status))
    print("== UTS 46: {}".format(cp.uts46_status))
    print("(Unicode {} [sys:{}])".format(ucdata.version, ucdata.system_version))
| |
| |
def ucdrange(start, end):
    """Return ``(hex span, name span)`` describing the run from *start* to *end*.

    A run whose endpoints compare equal is rendered as a single value.
    """
    if not (start == end):
        span = "{:04X}..{:04X}".format(start.value, end.value)
        names = "{}..{}".format(start.name, end.name)
        return (span, names)
    return ("{:04X}".format(start.value), start.name)
| |
| |
def upper_hex(value):
    """Format *value* like ``hex()`` but with the characters after the first two upper-cased."""
    text = hex(value)
    head, tail = text[:2], text[2:]
    return "{}{}".format(head, tail.upper())
| |
| |
def optimised_list(d):
    """Yield source lines for the tuple literal holding the encoded ranges of *d*.

    A single range is written on one line; anything else is spread over an
    opening line, one line per range, and a closing line.
    """
    encoded_ranges = intranges_from_list(d)
    if len(encoded_ranges) != 1:
        yield "("
        for encoded in encoded_ranges:
            yield " {},".format(upper_hex(encoded))
        yield " ),"
        return
    (only,) = encoded_ranges
    # Respect ruff format style
    yield "({},),".format(upper_hex(only))
| |
| |
def make_table(args, ucdata):
    """Write the IDNA 2008 status table, one row per run of equal status.

    Rows go to ``<args.dir>/idna-table-<version>.txt`` when ``args.dir`` is
    set, otherwise to standard output.
    """
    last_status = None
    cps = []
    table_data = []

    for cp in ucdata.codepoints():
        status = cp.idna2008_status
        if last_status and last_status != status:
            # Status changed: close the run collected so far.
            (values, description) = ucdrange(cps[0], cps[-1])
            table_data.append([values, last_status, description])
            cps = []
        last_status = status
        cps.append(cp)
    # Flush the final run.
    (values, description) = ucdrange(cps[0], cps[-1])
    table_data.append([values, last_status, description])

    if args.dir:
        # The context manager closes the file even if a row fails to encode.
        with open("{}/idna-table-{}.txt".format(args.dir, ucdata.version), "wb") as f:
            for row in table_data:
                f.write("{:12}; {:12}# {:.44}\n".format(*row).encode("ascii"))
    else:
        for row in table_data:
            print("{:12}; {:12}# {:.44}".format(*row))
| |
| |
def idna_libdata(ucdata):
    """Yield the source lines of the generated ``idnadata`` module."""

    def _entry(key, members):
        # The dict key is glued onto the first line only.
        lead = ' "{}": '.format(key)
        for text in optimised_list(members):
            yield lead + text
            lead = ""

    yield "# This file is automatically generated by tools/idna-data\n"
    yield '__version__ = "{}"\n'.format(ucdata.version)

    #
    # Script classifications are used by some CONTEXTO rules in RFC 5891
    #
    yield "scripts = {"
    for script in SCRIPT_WHITELIST:
        yield from _entry(script, ucdata.ucd_s[script])
    yield "}"

    #
    # Joining types are used by CONTEXTJ rule A.1
    #
    yield "joining_types = {"
    for cp in ucdata.codepoints():
        joining_type = cp.joining_type
        if joining_type:
            yield " 0x{:X}: {},".format(cp.value, ord(joining_type))
    yield "}"

    #
    # These are the classification of codepoints into PVALID, CONTEXTO, CONTEXTJ, etc.
    #
    yield "codepoint_classes = {"
    classes = {}
    for cp in ucdata.codepoints():
        status = cp.idna2008_status
        if status not in ("UNASSIGNED", "DISALLOWED"):
            classes.setdefault(status, set()).add(cp.value)
    for status in ("PVALID", "CONTEXTJ", "CONTEXTO"):
        yield from _entry(status, classes[status])
    yield "}"
| |
| |
def uts46_ranges(ucdata):
    """Yield tuple-literal source text for the start of each UTS #46 run.

    Above U+00FF a code point whose (status, mapping) equals the previous
    emitted one is skipped; code points up to U+00FF are always emitted.
    """
    previous = (None, None)
    for cp in ucdata.codepoints():
        fields = cp.uts46_data
        if not fields:
            continue

        status, has_mapping = UTS46_STATUSES[fields[0]]
        mapping = None
        if has_mapping:
            decoded = "".join(chr(int(codepoint, 16)) for codepoint in fields[1].split())
            # Backslashes are doubled because the text lands in a string literal.
            mapping = decoded.replace("\\", "\\\\")

        current = (status, mapping)
        if current == previous and cp.value > 255:
            continue
        previous = current

        if mapping is None:
            yield '(0x{:X}, "{}")'.format(cp.value, status)
        elif '"' in mapping:
            yield "(0x{:X}, \"{}\", '{}')".format(cp.value, status, mapping)
        else:
            yield '(0x{:X}, "{}", "{}")'.format(cp.value, status, mapping)
| |
| |
def uts46_libdata(ucdata):
    """Yield the source lines of the generated ``uts46data`` module.

    Rows are split over ``_seg_N()`` functions of UTS46_SEGMENT_SIZE rows
    each, which the final ``uts46data`` tuple concatenates.
    """
    yield "# This file is automatically generated by tools/idna-data"
    yield "# vim: set fileencoding=utf-8 :\n"
    yield "from typing import List, Tuple, Union\n"
    yield '"""IDNA Mapping Table from UTS46."""\n\n'

    yield '__version__ = "{}"\n'.format(ucdata.version)

    segment_header = "\ndef _seg_{}() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]:\n return ["

    idx = -1
    for idx, row in enumerate(uts46_ranges(ucdata)):
        segment, offset = divmod(idx, UTS46_SEGMENT_SIZE)
        if offset == 0:
            if segment > 0:
                # Close the previous segment before opening the next one.
                yield " ]\n"
            yield segment_header.format(segment)
        yield " {},".format(row)
    yield " ]\n"

    yield "\nuts46data = tuple("
    yield " _seg_0()"
    last_segment = idx // UTS46_SEGMENT_SIZE
    for i in range(1, last_segment + 1):
        yield " + _seg_{}()".format(i)
    yield ") # type: Tuple[Union[Tuple[int, str], Tuple[int, str, str]], ...]"
| |
| |
def uts46_tests(ucdata):
    """Yield the source lines of a unittest module built from the UTS #46 test vectors.

    Only ``idna.decode()`` assertions are generated, and only for vectors
    whose toUnicode status is empty (no expected errors).

    Args:
        ucdata: object providing ``ucd_uts46tests`` (``(lineno, fields)``
            pairs with seven fields each) and ``ucd_idnamt``.

    Raises:
        ValueError: if a test vector does not have exactly seven fields.
    """
    yield "# This file is automatically generated by tools/idna-data\n"
    yield "import unittest\n"
    # The generated assertions call idna.decode(), so the module must import it.
    yield "import idna\n"
    yield "class UTS46Tests(unittest.TestCase):\n"
    yield "    def test_uts46(self):\n"
    # A docstring keeps the method body valid even when no assertion follows.
    yield '        """Check idna.decode() against the UTS #46 conformance vectors."""'

    for lineno, fields in ucdata.ucd_uts46tests:
        (
            source,
            to_unicode,
            to_unicode_status,
            _to_ascii,
            _to_ascii_status,
            _to_ascii_t,
            _to_ascii_t_status,
        ) = fields

        # Per UTS46 test vectors, if the result is "", expect a blank string, but
        # if the result is blank, expect the input string.
        if to_unicode == '""':
            to_unicode = ""
        elif not to_unicode:
            to_unicode = source
        if not to_unicode_status:
            to_unicode_status = "[]"

        # Is this label IDNA 2008 legal according to UTS46 mapping table?
        nv8 = None
        for codepoint in to_unicode:
            try:
                field = ucdata.ucd_idnamt[ord(codepoint)][2]
            except (KeyError, IndexError):
                # No table entry, or the entry has no third column.
                field = ""
            if field == "NV8" or field == "XV8":
                nv8 = ord(codepoint)

        yield "        # line {}: {}".format(lineno, repr(fields))
        if nv8:
            yield "        # nv8: U+{:X}".format(nv8)

        if to_unicode_status == "[]":
            # repr() of the whole message yields a valid literal whatever
            # quotes or backslashes the expected value contains.
            message = 'Expected "{}" as decode() output'.format(repr(to_unicode))
            yield "        self.assertEqual(idna.decode({}, uts46=True, strict=True), {}, {})".format(
                repr(source), repr(to_unicode), repr(message)
            )

        yield ""
| |
| |
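# NOTE: The two commented-out assertions below and the bare string literal that
# follows them are disabled drafts of additional generated test code. Nothing
# in this module references them; they are kept for reference only.
# self.assertEqual(idna.uts46_remap("A_", std3_rules=False), "a_")
# self.assertRaises(idna.InvalidCodepoint, idna.uts46_remap, "A_", std3_rules=True)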
| |
| |
| """ |
| try: |
| output = idna.decode(source, uts46=True, strict=True) |
| if to_unicode_status != "[]": |
| self.fail("decode() did not emit required error {} for {}".format(to_unicode, repr(source))) |
| self.assertEqual(output, to_unicode, "unexpected decode() output") |
| except (idna.IDNAError, UnicodeError, ValueError) as exc: |
| if str(exc).startswith("Unknown"): |
| raise unittest.SkipTest("Test requires support for a newer version of Unicode than this Python supports") |
| if to_unicode_status == "[]": |
| raise |
| |
| try: |
| output = idna.encode(source, uts46=True, strict=True).decode("ascii") |
| if to_ascii_status != "[]": |
| self.fail("encode() did not emit required error {} for {}".format(to_ascii_status, repr(source))) |
| self.assertEqual(output, to_ascii, "unexpected encode() output") |
| except (idna.IDNAError, UnicodeError, ValueError) as exc: |
| if str(exc).startswith("Unknown"): |
| raise unittest.SkipTest("Test requires support for a newer version of Unicode than this Python supports") |
| if to_ascii_status == "[]": |
| raise |
| |
| try: |
| output = idna.encode(source, uts46=True, strict=True, transitional=True).decode("ascii") |
| if to_ascii_t_status != "[]": |
| self.fail( |
| "encode(transitional=True) did not emit required error {} for {}".format(to_ascii_t_status, repr(source)) |
| ) |
| self.assertEqual(output, to_ascii_t, "unexpected encode() output") |
| except (idna.IDNAError, UnicodeError, ValueError) as exc: |
| if str(exc).startswith("Unknown"): |
| raise unittest.SkipTest("Test requires support for a newer version of Unicode than this Python supports") |
| if to_ascii_t_status == "[]": |
| raise |
| """ |
| |
| |
def make_libdata(args, ucdata):
    """Write the three generated modules into ``args.dir`` (default: current directory).

    Every generated line is written UTF-8 encoded with a trailing newline.
    """
    dest_dir = args.dir or "."
    outputs = (
        ("idnadata.py", idna_libdata),
        ("uts46data.py", uts46_libdata),
        ("test_idna_uts46.py", uts46_tests),
    )
    for filename, generate in outputs:
        target_filename = os.path.join(dest_dir, filename)
        with open(target_filename, "wb") as target:
            for line in generate(ucdata):
                target.write((line + "\n").encode("utf-8"))
| |
| |
def arg_error(message, parser):
    """Print the parser usage and an error line, then exit with status 2."""
    parser.print_usage()
    program = sys.argv[0]
    print("{}: error: {}".format(program, message))
    raise SystemExit(2)
| |
| |
def main():
    """Command-line entry point.

    Parses the arguments, validates the requested action, loads the Unicode
    data and then runs ``make-table``, ``make-libdata`` or the per-code-point
    diagnosis. Exits with status 2 on invalid arguments.
    """
    parser = argparse.ArgumentParser(description="Determine IDNA code-point validity data")
    # The positional is required, so it takes no default.
    parser.add_argument("action", type=str, help="Task to perform (make-libdata, make-table, <codepoint>)")

    parser.add_argument("--version", type=str, default="preferred", help="Unicode version to use (preferred, latest, <x.y.z>)")
    parser.add_argument("--source", type=str, default=None, help="Where to fetch Unicode data (file path)")
    parser.add_argument("--dir", type=str, default=None, help="Where to export the output")
    parser.add_argument("--cache", type=str, default=None, help="Where to cache Unicode data")
    parser.add_argument("--no-cache", action="store_true", help="Don't cache Unicode data")

    # These groups hold no arguments; they are registered with the parser for
    # their titles and descriptions only.
    parser.add_argument_group("make-libdata", "Make module data for Python IDNA library")
    parser.add_argument_group("make-table", "Make IANA-style reference table")
    parser.add_argument_group("codepoint", "Display related data for given codepoint (e.g. U+0061)")

    args = parser.parse_args()

    if args.version == "preferred":
        target_version = PREFERRED_VERSION
    else:
        target_version = args.version

    if args.cache and args.no_cache:
        arg_error("I can't both --cache and --no-cache", parser)
    cache = None if args.no_cache else (args.cache or DEFAULT_CACHE_DIR)

    # Validate the action before loading (and possibly downloading) any data.
    codepoint = None
    if args.action not in ("make-table", "make-libdata"):
        result = re.match(r"(?i)^(U\+|)(?P<cp>[0-9A-F]{4,6})$", args.action)
        if result:
            codepoint = int(result.group("cp"), 16)
        # Six hex digits can exceed the last valid code point, U+10FFFF.
        if codepoint is None or codepoint > 0x10FFFF:
            arg_error("Don't recognize action or codepoint value", parser)

    ucdata = UnicodeData(target_version, cache, args)

    if args.action == "make-table":
        make_table(args, ucdata)
    elif args.action == "make-libdata":
        make_libdata(args, ucdata)
    else:
        diagnose_codepoint(codepoint, args, ucdata)
        sys.exit(0)


if __name__ == "__main__":
    main()