WIP: Pre-compute UTS46 tests as part of make-libdata

Rather than generating the UTS46 test vectors dynamically, pre-compute
them, just as we already do for the UTS46 data itself.
diff --git a/tools/idna-data b/tools/idna-data
index 5c44ec1..5e4a057 100755
--- a/tools/idna-data
+++ b/tools/idna-data
@@ -4,83 +4,78 @@
 from urllib.request import urlopen
 
 # Use intranges.intranges_from_list() from the sibling idna directory
-sys.path.append(
-    os.path.join(
-        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
-        "idna"
-    )
-)
+sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "idna"))
 from intranges import intranges_from_list
 
 if sys.version_info[0] < 3:
     print("Only Python 3 supported.")
     sys.exit(2)
 
-PREFERRED_VERSION = '16.0.0'
-UCD_URL = 'http://www.unicode.org/Public/{version}/ucd/{filename}'
-UTS46_URL = 'http://www.unicode.org/Public/idna/{version}/{filename}'
+PREFERRED_VERSION = "16.0.0"
+UCD_URL = "http://www.unicode.org/Public/{version}/ucd/{filename}"
+UTS46_URL = "http://www.unicode.org/Public/idna/{version}/{filename}"
 
-DEFAULT_CACHE_DIR = '~/.cache/unidata'
+DEFAULT_CACHE_DIR = "~/.cache/unidata"
 
 # Scripts affected by IDNA contextual rules
-SCRIPT_WHITELIST = sorted(['Greek', 'Han', 'Hebrew', 'Hiragana', 'Katakana'])
+SCRIPT_WHITELIST = sorted(["Greek", "Han", "Hebrew", "Hiragana", "Katakana"])
 
 # Used to piece apart UTS#46 data for Jython compatibility
 UTS46_SEGMENT_SIZE = 100
 
 UTS46_STATUSES = {
-    'valid': ('V', False),
-    'ignored': ('I', False),
-    'mapped': ('M', True),
-    'deviation': ('D', True),
-    'disallowed': ('X', False),
-    'disallowed_STD3_valid': ('3', False),
-    'disallowed_STD3_mapped': ('3', True)
+    "valid": ("V", False),
+    "ignored": ("I", False),
+    "mapped": ("M", True),
+    "deviation": ("D", True),
+    "disallowed": ("X", False),
+    "disallowed_STD3_valid": ("3", False),
+    "disallowed_STD3_mapped": ("3", True),
 }
 
 # Exceptions are manually assigned in Section 2.6 of RFC 5892.
 exceptions = {
-    0x00DF: 'PVALID',      # LATIN SMALL LETTER SHARP S
-    0x03C2: 'PVALID',      # GREEK SMALL LETTER FINAL SIGMA
-    0x06FD: 'PVALID',      # ARABIC SIGN SINDHI AMPERSAND
-    0x06FE: 'PVALID',      # ARABIC SIGN SINDHI POSTPOSITION MEN
-    0x0F0B: 'PVALID',      # TIBETAN MARK INTERSYLLABIC TSHEG
-    0x3007: 'PVALID',      # IDEOGRAPHIC NUMBER ZERO
-    0x00B7: 'CONTEXTO',    # MIDDLE DOT
-    0x0375: 'CONTEXTO',    # GREEK LOWER NUMERAL SIGN (KERAIA)
-    0x05F3: 'CONTEXTO',    # HEBREW PUNCTUATION GERESH
-    0x05F4: 'CONTEXTO',    # HEBREW PUNCTUATION GERSHAYIM
-    0x30FB: 'CONTEXTO',    # KATAKANA MIDDLE DOT
-    0x0660: 'CONTEXTO',    # ARABIC-INDIC DIGIT ZERO
-    0x0661: 'CONTEXTO',    # ARABIC-INDIC DIGIT ONE
-    0x0662: 'CONTEXTO',    # ARABIC-INDIC DIGIT TWO
-    0x0663: 'CONTEXTO',    # ARABIC-INDIC DIGIT THREE
-    0x0664: 'CONTEXTO',    # ARABIC-INDIC DIGIT FOUR
-    0x0665: 'CONTEXTO',    # ARABIC-INDIC DIGIT FIVE
-    0x0666: 'CONTEXTO',    # ARABIC-INDIC DIGIT SIX
-    0x0667: 'CONTEXTO',    # ARABIC-INDIC DIGIT SEVEN
-    0x0668: 'CONTEXTO',    # ARABIC-INDIC DIGIT EIGHT
-    0x0669: 'CONTEXTO',    # ARABIC-INDIC DIGIT NINE
-    0x06F0: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT ZERO
-    0x06F1: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT ONE
-    0x06F2: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT TWO
-    0x06F3: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT THREE
-    0x06F4: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT FOUR
-    0x06F5: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT FIVE
-    0x06F6: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT SIX
-    0x06F7: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT SEVEN
-    0x06F8: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT EIGHT
-    0x06F9: 'CONTEXTO',    # EXTENDED ARABIC-INDIC DIGIT NINE
-    0x0640: 'DISALLOWED',  # ARABIC TATWEEL
-    0x07FA: 'DISALLOWED',  # NKO LAJANYALAN
-    0x302E: 'DISALLOWED',  # HANGUL SINGLE DOT TONE MARK
-    0x302F: 'DISALLOWED',  # HANGUL DOUBLE DOT TONE MARK
-    0x3031: 'DISALLOWED',  # VERTICAL KANA REPEAT MARK
-    0x3032: 'DISALLOWED',  # VERTICAL KANA REPEAT WITH VOICED SOUND MARK
-    0x3033: 'DISALLOWED',  # VERTICAL KANA REPEAT MARK UPPER HALF
-    0x3034: 'DISALLOWED',  # VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HA
-    0x3035: 'DISALLOWED',  # VERTICAL KANA REPEAT MARK LOWER HALF
-    0x303B: 'DISALLOWED',  # VERTICAL IDEOGRAPHIC ITERATION MARK
+    0x00DF: "PVALID",  # LATIN SMALL LETTER SHARP S
+    0x03C2: "PVALID",  # GREEK SMALL LETTER FINAL SIGMA
+    0x06FD: "PVALID",  # ARABIC SIGN SINDHI AMPERSAND
+    0x06FE: "PVALID",  # ARABIC SIGN SINDHI POSTPOSITION MEN
+    0x0F0B: "PVALID",  # TIBETAN MARK INTERSYLLABIC TSHEG
+    0x3007: "PVALID",  # IDEOGRAPHIC NUMBER ZERO
+    0x00B7: "CONTEXTO",  # MIDDLE DOT
+    0x0375: "CONTEXTO",  # GREEK LOWER NUMERAL SIGN (KERAIA)
+    0x05F3: "CONTEXTO",  # HEBREW PUNCTUATION GERESH
+    0x05F4: "CONTEXTO",  # HEBREW PUNCTUATION GERSHAYIM
+    0x30FB: "CONTEXTO",  # KATAKANA MIDDLE DOT
+    0x0660: "CONTEXTO",  # ARABIC-INDIC DIGIT ZERO
+    0x0661: "CONTEXTO",  # ARABIC-INDIC DIGIT ONE
+    0x0662: "CONTEXTO",  # ARABIC-INDIC DIGIT TWO
+    0x0663: "CONTEXTO",  # ARABIC-INDIC DIGIT THREE
+    0x0664: "CONTEXTO",  # ARABIC-INDIC DIGIT FOUR
+    0x0665: "CONTEXTO",  # ARABIC-INDIC DIGIT FIVE
+    0x0666: "CONTEXTO",  # ARABIC-INDIC DIGIT SIX
+    0x0667: "CONTEXTO",  # ARABIC-INDIC DIGIT SEVEN
+    0x0668: "CONTEXTO",  # ARABIC-INDIC DIGIT EIGHT
+    0x0669: "CONTEXTO",  # ARABIC-INDIC DIGIT NINE
+    0x06F0: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT ZERO
+    0x06F1: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT ONE
+    0x06F2: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT TWO
+    0x06F3: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT THREE
+    0x06F4: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT FOUR
+    0x06F5: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT FIVE
+    0x06F6: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT SIX
+    0x06F7: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT SEVEN
+    0x06F8: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT EIGHT
+    0x06F9: "CONTEXTO",  # EXTENDED ARABIC-INDIC DIGIT NINE
+    0x0640: "DISALLOWED",  # ARABIC TATWEEL
+    0x07FA: "DISALLOWED",  # NKO LAJANYALAN
+    0x302E: "DISALLOWED",  # HANGUL SINGLE DOT TONE MARK
+    0x302F: "DISALLOWED",  # HANGUL DOUBLE DOT TONE MARK
+    0x3031: "DISALLOWED",  # VERTICAL KANA REPEAT MARK
+    0x3032: "DISALLOWED",  # VERTICAL KANA REPEAT WITH VOICED SOUND MARK
+    0x3033: "DISALLOWED",  # VERTICAL KANA REPEAT MARK UPPER HALF
+    0x3034: "DISALLOWED",  # VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HA
+    0x3035: "DISALLOWED",  # VERTICAL KANA REPEAT MARK LOWER HALF
+    0x303B: "DISALLOWED",  # VERTICAL IDEOGRAPHIC ITERATION MARK
 }
 backwardscompatible = {}
 
@@ -88,33 +83,45 @@
 def hexrange(start, end):
     return range(int(start, 16), int(end, 16) + 1)
 
+
 def hexvalue(value):
     return int(value, 16)
 
 
-class UnicodeVersion(object):
+_RE_UNICODE = re.compile("\\\\u([0-9a-fA-F]{4})")
+_RE_SURROGATE = re.compile("[\ud800-\udbff][\udc00-\udfff]")
 
+
+def unicode_fixup(string):
+    """Replace backslash-u-XXXX with appropriate unicode characters."""
+    return _RE_SURROGATE.sub(
+        lambda match: chr((ord(match.group(0)[0]) - 0xD800) * 0x400 + ord(match.group(0)[1]) - 0xDC00 + 0x10000),
+        _RE_UNICODE.sub(lambda match: chr(int(match.group(1), 16)), string),
+    )
+
+
+class UnicodeVersion(object):
     def __init__(self, version):
-        result = re.match(r'^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)$', version)
+        result = re.match(r"^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)$", version)
         if result:
-            self.major = int(result.group('major'))
-            self.minor = int(result.group('minor'))
-            self.patch = int(result.group('patch'))
+            self.major = int(result.group("major"))
+            self.minor = int(result.group("minor"))
+            self.patch = int(result.group("patch"))
             self.numerical = (self.major << 8) + (self.minor << 4) + self.patch
             self.latest = False
-        elif version == 'latest':
+        elif version == "latest":
             self.latest = True
         else:
-            raise ValueError('Unrecognized Unicode version')
+            raise ValueError("Unrecognized Unicode version")
 
     def __repr__(self, with_date=True):
         if self.latest:
             if with_date:
-                return 'latest@{}'.format(datetime.datetime.now().strftime('%Y-%m-%d'))
+                return "latest@{}".format(datetime.datetime.now().strftime("%Y-%m-%d"))
             else:
-                return 'latest'
+                return "latest"
         else:
-            return '{}.{}.{}'.format(self.major, self.minor, self.patch)
+            return "{}.{}.{}".format(self.major, self.minor, self.patch)
 
     @property
     def tag(self):
@@ -132,7 +139,6 @@
 
 
 class UnicodeData(object):
-
     def __init__(self, version, cache, args):
         self.version = UnicodeVersion(version)
         self.system_version = UnicodeVersion(unicodedata.unidata_version)
@@ -141,8 +147,11 @@
         self.max = 0
 
         if self.system_version < self.version:
-            print('Warning: Character stability not guaranteed as Python Unicode data {}'
-                   ' older than requested {}'.format(self.system_version, self.version))
+            print(
+                "Warning: Character stability not guaranteed as Python Unicode data {} older than requested {}".format(
+                    self.system_version, self.version
+                )
+            )
 
         self._load_unicodedata()
         self._load_proplist()
@@ -153,174 +162,160 @@
         self._load_arabicshaping()
         self._load_scripts()
         self._load_uts46mapping()
+        self._load_uts46testvectors()
 
     def _load_unicodedata(self):
-
-        f_ud = self._ucdfile('UnicodeData.txt')
+        f_ud = self._ucdfile("UnicodeData.txt")
         self.ucd_data = {}
         range_begin = None
         for line in f_ud.splitlines():
-            fields = line.split(';')
+            fields = line.split(";")
             value = int(fields[0], 16)
-            start_marker = re.match('^<(?P<name>.*?), First>$', fields[1])
-            end_marker = re.match('^<(?P<name>.*?), Last>$', fields[1])
+            start_marker = re.match("^<(?P<name>.*?), First>$", fields[1])
+            end_marker = re.match("^<(?P<name>.*?), Last>$", fields[1])
             if start_marker:
                 range_begin = value
             elif end_marker:
-                for i in range(range_begin, value+1):
-                    fields[1] = '<{}>'.format(end_marker.group('name'))
+                for i in range(range_begin, value + 1):
+                    fields[1] = "<{}>".format(end_marker.group("name"))
                     self.ucd_data[i] = fields[1:]
                 range_begin = None
             else:
                 self.ucd_data[value] = fields[1:]
 
     def _load_proplist(self):
-
-        f_pl = self._ucdfile('PropList.txt')
+        f_pl = self._ucdfile("PropList.txt")
         self.ucd_props = collections.defaultdict(list)
         for line in f_pl.splitlines():
-            result = re.match(
-                r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$',
-                line)
+            result = re.match(r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$", line)
             if result:
-                if result.group('end'):
-                    for i in hexrange(result.group('start'), result.group('end')):
-                        self.ucd_props[i].append(result.group('prop'))
+                if result.group("end"):
+                    for i in hexrange(result.group("start"), result.group("end")):
+                        self.ucd_props[i].append(result.group("prop"))
                 else:
-                    i = hexvalue(result.group('start'))
-                    self.ucd_props[i].append(result.group('prop'))
+                    i = hexvalue(result.group("start"))
+                    self.ucd_props[i].append(result.group("prop"))
 
     def _load_derivedcoreprops(self):
-
-        f_dcp = self._ucdfile('DerivedCoreProperties.txt')
+        f_dcp = self._ucdfile("DerivedCoreProperties.txt")
         for line in f_dcp.splitlines():
-            result = re.match(
-                r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$',
-                line)
+            result = re.match(r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<prop>\S+)\s*(|\#.*)$", line)
             if result:
-                if result.group('end'):
-                    for i in hexrange(result.group('start'), result.group('end')):
-                        self.ucd_props[i].append(result.group('prop'))
+                if result.group("end"):
+                    for i in hexrange(result.group("start"), result.group("end")):
+                        self.ucd_props[i].append(result.group("prop"))
                 else:
-                    i = hexvalue(result.group('start'))
-                    self.ucd_props[i].append(result.group('prop'))
+                    i = hexvalue(result.group("start"))
+                    self.ucd_props[i].append(result.group("prop"))
 
     def _load_blocks(self):
-
         self.ucd_block = {}
-        f_b = self._ucdfile('Blocks.txt')
+        f_b = self._ucdfile("Blocks.txt")
         for line in f_b.splitlines():
-            result = re.match(
-                r'^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<block>.*)\s*$',
-                line)
+            result = re.match(r"^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<block>.*)\s*$", line)
             if result:
-                for i in hexrange(result.group('start'), result.group('end')):
-                    self.ucd_block[i] = result.group('block')
+                for i in hexrange(result.group("start"), result.group("end")):
+                    self.ucd_block[i] = result.group("block")
                     self.max = max(self.max, i)
 
     def _load_casefolding(self):
-
         self.ucd_cf = {}
-        f_cf = self._ucdfile('CaseFolding.txt')
+        f_cf = self._ucdfile("CaseFolding.txt")
         for line in f_cf.splitlines():
-            result = re.match(
-                r'^(?P<cp>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*;\s*(?P<subst>[0-9A-F\s]+)\s*',
-                line)
+            result = re.match(r"^(?P<cp>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*;\s*(?P<subst>[0-9A-F\s]+)\s*", line)
             if result:
-                if result.group('type') in ('C', 'F'):
-                    self.ucd_cf[int(result.group('cp'), 16)] = \
-                        ''.join([chr(int(x, 16)) for x in result.group('subst').split(' ')])
+                if result.group("type") in ("C", "F"):
+                    self.ucd_cf[int(result.group("cp"), 16)] = "".join(
+                        [chr(int(x, 16)) for x in result.group("subst").split(" ")]
+                    )
 
     def _load_hangulst(self):
-
         self.ucd_hst = {}
-        f_hst = self._ucdfile('HangulSyllableType.txt')
+        f_hst = self._ucdfile("HangulSyllableType.txt")
         for line in f_hst.splitlines():
-            result = re.match(
-                r'^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*(|\#.*)$',
-                line)
+            result = re.match(r"^(?P<start>[0-9A-F]{4,6})\.\.(?P<end>[0-9A-F]{4,6})\s*;\s*(?P<type>\S+)\s*(|\#.*)$", line)
             if result:
-                for i in hexrange(result.group('start'), result.group('end')):
-                    self.ucd_hst[i] = result.group('type')
+                for i in hexrange(result.group("start"), result.group("end")):
+                    self.ucd_hst[i] = result.group("type")
 
     def _load_arabicshaping(self):
-
         self.ucd_as = {}
-        f_as = self._ucdfile('extracted/DerivedJoiningType.txt')
+        f_as = self._ucdfile("extracted/DerivedJoiningType.txt")
         for line in f_as.splitlines():
-            result = re.match(
-                r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<jt>\S+)\s*(|\#.*)$',
-                line)
+            result = re.match(r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<jt>\S+)\s*(|\#.*)$", line)
             if result:
-                if result.group('end'):
-                    for i in hexrange(result.group('start'), result.group('end')):
-                        self.ucd_as[i] = result.group('jt')
+                if result.group("end"):
+                    for i in hexrange(result.group("start"), result.group("end")):
+                        self.ucd_as[i] = result.group("jt")
                 else:
-                    i = hexvalue(result.group('start'))
-                    self.ucd_as[i] = result.group('jt')
+                    i = hexvalue(result.group("start"))
+                    self.ucd_as[i] = result.group("jt")
 
     def _load_scripts(self):
-
         self.ucd_s = {}
-        f_s = self._ucdfile('Scripts.txt')
+        f_s = self._ucdfile("Scripts.txt")
         for line in f_s.splitlines():
-            result = re.match(
-                r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<script>\S+)\s*(|\#.*)$',
-                line)
+            result = re.match(r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<script>\S+)\s*(|\#.*)$", line)
             if result:
-                if not result.group('script') in self.ucd_s:
-                    self.ucd_s[result.group('script')] = set()
-                if result.group('end'):
-                    for i in hexrange(result.group('start'), result.group('end')):
-                        self.ucd_s[result.group('script')].add(i)
+                if not result.group("script") in self.ucd_s:
+                    self.ucd_s[result.group("script")] = set()
+                if result.group("end"):
+                    for i in hexrange(result.group("start"), result.group("end")):
+                        self.ucd_s[result.group("script")].add(i)
                 else:
-                    i = hexvalue(result.group('start'))
-                    self.ucd_s[result.group('script')].add(i)
+                    i = hexvalue(result.group("start"))
+                    self.ucd_s[result.group("script")].add(i)
 
     def _load_uts46mapping(self):
-
         self.ucd_idnamt = {}
-        f_idnamt = self._ucdfile('IdnaMappingTable.txt', urlbase=UTS46_URL)
+        f_idnamt = self._ucdfile("IdnaMappingTable.txt", urlbase=UTS46_URL)
         for line in f_idnamt.splitlines():
-            result = re.match(
-                r'^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<fields>[^#]+)',
-                line)
+            result = re.match(r"^(?P<start>[0-9A-F]{4,6})(|\.\.(?P<end>[0-9A-F]{4,6}))\s*;\s*(?P<fields>[^#]+)", line)
             if result:
-                fields = [x.strip() for x in result.group('fields').split(';')]
-                if result.group('end'):
-                    for i in hexrange(result.group('start'), result.group('end')):
+                fields = [x.strip() for x in result.group("fields").split(";")]
+                if result.group("end"):
+                    for i in hexrange(result.group("start"), result.group("end")):
                         self.ucd_idnamt[i] = fields
                 else:
-                    i = hexvalue(result.group('start'))
+                    i = hexvalue(result.group("start"))
                     self.ucd_idnamt[i] = fields
 
+    def _load_uts46testvectors(self):
+        self.ucd_uts46tests = []
+        f_uts46tests = self._ucdfile("IdnaTestV2.txt", urlbase=UTS46_URL)
+        for lineno, line in enumerate(f_uts46tests.splitlines()):
+            if "#" in line:
+                line = line.split("#", 1)[0]
+            if not line:
+                continue
+            self.ucd_uts46tests.append((lineno + 1, tuple(field.strip() for field in unicode_fixup(line).split(";"))))
+
     def _ucdfile(self, filename, urlbase=UCD_URL):
         if self.source:
-            f = open('{}/{}'.format(self.source, filename))
+            f = open("{}/{}".format(self.source, filename))
             return f.read()
         else:
             cache_file = None
             if self.cache:
-                cache_file = os.path.expanduser('{}/{}/{}'.format(
-                    self.cache, self.version.tag, filename))
+                cache_file = os.path.expanduser("{}/{}/{}".format(self.cache, self.version.tag, filename))
                 if os.path.isfile(cache_file):
                     f = open(cache_file)
                     return f.read()
 
             version_path = self.version.tag
-            if version_path == 'latest':
-                version_path = 'UCD/latest'
+            if version_path == "latest":
+                version_path = "UCD/latest"
             url = urlbase.format(
                 version=version_path,
                 filename=filename,
             )
-            content = urlopen(url).read().decode('utf-8')
+            content = urlopen(url).read().decode("utf-8")
 
             if cache_file:
                 if not os.path.isdir(os.path.dirname(cache_file)):
                     os.makedirs(os.path.dirname(cache_file))
-                f = open(cache_file, 'wb')
-                f.write(content.encode('utf-8'))
+                f = open(cache_file, "wb")
+                f.write(content.encode("utf-8"))
                 f.close()
 
             return str(content)
@@ -331,13 +326,12 @@
 
 
 class CodePoint:
-
     def __init__(self, value=None, ucdata=None):
         self.value = value
         self.ucdata = ucdata
 
     def _casefold(self, s):
-        r = ''
+        r = ""
         for c in s:
             r += self.ucdata.ucd_cf.get(ord(c), c)
         return r
@@ -354,10 +348,10 @@
     def name(self):
         if self.value in self.ucdata.ucd_data:
             return self.ucdata.ucd_data[self.value][0]
-        elif 'Noncharacter_Code_Point' in self.ucdata.ucd_props[self.value]:
-            return '<noncharacter>'
+        elif "Noncharacter_Code_Point" in self.ucdata.ucd_props[self.value]:
+            return "<noncharacter>"
         else:
-            return '<reserved>'
+            return "<reserved>"
 
     @property
     def general_category(self):
@@ -365,20 +359,17 @@
 
     @property
     def unassigned(self):
-        return not ('Noncharacter_Code_Point' in self.ucdata.ucd_props[self.value] or \
-                    self.value in self.ucdata.ucd_data)
+        return not ("Noncharacter_Code_Point" in self.ucdata.ucd_props[self.value] or self.value in self.ucdata.ucd_data)
 
     @property
     def ldh(self):
-        if self.value == 0x002d or \
-           self.value in range(0x0030, 0x0039+1) or \
-           self.value in range(0x0061, 0x007a+1):
+        if self.value == 0x002D or self.value in range(0x0030, 0x0039 + 1) or self.value in range(0x0061, 0x007A + 1):
             return True
         return False
 
     @property
     def join_control(self):
-        return 'Join_Control' in self.ucdata.ucd_props[self.value]
+        return "Join_Control" in self.ucdata.ucd_props[self.value]
 
     @property
     def joining_type(self):
@@ -390,8 +381,7 @@
 
     @property
     def nfkc_cf(self):
-        return unicodedata.normalize('NFKC',
-                                     self._casefold(unicodedata.normalize('NFKC', self.char)))
+        return unicodedata.normalize("NFKC", self._casefold(unicodedata.normalize("NFKC", self.char)))
 
     @property
     def unstable(self):
@@ -399,7 +389,7 @@
 
     @property
     def in_ignorableproperties(self):
-        for prop in ['Default_Ignorable_Code_Point', 'White_Space', 'Noncharacter_Code_Point']:
+        for prop in ["Default_Ignorable_Code_Point", "White_Space", "Noncharacter_Code_Point"]:
             if prop in self.ucdata.ucd_props[self.value]:
                 return True
         return False
@@ -407,17 +397,18 @@
     @property
     def in_ignorableblocks(self):
         return self.ucdata.ucd_block.get(self.value) in (
-            'Combining Diacritical Marks for Symbols', 'Musical Symbols',
-            'Ancient Greek Musical Notation'
+            "Combining Diacritical Marks for Symbols",
+            "Musical Symbols",
+            "Ancient Greek Musical Notation",
         )
 
     @property
     def oldhanguljamo(self):
-        return self.ucdata.ucd_hst.get(self.value) in ('L', 'V', 'T')
+        return self.ucdata.ucd_hst.get(self.value) in ("L", "V", "T")
 
     @property
     def in_lettersdigits(self):
-        return self.general_category in ('Ll', 'Lu', 'Lo', 'Nd', 'Lm', 'Mn', 'Mc')
+        return self.general_category in ("Ll", "Lu", "Lo", "Nd", "Lm", "Mn", "Mc")
 
     @property
     def idna2008_status(self):
@@ -426,23 +417,23 @@
         elif self.compat_value:
             return self.compat_value
         elif self.unassigned:
-            return 'UNASSIGNED'
+            return "UNASSIGNED"
         elif self.ldh:
-            return 'PVALID'
+            return "PVALID"
         elif self.join_control:
-            return 'CONTEXTJ'
+            return "CONTEXTJ"
         elif self.unstable:
-            return 'DISALLOWED'
+            return "DISALLOWED"
         elif self.in_ignorableproperties:
-            return 'DISALLOWED'
+            return "DISALLOWED"
         elif self.in_ignorableblocks:
-            return 'DISALLOWED'
+            return "DISALLOWED"
         elif self.oldhanguljamo:
-            return 'DISALLOWED'
+            return "DISALLOWED"
         elif self.in_lettersdigits:
-            return 'PVALID'
+            return "PVALID"
         else:
-            return 'DISALLOWED'
+            return "DISALLOWED"
 
     @property
     def uts46_data(self):
@@ -450,66 +441,67 @@
 
     @property
     def uts46_status(self):
-        return ' '.join(self.uts46_data)
+        return " ".join(self.uts46_data)
 
 
 def diagnose_codepoint(codepoint, args, ucdata):
-
     cp = CodePoint(codepoint, ucdata=ucdata)
 
-    print('U+{:04X}:'.format(codepoint))
-    print('   Name:             {}'.format(cp.name))
-    print('1  Exceptions:       {}'.format(exceptions.get(codepoint, False)))
-    print('2  Backwards Compat: {}'.format(backwardscompatible.get(codepoint, False)))
-    print('3  Unassigned:       {}'.format(cp.unassigned))
-    print('4  LDH:              {}'.format(cp.ldh))
-    print('   Properties:       {}'.format(' '.join(sorted(ucdata.ucd_props.get(codepoint, ['None'])))))
-    print('5  .Join Control:    {}'.format(cp.join_control))
-    print('   NFKC CF:          {}'.format(' '.join(['U+{:04X}'.format(ord(x)) for x in cp.nfkc_cf])))
-    print('6  .Unstable:        {}'.format(cp.unstable))
-    print('7  .Ignorable Prop:  {}'.format(cp.in_ignorableproperties))
-    print('   Block:            {}'.format(ucdata.ucd_block.get(codepoint, None)))
-    print('8  .Ignorable Block: {}'.format(cp.in_ignorableblocks))
-    print('   Hangul Syll Type: {}'.format(ucdata.ucd_hst.get(codepoint, None)))
-    print('9  .Old Hangul Jamo: {}'.format(cp.oldhanguljamo))
-    print('   General Category: {}'.format(cp.general_category))
-    print('10 .Letters Digits:  {}'.format(cp.in_lettersdigits))
-    print('== IDNA 2008:        {}'.format(cp.idna2008_status))
-    print('== UTS 46:           {}'.format(cp.uts46_status))
-    print('(Unicode {} [sys:{}])'.format(ucdata.version, ucdata.system_version))
+    print("U+{:04X}:".format(codepoint))
+    print("   Name:             {}".format(cp.name))
+    print("1  Exceptions:       {}".format(exceptions.get(codepoint, False)))
+    print("2  Backwards Compat: {}".format(backwardscompatible.get(codepoint, False)))
+    print("3  Unassigned:       {}".format(cp.unassigned))
+    print("4  LDH:              {}".format(cp.ldh))
+    print("   Properties:       {}".format(" ".join(sorted(ucdata.ucd_props.get(codepoint, ["None"])))))
+    print("5  .Join Control:    {}".format(cp.join_control))
+    print("   NFKC CF:          {}".format(" ".join(["U+{:04X}".format(ord(x)) for x in cp.nfkc_cf])))
+    print("6  .Unstable:        {}".format(cp.unstable))
+    print("7  .Ignorable Prop:  {}".format(cp.in_ignorableproperties))
+    print("   Block:            {}".format(ucdata.ucd_block.get(codepoint, None)))
+    print("8  .Ignorable Block: {}".format(cp.in_ignorableblocks))
+    print("   Hangul Syll Type: {}".format(ucdata.ucd_hst.get(codepoint, None)))
+    print("9  .Old Hangul Jamo: {}".format(cp.oldhanguljamo))
+    print("   General Category: {}".format(cp.general_category))
+    print("10 .Letters Digits:  {}".format(cp.in_lettersdigits))
+    print("== IDNA 2008:        {}".format(cp.idna2008_status))
+    print("== UTS 46:           {}".format(cp.uts46_status))
+    print("(Unicode {} [sys:{}])".format(ucdata.version, ucdata.system_version))
+
 
 def ucdrange(start, end):
     if start == end:
-        return ('{:04X}'.format(start.value), start.name)
+        return ("{:04X}".format(start.value), start.name)
     else:
-        return ('{:04X}..{:04X}'.format(start.value, end.value),
-                '{}..{}'.format(start.name, end.name))
+        return ("{:04X}..{:04X}".format(start.value, end.value), "{}..{}".format(start.name, end.name))
+
 
 def upper_hex(value):
     num = hex(value)
     return num[:2] + num[2:].upper()
 
+
 def optimised_list(d):
     values = intranges_from_list(d)
     if len(values) == 1:
         for value in values:
             # Respect ruff format style
-            yield '({},),'.format(upper_hex(value))
+            yield "({},),".format(upper_hex(value))
     else:
-        yield '('
+        yield "("
         for value in values:
-            yield '        {},'.format(upper_hex(value))
-        yield '    ),'
+            yield "        {},".format(upper_hex(value))
+        yield "    ),"
+
 
 def make_table(args, ucdata):
-
     last_status = None
     cps = []
     table_data = []
 
     for cp in ucdata.codepoints():
         status = cp.idna2008_status
-        if (last_status and last_status != status):
+        if last_status and last_status != status:
             (values, description) = ucdrange(cps[0], cps[-1])
             table_data.append([values, last_status, description])
             cps = []
@@ -519,63 +511,61 @@
     table_data.append([values, last_status, description])
 
     if args.dir:
-
-        f = open('{}/idna-table-{}.txt'.format(args.dir, ucdata.version), 'wb')
+        f = open("{}/idna-table-{}.txt".format(args.dir, ucdata.version), "wb")
         for row in table_data:
-            f.write('{:12}; {:12}# {:.44}\n'.format(*row).encode('ascii'))
+            f.write("{:12}; {:12}# {:.44}\n".format(*row).encode("ascii"))
         f.close()
 
     else:
-
         for row in table_data:
-            print('{:12}; {:12}# {:.44}'.format(*row))
+            print("{:12}; {:12}# {:.44}".format(*row))
+
 
 def idna_libdata(ucdata):
-
-    yield '# This file is automatically generated by tools/idna-data\n'
+    yield "# This file is automatically generated by tools/idna-data\n"
     yield '__version__ = "{}"\n'.format(ucdata.version)
 
     #
     # Script classifications are used by some CONTEXTO rules in RFC 5891
     #
-    yield 'scripts = {'
+    yield "scripts = {"
     for script in SCRIPT_WHITELIST:
         prefix = '    "{}": '.format(script)
         for line in optimised_list(ucdata.ucd_s[script]):
             yield prefix + line
-            prefix = ''
-    yield '}'
+            prefix = ""
+    yield "}"
 
     #
     # Joining types are used by CONTEXTJ rule A.1
     #
-    yield 'joining_types = {'
+    yield "joining_types = {"
     for cp in ucdata.codepoints():
         if cp.joining_type:
-            yield '    0x{:X}: {},'.format(cp.value, ord(cp.joining_type))
-    yield '}'
+            yield "    0x{:X}: {},".format(cp.value, ord(cp.joining_type))
+    yield "}"
 
     #
     # These are the classification of codepoints into PVALID, CONTEXTO, CONTEXTJ, etc.
     #
-    yield 'codepoint_classes = {'
+    yield "codepoint_classes = {"
     classes = {}
     for cp in ucdata.codepoints():
         status = cp.idna2008_status
-        if status in ('UNASSIGNED', 'DISALLOWED'):
+        if status in ("UNASSIGNED", "DISALLOWED"):
             continue
         if not status in classes:
             classes[status] = set()
         classes[status].add(cp.value)
-    for status in ['PVALID', 'CONTEXTJ', 'CONTEXTO']:
+    for status in ["PVALID", "CONTEXTJ", "CONTEXTO"]:
         prefix = '    "{}": '.format(status)
         for line in optimised_list(classes[status]):
             yield prefix + line
-            prefix = ''
-    yield '}'
+            prefix = ""
+    yield "}"
+
 
 def uts46_ranges(ucdata):
-
     last = (None, None)
     for cp in ucdata.codepoints():
         fields = cp.uts46_data
@@ -583,8 +573,8 @@
             continue
         status, mapping = UTS46_STATUSES[fields[0]]
         if mapping:
-            mapping = ''.join(chr(int(codepoint, 16)) for codepoint in fields[1].split())
-            mapping = mapping.replace('\\', '\\\\')
+            mapping = "".join(chr(int(codepoint, 16)) for codepoint in fields[1].split())
+            mapping = mapping.replace("\\", "\\\\")
         else:
             mapping = None
         if cp.value > 255 and (status, mapping) == last:
@@ -593,17 +583,17 @@
 
         if mapping is not None:
             if '"' in mapping:
-                yield '(0x{:X}, "{}", \'{}\')'.format(cp.value, status, mapping)
+                yield "(0x{:X}, \"{}\", '{}')".format(cp.value, status, mapping)
             else:
                 yield '(0x{:X}, "{}", "{}")'.format(cp.value, status, mapping)
         else:
             yield '(0x{:X}, "{}")'.format(cp.value, status)
 
-def uts46_libdata(ucdata):
 
-    yield '# This file is automatically generated by tools/idna-data'
-    yield '# vim: set fileencoding=utf-8 :\n'
-    yield 'from typing import List, Tuple, Union\n'
+def uts46_libdata(ucdata):
+    yield "# This file is automatically generated by tools/idna-data"
+    yield "# vim: set fileencoding=utf-8 :\n"
+    yield "from typing import List, Tuple, Union\n"
     yield '"""IDNA Mapping Table from UTS46."""\n\n'
 
     yield '__version__ = "{}"\n'.format(ucdata.version)
@@ -613,84 +603,192 @@
         idx += 1
         if idx % UTS46_SEGMENT_SIZE == 0:
             if idx != 0:
-                yield '    ]\n'
-            yield '\ndef _seg_{}() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]:\n    return ['.format(idx // UTS46_SEGMENT_SIZE)
-        yield '        {},'.format(row)
-    yield '    ]\n'
+                yield "    ]\n"
+            yield "\ndef _seg_{}() -> List[Union[Tuple[int, str], Tuple[int, str, str]]]:\n    return [".format(
+                idx // UTS46_SEGMENT_SIZE
+            )
+        yield "        {},".format(row)
+    yield "    ]\n"
 
-    yield '\nuts46data = tuple('
-    yield '    _seg_0()'
+    yield "\nuts46data = tuple("
+    yield "    _seg_0()"
     for i in range(1, idx // UTS46_SEGMENT_SIZE + 1):
-        yield '    + _seg_{}()'.format(i)
-    yield ')  # type: Tuple[Union[Tuple[int, str], Tuple[int, str, str]], ...]'
+        yield "    + _seg_{}()".format(i)
+    yield ")  # type: Tuple[Union[Tuple[int, str], Tuple[int, str, str]], ...]"
+
+
+def uts46_tests(ucdata):
+    """Yield the lines of a generated unittest module built from the UTS46 test vectors.
+
+    Walks the (lineno, fields) pairs in ucdata.ucd_uts46tests, fills in blank fields with
+    their defaults, and emits an idna.decode() assertion for each vector whose toUnicode
+    status is "[]"; the generated module imports idna so those assertions can run.
+    """
+    yield "# This file is automatically generated by tools/idna-data\n"
+    yield "import unittest\n"
+    yield "import idna\n\n"
+    yield "class UTS46Tests(unittest.TestCase):\n"
+    yield "    def test_uts46(self):\n"
+
+    for lineno, fields in ucdata.ucd_uts46tests:
+        source, to_unicode, to_unicode_status, to_ascii, to_ascii_status, to_ascii_t, to_ascii_t_status = fields
+
+        # In the test vectors a literal "" stands for the empty string, while a blank
+        # field falls back to the preceding column (to_unicode falls back to source).
+        if to_unicode == '""':
+            to_unicode = ""
+        elif not to_unicode:
+            to_unicode = source
+        if not to_unicode_status:
+            to_unicode_status = "[]"
+        if to_ascii == '""':
+            to_ascii = ""
+        elif not to_ascii:
+            to_ascii = to_unicode
+        if not to_ascii_status:
+            to_ascii_status = to_unicode_status
+        if to_ascii_t == '""':
+            to_ascii_t = ""
+        elif not to_ascii_t:
+            to_ascii_t = to_ascii
+        if not to_ascii_t_status:
+            to_ascii_t_status = to_ascii_status
+
+        # Is this label IDNA 2008 legal according to UTS46 mapping table?
+        nv8 = None
+        for codepoint in to_unicode:
+            try:
+                field = ucdata.ucd_idnamt[ord(codepoint)][2]
+            except LookupError:  # covers both IndexError and KeyError
+                field = ""
+            if field == "NV8" or field == "XV8":
+                nv8 = ord(codepoint)
+
+        yield "        # line {}: {}".format(lineno, repr(fields))
+        if nv8 is not None:
+            yield "        # nv8: U+{:X}".format(nv8)
+
+        if to_unicode_status == "[]":
+            # repr() the whole message so quotes in a vector cannot break the generated literal.
+            message = "Expected {!r} as decode() output".format(to_unicode)
+            yield "        self.assertEqual(idna.decode({!r}, uts46=True, strict=True), {!r}, {!r})".format(
+                source, to_unicode, message
+            )
+
+        yield ""
+
+
+#        self.assertEqual(idna.uts46_remap("A_", std3_rules=False), "a_")
+#        self.assertRaises(idna.InvalidCodepoint, idna.uts46_remap, "A_", std3_rules=True)
+
+
+"""
+    try:
+            output = idna.decode(source, uts46=True, strict=True)
+            if to_unicode_status != "[]":
+                self.fail("decode() did not emit required error {} for {}".format(to_unicode, repr(source)))
+            self.assertEqual(output, to_unicode, "unexpected decode() output")
+        except (idna.IDNAError, UnicodeError, ValueError) as exc:
+            if str(exc).startswith("Unknown"):
+                raise unittest.SkipTest("Test requires support for a newer version of Unicode than this Python supports")
+            if to_unicode_status == "[]":
+                raise
+
+        try:
+            output = idna.encode(source, uts46=True, strict=True).decode("ascii")
+            if to_ascii_status != "[]":
+                self.fail("encode() did not emit required error {} for {}".format(to_ascii_status, repr(source)))
+            self.assertEqual(output, to_ascii, "unexpected encode() output")
+        except (idna.IDNAError, UnicodeError, ValueError) as exc:
+            if str(exc).startswith("Unknown"):
+                raise unittest.SkipTest("Test requires support for a newer version of Unicode than this Python supports")
+            if to_ascii_status == "[]":
+                raise
+
+        try:
+            output = idna.encode(source, uts46=True, strict=True, transitional=True).decode("ascii")
+            if to_ascii_t_status != "[]":
+                self.fail(
+                    "encode(transitional=True) did not emit required error {} for {}".format(to_ascii_t_status, repr(source))
+                )
+            self.assertEqual(output, to_ascii_t, "unexpected encode() output")
+        except (idna.IDNAError, UnicodeError, ValueError) as exc:
+            if str(exc).startswith("Unknown"):
+                raise unittest.SkipTest("Test requires support for a newer version of Unicode than this Python supports")
+            if to_ascii_t_status == "[]":
+                raise
+"""
+
 
 def make_libdata(args, ucdata):
+    dest_dir = args.dir or "."
 
-    dest_dir = args.dir or '.'
-
-    target_filename = os.path.join(dest_dir, 'idnadata.py')
-    with open(target_filename, 'wb') as target:
+    target_filename = os.path.join(dest_dir, "idnadata.py")
+    with open(target_filename, "wb") as target:
         for line in idna_libdata(ucdata):
-            target.write((line + '\n').encode('utf-8'))
+            target.write((line + "\n").encode("utf-8"))
 
-    target_filename = os.path.join(dest_dir, 'uts46data.py')
-    with open(target_filename, 'wb') as target:
+    target_filename = os.path.join(dest_dir, "uts46data.py")
+    with open(target_filename, "wb") as target:
         for line in uts46_libdata(ucdata):
-            target.write((line + '\n').encode('utf-8'))
+            target.write((line + "\n").encode("utf-8"))
+
+    target_filename = os.path.join(dest_dir, "test_idna_uts46.py")
+    with open(target_filename, "wb") as target:
+        for line in uts46_tests(ucdata):
+            target.write((line + "\n").encode("utf-8"))
+
 
 def arg_error(message, parser):
-
     parser.print_usage()
-    print('{}: error: {}'.format(sys.argv[0], message))
+    print("{}: error: {}".format(sys.argv[0], message))
     sys.exit(2)
 
+
 def main():
+    parser = argparse.ArgumentParser(description="Determine IDNA code-point validity data")
+    parser.add_argument(
+        "action", type=str, default="preferred", help="Task to perform (make-libdata, make-table, <codepoint>)"
+    )
 
-    parser = argparse.ArgumentParser(description='Determine IDNA code-point validity data')
-    parser.add_argument('action', type=str, default='preferred',
-                        help='Task to perform (make-libdata, make-tables, <codepoint>)')
+    parser.add_argument("--version", type=str, default="preferred", help="Unicode version to use (preferred, latest, <x.y.z>)")
+    parser.add_argument("--source", type=str, default=None, help="Where to fetch Unicode data (file path)")
+    parser.add_argument("--dir", type=str, default=None, help="Where to export the output")
+    parser.add_argument("--cache", type=str, default=None, help="Where to cache Unicode data")
+    parser.add_argument("--no-cache", action="store_true", help="Don't cache Unicode data")
+    libdata = parser.add_argument_group("make-libdata", "Make module data for Python IDNA library")
 
-    parser.add_argument('--version', type=str, default='preferred',
-                        help='Unicode version to use (preferred, latest, <x.y.z>)')
-    parser.add_argument('--source', type=str, default=None,
-                        help='Where to fetch Unicode data (file path)')
-    parser.add_argument('--dir', type=str, default=None, help='Where to export the output')
-    parser.add_argument('--cache', type=str, default=None, help='Where to cache Unicode data')
-    parser.add_argument('--no-cache', action='store_true', help='Don\'t cache Unicode data')
-    libdata = parser.add_argument_group('make-libdata', 'Make module data for Python IDNA library')
+    tables = parser.add_argument_group("make-table", "Make IANA-style reference table")
 
-    tables = parser.add_argument_group('make-table', 'Make IANA-style reference table')
-
-    codepoint = parser.add_argument_group('codepoint',
-                                          'Display related data for given codepoint (e.g. U+0061)')
+    codepoint = parser.add_argument_group("codepoint", "Display related data for given codepoint (e.g. U+0061)")
 
     args = parser.parse_args()
 
-    if args.version == 'preferred':
+    if args.version == "preferred":
         target_version = PREFERRED_VERSION
     else:
         target_version = args.version
 
     if args.cache and args.no_cache:
-        arg_error('I can\'t both --cache and --no-cache', parser)
+        arg_error("I can't both --cache and --no-cache", parser)
     cache = args.cache or DEFAULT_CACHE_DIR
     if args.no_cache:
         cache = None
 
     ucdata = UnicodeData(target_version, cache, args)
 
-    if args.action == 'make-table':
+    if args.action == "make-table":
         make_table(args, ucdata)
-    elif args.action == 'make-libdata':
+    elif args.action == "make-libdata":
         make_libdata(args, ucdata)
     else:
-        result = re.match(r'(?i)^(U\+|)(?P<cp>[0-9A-F]{4,6})$', args.action)
+        result = re.match(r"(?i)^(U\+|)(?P<cp>[0-9A-F]{4,6})$", args.action)
         if result:
-            codepoint = int(result.group('cp'), 16)
+            codepoint = int(result.group("cp"), 16)
             diagnose_codepoint(codepoint, args, ucdata)
             sys.exit(0)
-        arg_error('Don\'t recognize action or codepoint value', parser)
-        
+        arg_error("Don't recognize action or codepoint value", parser)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()