# This script can convert IdnaTestV2.txt to JSON, accounting for the requirements in the
# URL Standard.
#
# The goal is to eventually remove --exclude-std3 and --exclude-bidi. For that we need solutions to
# these issues:
#
# * https://github.com/whatwg/url/issues/341
# * https://github.com/whatwg/url/issues/543
# * https://github.com/whatwg/url/issues/733
# * https://github.com/whatwg/url/issues/744
#
# Removal of --exclude-ipv4-like is a stretch goal also dependent upon those issues.
| |
import argparse
import json
import os
import re

import requests
| |
def get_IdnaTestV2_lines():
    """Return the lines of IdnaTestV2.txt, downloading the file first if needed.

    The file is looked up next to this script. When it does not exist yet, the
    latest version is fetched from unicode.org and cached at that location.

    Returns:
        A list with one string per line, each still carrying its line ending.

    Raises:
        requests.HTTPError: If the download answers with an error status.
            Nothing is written in that case, so an error page never ends up
            cached as test data.
    """
    IdnaTestV2 = os.path.join(os.path.dirname(__file__), "IdnaTestV2.txt")
    if not os.path.exists(IdnaTestV2):
        # Download IdnaTestV2.txt if it doesn't exist yet
        response = requests.get("https://unicode.org/Public/idna/latest/IdnaTestV2.txt")
        response.raise_for_status()
        # The file is published as UTF-8; do not depend on the charset that
        # requests would otherwise guess from the response headers.
        response.encoding = "utf-8"
        with open(IdnaTestV2, "w", encoding="utf-8") as handle:
            handle.write(response.text)
    with open(IdnaTestV2, "r", encoding="utf-8") as handle:
        return handle.readlines()
| |
def remove_escapes(input):
    r"""Decode JSON-style escape sequences (such as \uXXXX) found in input.

    The text is wrapped in double quotes and handed to the JSON parser, so
    exactly the escapes that are valid inside a JSON string are understood.

    Args:
        input: Text that may contain JSON string escapes.

    Returns:
        The text with every escape replaced by the character it denotes.

    Raises:
        json.JSONDecodeError: If input is not valid as the inside of a JSON
            string (for example, it contains an unescaped double quote).
    """
    return json.loads(f'"{input}"')
| |
def get_column_value(input, default=""):
    """Turn one raw (already stripped) column into the string it stands for.

    Args:
        input: The column text.
        default: Value to use when the column is blank.

    Returns:
        default when the column is blank, the empty string when the column is
        the two-character token "" (two double quotes), and otherwise the
        column text with its escapes decoded by remove_escapes().
    """
    if input == "":
        return default
    # "" means an empty string
    if input == "\"\"":
        return ""
    # Remove escapes (doesn't handle \x{XXXX} but those do not appear in the source)
    return remove_escapes(input)
| |
def ends_in_a_number(input):
    """Report whether the last label of input consists solely of ASCII digits.

    Labels are separated by any of U+002E, U+FF0E, U+3002 or U+FF61. A single
    trailing separator is ignored, so "a.1." is treated like "a.1".

    Args:
        input: The domain string to inspect.

    Returns:
        True when the last label is non-empty and made up of ASCII digits only,
        False otherwise (including for the empty string).
    """
    # This method is not robust. It uses https://www.unicode.org/reports/tr46/#Notation but there
    # are likely other ways to end up with a dot, e.g., through decomposition or percent-decoding.
    # It also does not entirely match https://url.spec.whatwg.org/#ends-in-a-number-checker. It
    # appears to suffice for the tests in question though.
    labels = re.split(r"[\u002E\uFF0E\u3002\uFF61]", input)
    # re.split() always yields at least one element, so labels[-1] is safe.
    if labels[-1] == "":
        if len(labels) == 1:
            return False
        # Drop the empty label produced by a trailing separator.
        labels.pop()
    last = labels[-1]
    # isascii() keeps non-ASCII digits (which isdigit() alone accepts) out.
    return last.isascii() and last.isdigit()
| |
def contains_bidi_status(statuses):
    """Report whether any of the given status codes is one of B1 through B6.

    Args:
        statuses: Iterable of status code strings.

    Returns:
        True if at least one element equals "B1", "B2", "B3", "B4", "B5" or
        "B6"; False otherwise (including for an empty iterable).
    """
    bidi_statuses = {"B1", "B2", "B3", "B4", "B5", "B6"}
    return any(status in bidi_statuses for status in statuses)
| |
def parse(lines, exclude_ipv4_like, exclude_std3, exclude_bidi):
    """Convert the lines of IdnaTestV2.txt into entries for the JSON resource.

    Args:
        lines: Iterable of raw lines (line endings allowed).
        exclude_ipv4_like: When true, skip tests whose source ends in a number
            according to ends_in_a_number().
        exclude_std3: When true, skip tests whose toUnicode value contains one
            of the characters <, >, :, /, ?, # or backslash.
        exclude_bidi: When true, skip tests that still carry a B1-B6 status
            once the ignored statuses have been removed.

    Returns:
        A dict with two keys. "tests" is a list that starts with two header
        strings and continues with one dict per kept test ("input", "output"
        and, when there are statuses, "comment"); "output" is None when a
        non-ignored status remains. "unique_statuses" is the sorted list of
        every status code seen on lines that were not skipped by the IPv4-like
        or STD3 exclusions.

    Raises:
        ValueError: If a non-blank status column is not wrapped in brackets.
    """
    # Main quest.
    output = ["THIS IS A GENERATED FILE. PLEASE DO NOT MODIFY DIRECTLY. See ../tools/IdnaTestV2-parser.py instead."]
    output.append(f"--exclude-ipv4-like: {exclude_ipv4_like}; --exclude-std3: {exclude_std3}; --exclude-bidi: {exclude_bidi}")

    # Side quest.
    unique_statuses = set()

    # Loop invariants.
    std3_impacted = re.compile(r"[<>:/?#\\]")
    # The URL Standard has
    #
    # * UseSTD3ASCIIRules=false; however there are no tests marked U1 (some should be though)
    # * CheckHyphens=false; thus ignore V2, V3?
    # * VerifyDnsLength=false; thus ignore A4_1 and A4_2
    ignorable = frozenset(("A4_1", "A4_2", "U1", "V2", "V3"))

    for line in lines:
        # Remove newlines
        line = line.rstrip()

        # Remove lines that are comments or empty
        if line.startswith("#") or line == "":
            continue

        # Normalize columns
        #
        # Since we are only interested in ToASCII and enforce Transitional_Processing=false we care
        # about the following columns:
        #
        # * Column 1 (source)
        # * Column 4 (toAsciiN)
        # * Column 5 (toAsciiNStatus)
        #
        # We also store Column 2 (toUnicode) to help with UseSTD3ASCIIRules exclusion.
        columns = [column.strip() for column in line.split(";")]

        # Column 1 (source) and Column 2 (toUnicode; if empty, Column 1 (source))
        source = get_column_value(columns[0])
        to_unicode = get_column_value(columns[1], source)

        # Immediately exclude IPv4-like tests when desired. While we could force all their
        # expectations to be failure instead, it's not clear we need that many additional tests that
        # were actually trying to test something else.
        if exclude_ipv4_like and ends_in_a_number(source):
            continue

        if exclude_std3 and std3_impacted.search(to_unicode):
            continue

        # Column 4 (toAsciiN; if empty, use Column 2 (toUnicode))
        to_ascii = get_column_value(columns[3], to_unicode)

        # Column 5 (toAsciiNStatus; if empty, use Column 3 (toUnicodeStatus))
        temp_statuses = columns[4]
        if temp_statuses == "":
            temp_statuses = columns[2]

        statuses = []
        if temp_statuses != "":
            if not (temp_statuses.startswith("[") and temp_statuses.endswith("]")):
                raise ValueError(f"Malformed status column: {temp_statuses!r}")
            # An explicit [] means "no error codes"; dropping empty entries keeps it from
            # turning into a phantom status that would mark the test as a failure.
            statuses = [
                status
                for status in (part.strip() for part in temp_statuses[1:-1].split(","))
                if status
            ]

        # Side quest time.
        unique_statuses.update(statuses)

        # Split off the statuses the URL Standard does not care about, preserving order.
        ignored_statuses = [status for status in statuses if status in ignorable]
        statuses = [status for status in statuses if status not in ignorable]

        if exclude_bidi and contains_bidi_status(statuses):
            continue

        # Any remaining status means ToASCII is expected to fail.
        if statuses:
            to_ascii = None

        test = {"input": source, "output": to_ascii}
        comment_parts = statuses + [f"{status} (ignored)" for status in ignored_statuses]
        if comment_parts:
            test["comment"] = "; ".join(comment_parts)
        output.append(test)

    return {"tests": output, "unique_statuses": sorted(unique_statuses)}
| |
def to_json(data):
    """Write data as indented JSON to ../resources/IdnaTestV2.json.

    The path is resolved relative to the directory of this script. Keys are
    sorted, NaN/Infinity are rejected, and the file ends with a newline.

    Args:
        data: A JSON-serializable value.

    Raises:
        TypeError, ValueError: If data cannot be serialized. Serialization
            happens before the file is opened, so an existing file is left
            untouched in that case.
    """
    serialized = json.dumps(data, sort_keys=True, allow_nan=False, indent=2, separators=(',', ': '))
    path = os.path.join(os.path.dirname(__file__), "../resources/IdnaTestV2.json")
    # The context manager closes the handle even if a write fails.
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(serialized)
        handle.write("\n")
| |
def main():
    """Command-line entry point.

    With --statuses the unique statuses returned by parse() are printed.
    Otherwise, with --generate, the parsed tests are written out through
    to_json(). When neither flag is given only the usage line is printed.
    The --exclude-* flags are passed straight through to parse().
    """
    parser = argparse.ArgumentParser(epilog="Thanks for caring about IDNA!")
    parser.add_argument("--generate", action="store_true", help="Generate the JSON resource.")
    parser.add_argument("--exclude-ipv4-like", action="store_true", help="Exclude inputs that end with an ASCII digit label. (Not robust, but works for current input.)")
    parser.add_argument("--exclude-std3", action="store_true", help="Exclude tests impacted by UseSTD3ASCIIRules. (Not robust, but works for current input.)")
    parser.add_argument("--exclude-bidi", action="store_true", help="Exclude tests impacted by CheckBidi.")
    parser.add_argument("--statuses", action="store_true", help="Print the unique statuses in IdnaTestV2.txt.")
    args = parser.parse_args()

    # Nothing to do without one of the two action flags.
    if not (args.generate or args.statuses):
        parser.print_usage()
        return

    output = parse(get_IdnaTestV2_lines(), args.exclude_ipv4_like, args.exclude_std3, args.exclude_bidi)
    if args.statuses:
        # --statuses takes precedence: nothing is generated when both flags are given.
        print(output["unique_statuses"])
    else:
        to_json(output["tests"])
| |
# Run only when executed as a script, so the helpers can be imported (for
# example by tests) without parsing the command line.
if __name__ == "__main__":
    main()