# url/tools/IdnaTestV2-parser.py - external/w3c/web-platform-tests - Git at Google

 # This script can convert IdnaTestV2.txt to JSON, accounting for the requirements in the
 # URL Standard.
 #
 # The goal is to eventually remove --exclude-std3 and --exclude-bidi. For that we need solutions to
 # these issues:
 #
 # * https://github.com/whatwg/url/issues/341
 # * https://github.com/whatwg/url/issues/543
 # * https://github.com/whatwg/url/issues/733
 # * https://github.com/whatwg/url/issues/744
 #
 # Removal of --exclude-ipv4-like is a stretch goal also dependent upon those issues.

 import argparse
 import json
 import os
 import re
 import requests

 def get_IdnaTestV2_lines():
     """Return the lines of IdnaTestV2.txt, downloading the file first if it is missing.

     The file is looked up (and cached) in the directory containing this script.

     Returns:
         The list produced by readlines() on the UTF-8 decoded file.

     Raises:
         requests.HTTPError: If the download answers with an HTTP error status.
     """
     IdnaTestV2 = os.path.join(os.path.dirname(__file__), "IdnaTestV2.txt")
     if not os.path.exists(IdnaTestV2):
         # Fetch and validate the response before opening the cache file for writing, so a failed
         # download cannot leave an empty or bogus IdnaTestV2.txt behind for later runs.
         response = requests.get("https://unicode.org/Public/idna/latest/IdnaTestV2.txt")
         response.raise_for_status()
         with open(IdnaTestV2, "w", encoding="utf-8") as handle:
             handle.write(response.text)
     with open(IdnaTestV2, "r", encoding="utf-8") as handle:
         return handle.readlines()

 def remove_escapes(input):
     """Decode the escapes in input by parsing it as the contents of a JSON string literal."""
     quoted = "".join(("\"", input, "\""))
     return json.loads(quoted)

 def get_column_value(input, default = ""):
     """Normalize one column of an IdnaTestV2.txt line.

     An empty column yields default, a column holding just "" (two quote characters) yields the
     empty string, and anything else is returned with its escapes decoded.
     """
     if input in ("", "\"\""):
         # Only the literal "" notation means an empty string; a blank column means "use default"
         return default if input == "" else ""
     # Remove escapes (doesn't handle \x{XXXX} but those do not appear in the source)
     return remove_escapes(input)

 def ends_in_a_number(input):
     """Return True if the last label of input (ignoring one trailing dot) is all ASCII digits.

     Labels are separated by U+002E, U+FF0E, U+3002 or U+FF61.
     """
     # This method is not robust. It uses https://www.unicode.org/reports/tr46/#Notation but there
     # are likely other ways to end up with a dot, e.g., through decomposition or percent-decoding.
     # It also does not entirely match https://url.spec.whatwg.org/#ends-in-a-number-checker. It
     # appears to suffice for the tests in question though.
     parts = re.split(r"\u002E|\uFF0E|\u3002|\uFF61", input)
     # re.split() always returns at least one element, so indexing parts[-1] is safe here.
     if parts[-1] == "":
         if len(parts) == 1:
             return False
         # Ignore a single trailing separator
         parts.pop()
     last = parts[-1]
     # isascii() rules out non-ASCII digits that isdigit() alone would accept
     return last.isascii() and last.isdigit()

 def contains_bidi_status(statuses):
     """Return True if any entry of statuses is one of the bidi status codes B1 through B6."""
     bidi_codes = ("B1", "B2", "B3", "B4", "B5", "B6")
     return any(code in bidi_codes for code in statuses)

 def parse(lines, exclude_ipv4_like, exclude_std3, exclude_bidi):
     """Turn the lines of IdnaTestV2.txt into test entries plus the set of statuses seen.

     Args:
         lines: Iterable of lines from IdnaTestV2.txt.
         exclude_ipv4_like: Skip tests whose source ends_in_a_number().
         exclude_std3: Skip tests whose toUnicode value contains one of < > : / ? # or backslash.
         exclude_bidi: Skip tests that still carry a bidi status after ignored ones are dropped.

     Returns:
         A dict with "tests" (two header strings followed by one dict per kept test) and
         "unique_statuses" (sorted list of every status seen on lines that passed the IPv4-like
         and STD3 exclusions).

     Raises:
         IndexError: If a data line has fewer than five ";"-separated columns.
         AssertionError: If a non-empty status column does not start with "[".
     """
     # Main quest: the generated file starts with a banner and a record of the flags used.
     banner = "THIS IS A GENERATED FILE. PLEASE DO NOT MODIFY DIRECTLY. See ../tools/IdnaTestV2-parser.py instead."
     flags = f"--exclude-ipv4-like: {exclude_ipv4_like}; --exclude-std3: {exclude_std3}; --exclude-bidi: {exclude_bidi}"
     tests = [banner, flags]

     # Side quest: collect every status that shows up.
     seen_statuses = set()

     # The URL Standard has
     #
     # * UseSTD3ASCIIRules=false; however there are no tests marked U1 (some should be though)
     # * CheckHyphens=false; thus ignore V2, V3?
     # * VerifyDnsLength=false; thus ignore A4_1 and A4_2
     ignorable = ("A4_1", "A4_2", "U1", "V2", "V3")

     for raw_line in lines:
         record = raw_line.rstrip()

         # Comments and blank lines carry no test
         if record == "" or record.startswith("#"):
             continue

         # Only ToASCII with Transitional_Processing=false matters, so the columns of interest are
         # 1 (source), 4 (toAsciiN) and 5 (toAsciiNStatus). Column 2 (toUnicode) is kept to help
         # with the UseSTD3ASCIIRules exclusion.
         columns = [field.strip() for field in record.split(";")]

         # Column 2 defaults to column 1 when empty
         source = get_column_value(columns[0])
         to_unicode = get_column_value(columns[1], source)

         # Drop IPv4-like tests outright when asked to, rather than forcing them all to failure;
         # they were actually trying to test something else.
         if exclude_ipv4_like and ends_in_a_number(source):
             continue

         if exclude_std3 and re.search(r"\<|\>|\:|\/|\?|\#|\\", to_unicode):
             continue

         # Column 4 defaults to column 2 when empty
         to_ascii = get_column_value(columns[3], to_unicode)

         # Column 5 defaults to column 3 (toUnicodeStatus) when empty
         status_column = columns[4] or columns[2]

         all_statuses = []
         if status_column != "":
             assert status_column.startswith("[")
             all_statuses = [code.strip() for code in status_column[1:-1].split(",")]

         # Side quest time.
         seen_statuses.update(all_statuses)

         ignored_statuses = [code for code in all_statuses if code in ignorable]
         statuses = [code for code in all_statuses if code not in ignorable]

         if exclude_bidi and contains_bidi_status(statuses):
             continue

         # Any status that is not ignored means the expected result is failure
         test = { "input": source, "output": None if statuses else to_ascii }
         notes = statuses + [code + " (ignored)" for code in ignored_statuses]
         if notes:
             test["comment"] = "; ".join(notes)
         tests.append(test)

     return { "tests": tests, "unique_statuses": sorted(seen_statuses) }

 def to_json(data):
     """Write data as indented, key-sorted JSON to ../resources/IdnaTestV2.json.

     The path is resolved relative to the directory containing this script, and the output ends
     with a single trailing newline.

     Raises:
         ValueError: If data contains NaN or infinite floats (allow_nan=False).
     """
     # Serialize before opening the file: opening with "w" truncates it, so a serialization error
     # after that point would destroy the existing resource.
     serialized = json.dumps(data, sort_keys=True, allow_nan=False, indent=2, separators=(',', ': '))
     path = os.path.join(os.path.dirname(__file__), "../resources/IdnaTestV2.json")
     with open(path, "w", encoding="utf-8") as handle:
         handle.write(serialized)
         handle.write("\n")

 def main(argv=None):
     """Parse the command line and generate the JSON resource or print the unique statuses.

     Args:
         argv: Optional list of argument strings. When None, argparse falls back to sys.argv[1:].

     Without --generate or --statuses only the usage line is printed.
     """
     parser = argparse.ArgumentParser(epilog="Thanks for caring about IDNA!")
     parser.add_argument("--generate", action="store_true", help="Generate the JSON resource.")
     parser.add_argument("--exclude-ipv4-like", action="store_true", help="Exclude inputs that end with an ASCII digit label. (Not robust, but works for current input.)")
     parser.add_argument("--exclude-std3", action="store_true", help="Exclude tests impacted by UseSTD3ASCIIRules. (Not robust, but works for current input.)")
     parser.add_argument("--exclude-bidi", action="store_true", help="Exclude tests impacted by CheckBidi.")
     parser.add_argument("--statuses", action="store_true", help="Print the unique statuses in IdnaTestV2.txt.")
     args = parser.parse_args(argv)

     if not (args.generate or args.statuses):
         parser.print_usage()
         return

     output = parse(get_IdnaTestV2_lines(), args.exclude_ipv4_like, args.exclude_std3, args.exclude_bidi)
     # --statuses wins when both flags are given; otherwise --generate must have been set
     if args.statuses:
         print(output["unique_statuses"])
     else:
         to_json(output["tests"])

 # Only run when executed as a script, so loading this file as a module has no side effects
 if __name__ == "__main__":
     main()
	# This script can convert IdnaTestV2.txt to JSON, accounting for the requirements in the
	# URL Standard.
	#
	# The goal is to eventually remove --exclude-std3 and --exclude-bidi. For that we need solutions to
	# these issues:
	#
	# * https://github.com/whatwg/url/issues/341
	# * https://github.com/whatwg/url/issues/543
	# * https://github.com/whatwg/url/issues/733
	# * https://github.com/whatwg/url/issues/744
	#
	# Removal of --exclude-ipv4-like is a stretch goal also dependent upon those issues.

	import argparse
	import json
	import os
	import re
	import requests

	def get_IdnaTestV2_lines():
	    """Return the lines of IdnaTestV2.txt, downloading the file first if it is missing.

	    The file is looked up (and cached) in the directory containing this script.

	    Returns:
	        The list produced by readlines() on the UTF-8 decoded file.

	    Raises:
	        requests.HTTPError: If the download answers with an HTTP error status.
	    """
	    IdnaTestV2 = os.path.join(os.path.dirname(__file__), "IdnaTestV2.txt")
	    if not os.path.exists(IdnaTestV2):
	        # Fetch and validate the response before opening the cache file for writing, so a
	        # failed download cannot leave an empty or bogus IdnaTestV2.txt behind.
	        response = requests.get("https://unicode.org/Public/idna/latest/IdnaTestV2.txt")
	        response.raise_for_status()
	        with open(IdnaTestV2, "w", encoding="utf-8") as handle:
	            handle.write(response.text)
	    with open(IdnaTestV2, "r", encoding="utf-8") as handle:
	        return handle.readlines()

	def remove_escapes(input):
	    """Decode the escapes in input by parsing it as the contents of a JSON string literal."""
	    return json.loads("\"" + input + "\"")

	def get_column_value(input, default = ""):
	    """Normalize one column of an IdnaTestV2.txt line.

	    An empty column yields default, a column holding just "" (two quote characters) yields
	    the empty string, and anything else is returned with its escapes decoded.
	    """
	    if input == "":
	        return default
	    # "" means an empty string
	    if input == "\"\"":
	        return ""
	    # Remove escapes (doesn't handle \x{XXXX} but those do not appear in the source)
	    return remove_escapes(input)

	def ends_in_a_number(input):
	    """Return True if the last label of input (ignoring one trailing dot) is all ASCII digits.

	    Labels are separated by U+002E, U+FF0E, U+3002 or U+FF61.
	    """
	    # This method is not robust. It uses https://www.unicode.org/reports/tr46/#Notation but
	    # there are likely other ways to end up with a dot, e.g., through decomposition or
	    # percent-decoding. It also does not entirely match
	    # https://url.spec.whatwg.org/#ends-in-a-number-checker. It appears to suffice for the
	    # tests in question though.
	    #
	    # The separators must be joined with a bare "|" (alternation); an escaped "\|" would only
	    # match the four characters in sequence with literal pipes between them.
	    parts = re.split(r"\u002E|\uFF0E|\u3002|\uFF61", input)
	    # re.split() always returns at least one element, so indexing parts[-1] is safe here.
	    if parts[-1] == "":
	        if len(parts) == 1:
	            return False
	        # Ignore a single trailing separator
	        parts.pop()
	    last = parts[-1]
	    # isascii() rules out non-ASCII digits that isdigit() alone would accept
	    return last.isascii() and last.isdigit()

	def contains_bidi_status(statuses):
	    """Return True if any entry of statuses is one of the bidi status codes B1 through B6."""
	    bidi_codes = ("B1", "B2", "B3", "B4", "B5", "B6")
	    return any(status in bidi_codes for status in statuses)

	def parse(lines, exclude_ipv4_like, exclude_std3, exclude_bidi):
	    """Turn the lines of IdnaTestV2.txt into test entries plus the set of statuses seen.

	    Args:
	        lines: Iterable of lines from IdnaTestV2.txt.
	        exclude_ipv4_like: Skip tests whose source ends_in_a_number().
	        exclude_std3: Skip tests whose toUnicode value contains one of < > : / ? # or
	            backslash.
	        exclude_bidi: Skip tests that still carry a bidi status after ignored ones are
	            dropped.

	    Returns:
	        A dict with "tests" (two header strings followed by one dict per kept test) and
	        "unique_statuses" (sorted list of every status seen on lines that passed the
	        IPv4-like and STD3 exclusions).

	    Raises:
	        IndexError: If a data line has fewer than five ";"-separated columns.
	        AssertionError: If a non-empty status column does not start with "[".
	    """
	    # Main quest.
	    output = ["THIS IS A GENERATED FILE. PLEASE DO NOT MODIFY DIRECTLY. See ../tools/IdnaTestV2-parser.py instead."]
	    output.append(f"--exclude-ipv4-like: {exclude_ipv4_like}; --exclude-std3: {exclude_std3}; --exclude-bidi: {exclude_bidi}")

	    # Side quest.
	    unique_statuses = set()

	    # The URL Standard has
	    #
	    # * UseSTD3ASCIIRules=false; however there are no tests marked U1 (some should be though)
	    # * CheckHyphens=false; thus ignore V2, V3?
	    # * VerifyDnsLength=false; thus ignore A4_1 and A4_2
	    ignorable = ("A4_1", "A4_2", "U1", "V2", "V3")

	    for line in lines:
	        # Remove newlines
	        line = line.rstrip()

	        # Remove lines that are comments or empty
	        if line.startswith("#") or line == "":
	            continue

	        # Normalize columns
	        #
	        # Since we are only interested in ToASCII and enforce Transitional_Processing=false
	        # we care about the following columns:
	        #
	        # * Column 1 (source)
	        # * Column 4 (toAsciiN)
	        # * Column 5 (toAsciiNStatus)
	        #
	        # We also store Column 2 (toUnicode) to help with UseSTD3ASCIIRules exclusion.
	        columns = [column.strip() for column in line.split(";")]

	        # Column 1 (source) and Column 2 (toUnicode; if empty, Column 1 (source))
	        source = get_column_value(columns[0])
	        to_unicode = get_column_value(columns[1], source)

	        # Immediately exclude IPv4-like tests when desired. While we could force all their
	        # expectations to be failure instead, it's not clear we need that many additional
	        # tests that were actually trying to test something else.
	        if exclude_ipv4_like and ends_in_a_number(source):
	            continue

	        # The alternatives must be separated by a bare "|"; an escaped "\|" would require the
	        # whole literal sequence to appear and so never exclude anything in practice.
	        if exclude_std3 and re.search(r"\<|\>|\:|\/|\?|\#|\\", to_unicode):
	            continue

	        # Column 4 (toAsciiN; if empty, use Column 2 (toUnicode))
	        to_ascii = get_column_value(columns[3], to_unicode)

	        # Column 5 (toAsciiNStatus; if empty, use Column 3 (toUnicodeStatus))
	        temp_statuses = columns[4]
	        if temp_statuses == "":
	            temp_statuses = columns[2]

	        all_statuses = []
	        if temp_statuses != "":
	            assert temp_statuses.startswith("[")
	            all_statuses = [status.strip() for status in temp_statuses[1:-1].split(",")]

	        # Side quest time.
	        unique_statuses.update(all_statuses)

	        ignored_statuses = [status for status in all_statuses if status in ignorable]
	        statuses = [status for status in all_statuses if status not in ignorable]

	        if exclude_bidi and contains_bidi_status(statuses):
	            continue

	        # Any status that is not ignored means the expected result is failure
	        if statuses:
	            to_ascii = None

	        test = { "input": source, "output": to_ascii }
	        notes = statuses + [status + " (ignored)" for status in ignored_statuses]
	        if notes:
	            test["comment"] = "; ".join(notes)
	        output.append(test)

	    return { "tests": output, "unique_statuses": sorted(unique_statuses) }

	def to_json(data):
	    """Write data as indented, key-sorted JSON to ../resources/IdnaTestV2.json.

	    The path is resolved relative to the directory containing this script, and the output
	    ends with a single trailing newline.

	    Raises:
	        ValueError: If data contains NaN or infinite floats (allow_nan=False).
	    """
	    # Serialize before opening the file: opening with "w" truncates it, so a serialization
	    # error after that point would destroy the existing resource.
	    serialized = json.dumps(data, sort_keys=True, allow_nan=False, indent=2, separators=(',', ': '))
	    path = os.path.join(os.path.dirname(__file__), "../resources/IdnaTestV2.json")
	    with open(path, "w", encoding="utf-8") as handle:
	        handle.write(serialized)
	        handle.write("\n")

	def main(argv=None):
	    """Parse the command line and generate the JSON resource or print the unique statuses.

	    Args:
	        argv: Optional list of argument strings. When None, argparse falls back to
	            sys.argv[1:].

	    Without --generate or --statuses only the usage line is printed.
	    """
	    parser = argparse.ArgumentParser(epilog="Thanks for caring about IDNA!")
	    parser.add_argument("--generate", action="store_true", help="Generate the JSON resource.")
	    parser.add_argument("--exclude-ipv4-like", action="store_true", help="Exclude inputs that end with an ASCII digit label. (Not robust, but works for current input.)")
	    parser.add_argument("--exclude-std3", action="store_true", help="Exclude tests impacted by UseSTD3ASCIIRules. (Not robust, but works for current input.)")
	    parser.add_argument("--exclude-bidi", action="store_true", help="Exclude tests impacted by CheckBidi.")
	    parser.add_argument("--statuses", action="store_true", help="Print the unique statuses in IdnaTestV2.txt.")
	    args = parser.parse_args(argv)

	    if args.generate or args.statuses:
	        output = parse(get_IdnaTestV2_lines(), args.exclude_ipv4_like, args.exclude_std3, args.exclude_bidi)
	        # --statuses wins when both flags are given; otherwise --generate must have been set
	        if args.statuses:
	            print(output["unique_statuses"])
	        else:
	            to_json(output["tests"])
	    else:
	        parser.print_usage()

	# Only run when executed as a script, so loading this file as a module has no side effects
	if __name__ == "__main__":
	    main()