Source/JavaScriptCore/parser/generateLexerUnicodePropertyTables.py - external/github.com/WebKit/webkit - Git at Google

 #!/usr/bin/env python3
 # Copyright (C) 2026 Anthropic PBC.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 #
 # 1.  Redistributions of source code must retain the above copyright
 #     notice, this list of conditions and the following disclaimer.
 # 2.  Redistributions in binary form must reproduce the above copyright
 #     notice, this list of conditions and the following disclaimer in the
 #     documentation and/or other materials provided with the distribution.
 #
 # THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 """
 Generate LexerUnicodePropertyTables.h from UnicodeData.txt

 This script extracts Non-Latin1 Zs (Space_Separator) category characters
 from UnicodeData.txt and generates an inline function for ECMAScript WhiteSpace detection.

 Usage:
     python3 generateLexerUnicodePropertyTables.py <UnicodeData.txt> <output.h>
 """

 import sys
 import os

 # Prologue of the generated C++ header. It is expanded with str.format(), so
 # literal braces are doubled and {script} is the only substitution field.
 # The function is emitted as `inline` because it is defined in a header file:
 # a non-inline definition becomes a duplicate symbol as soon as two
 # translation units include the generated file.
 header = """/*
  * Copyright (C) 2026 Anthropic PBC.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1.  Redistributions of source code must retain the above copyright
  *     notice, this list of conditions and the following disclaimer.
  * 2.  Redistributions in binary form must reproduce the above copyright
  *     notice, this list of conditions and the following disclaimer in the
  *     documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 // DO NOT EDIT! - This file was generated by {script}

 #pragma once

 namespace JSC {{

 // ECMAScript WhiteSpace: Non-Latin1 Zs (Space_Separator) characters + BOM
 // Generated from UnicodeData.txt
 // https://tc39.es/ecma262/#sec-white-space
 // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3AGeneral_Category%3DSpace_Separator%3A%5D
 inline bool isNonLatin1WhiteSpace(char16_t ch)
 {{
     // U+FEFF BOM (ZERO WIDTH NO-BREAK SPACE) - ECMAScript WhiteSpace, not Zs
     if (ch == 0xFEFF)
         return true;
 """

 # Epilogue of the generated C++ header: the fall-through result plus the
 # closing braces of the function and the namespace. It is written verbatim
 # (never passed through str.format()), which is why its braces are not doubled.
 footer = """    return false;
 }

 } // namespace JSC
 """


 def parse_unicode_data(unicode_data_path):
     """Collect the non-Latin1 Space_Separator (Zs) entries of UnicodeData.txt.

     Each record is a ';'-separated line whose first three fields are read as
     the hexadecimal code point, the character name and the general category.
     Anything after a '#' is discarded; empty lines and lines with fewer than
     three fields are skipped.

     Args:
         unicode_data_path: Path of the UnicodeData.txt file to read.

     Returns:
         A list of (code_point, name) tuples, in file order, for every record
         whose category is "Zs" and whose code point is greater than 0xFF.

     Raises:
         OSError: If the file cannot be opened.
         ValueError: If the first field of a record is not a hexadecimal
             number, or the file is not valid UTF-8.
     """
     zs_code_points = []  # (codePoint, name)

     # Decode explicitly instead of relying on the locale's default encoding,
     # so parsing behaves the same on every platform.
     with open(unicode_data_path, 'r', encoding='utf-8') as f:
         for raw_line in f:
             record = raw_line.split('#', 1)[0].rstrip()
             if not record:
                 continue

             fields = record.split(';')
             if len(fields) < 3:
                 continue

             code_point = int(fields[0], 16)

             # Only collect Non-Latin1 Zs (Space_Separator) characters
             if fields[2] == "Zs" and code_point > 0xFF:
                 zs_code_points.append((code_point, fields[1]))

     return zs_code_points


 def group_code_points(code_points):
     """Split code points into isolated values and runs of consecutive values.

     The input is left untouched: grouping works on a sorted copy, so callers
     keep their original ordering and any iterable of entries is accepted.

     Args:
         code_points: Iterable of (code_point, name) entries, in any order.

     Returns:
         A (singles, ranges) tuple. `singles` holds (code_point, name) for
         every code point with no adjacent neighbour; `ranges` holds
         (start, end, start_name, end_name) for every inclusive run of two or
         more consecutive code points. Both lists are in ascending order.
     """
     ordered = sorted(code_points, key=lambda entry: entry[0])

     singles = []
     ranges = []
     i = 0

     while i < len(ordered):
         start = end = ordered[i][0]
         start_name = end_name = ordered[i][1]

         # Extend the run for as long as the next entry is exactly one above.
         while i + 1 < len(ordered) and ordered[i + 1][0] == end + 1:
             i += 1
             end = ordered[i][0]
             end_name = ordered[i][1]

         if start == end:
             singles.append((start, start_name))
         else:
             ranges.append((start, end, start_name, end_name))
         i += 1

     return singles, ranges


 def generate_output(output_path, singles, ranges):
     """Write the generated C++ header to output_path.

     The file is the `header` template (with this script's file name
     substituted), followed by one `if (...) return true;` statement per
     entry in ascending code point order, followed by the `footer` text.

     Args:
         output_path: Destination file; it is opened for writing.
         singles: (code_point, name) tuples for isolated code points.
         ranges: (start, end, start_name, end_name) tuples for inclusive runs.
     """
     generator = os.path.basename(__file__)

     with open(output_path, 'w') as out:
         out.write(header.format(script=generator))

         # Treat a single as a one-element range so both kinds can be emitted
         # from one list sorted by starting code point.
         merged = [(cp, cp, name, name) for (cp, name) in singles]
         merged.extend((lo, hi, lo_name, hi_name) for (lo, hi, lo_name, hi_name) in ranges)
         merged.sort(key=lambda item: item[0])

         for (lo, hi, lo_name, hi_name) in merged:
             if lo != hi:
                 condition = "    if (ch >= 0x{:04X} && ch <= 0x{:04X})\n".format(lo, hi)
                 action = "        return true; // {} ~ {}\n".format(lo_name, hi_name)
             else:
                 condition = "    if (ch == 0x{:04X})\n".format(lo)
                 action = "        return true; // {}\n".format(lo_name)
             out.write(condition)
             out.write(action)

         out.write(footer)


 def main():
     """Command-line entry point: read UnicodeData.txt and write the header.

     Expects exactly two arguments, the UnicodeData.txt path and the output
     path. Prints a message to stderr and exits with status 1 when the
     argument count is wrong or the input path does not exist.
     """
     arguments = sys.argv[1:]
     if len(arguments) != 2:
         print("Usage: {} <UnicodeData.txt> <output.h>".format(sys.argv[0]), file=sys.stderr)
         sys.exit(1)

     unicode_data_path, output_path = arguments

     if not os.path.exists(unicode_data_path):
         print("Error: UnicodeData.txt not found: {}".format(unicode_data_path), file=sys.stderr)
         sys.exit(1)

     # Pipeline: parse -> group into singles/ranges -> emit the C++ header.
     singles, ranges = group_code_points(parse_unicode_data(unicode_data_path))
     generate_output(output_path, singles, ranges)


 # Run the generator only when this file is executed as a script, not when it
 # is imported as a module.
 if __name__ == "__main__":
     main()
	#!/usr/bin/env python3
	# Copyright (C) 2026 Anthropic PBC.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions
	# are met:
	#
	# 1. Redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer.
	# 2. Redistributions in binary form must reproduce the above copyright
	# notice, this list of conditions and the following disclaimer in the
	# documentation and/or other materials provided with the distribution.
	#
	# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
	# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
	# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	"""
	Generate LexerUnicodePropertyTables.h from UnicodeData.txt

	This script extracts Non-Latin1 Zs (Space_Separator) category characters
	from UnicodeData.txt and generates an inline function for ECMAScript WhiteSpace detection.

	Usage:
	python3 generateLexerUnicodePropertyTables.py <UnicodeData.txt> <output.h>
	"""

	import sys
	import os

	# Prologue of the generated C++ header. It is expanded with str.format(), so
	# literal braces are doubled and {script} is the only substitution field.
	# The function is emitted as `inline` because it is defined in a header file:
	# a non-inline definition becomes a duplicate symbol as soon as two
	# translation units include the generated file.
	header = """/*
	* Copyright (C) 2026 Anthropic PBC.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	*
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	// DO NOT EDIT! - This file was generated by {script}

	#pragma once

	namespace JSC {{

	// ECMAScript WhiteSpace: Non-Latin1 Zs (Space_Separator) characters + BOM
	// Generated from UnicodeData.txt
	// https://tc39.es/ecma262/#sec-white-space
	// https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3AGeneral_Category%3DSpace_Separator%3A%5D
	inline bool isNonLatin1WhiteSpace(char16_t ch)
	{{
	// U+FEFF BOM (ZERO WIDTH NO-BREAK SPACE) - ECMAScript WhiteSpace, not Zs
	if (ch == 0xFEFF)
	return true;
	"""

	# Epilogue of the generated C++ header: the fall-through result plus the
	# closing braces of the function and the namespace. It is written verbatim
	# (never passed through str.format()), which is why its braces are not doubled.
	footer = """ return false;
	}

	} // namespace JSC
	"""


	def parse_unicode_data(unicode_data_path):
	    """Collect the non-Latin1 Space_Separator (Zs) entries of UnicodeData.txt.

	    Each record is a ';'-separated line whose first three fields are read as
	    the hexadecimal code point, the character name and the general category.
	    Anything after a '#' is discarded; empty lines and lines with fewer than
	    three fields are skipped.

	    Args:
	        unicode_data_path: Path of the UnicodeData.txt file to read.

	    Returns:
	        A list of (code_point, name) tuples, in file order, for every record
	        whose category is "Zs" and whose code point is greater than 0xFF.

	    Raises:
	        OSError: If the file cannot be opened.
	        ValueError: If the first field of a record is not a hexadecimal
	            number, or the file is not valid UTF-8.
	    """
	    zs_code_points = []  # (codePoint, name)

	    # Decode explicitly instead of relying on the locale's default encoding,
	    # so parsing behaves the same on every platform.
	    with open(unicode_data_path, 'r', encoding='utf-8') as f:
	        for raw_line in f:
	            record = raw_line.split('#', 1)[0].rstrip()
	            if not record:
	                continue

	            fields = record.split(';')
	            if len(fields) < 3:
	                continue

	            code_point = int(fields[0], 16)

	            # Only collect Non-Latin1 Zs (Space_Separator) characters
	            if fields[2] == "Zs" and code_point > 0xFF:
	                zs_code_points.append((code_point, fields[1]))

	    return zs_code_points


	def group_code_points(code_points):
	    """Split code points into isolated values and runs of consecutive values.

	    The input is left untouched: grouping works on a sorted copy, so callers
	    keep their original ordering and any iterable of entries is accepted.

	    Args:
	        code_points: Iterable of (code_point, name) entries, in any order.

	    Returns:
	        A (singles, ranges) tuple. `singles` holds (code_point, name) for
	        every code point with no adjacent neighbour; `ranges` holds
	        (start, end, start_name, end_name) for every inclusive run of two or
	        more consecutive code points. Both lists are in ascending order.
	    """
	    ordered = sorted(code_points, key=lambda entry: entry[0])

	    singles = []
	    ranges = []
	    i = 0

	    while i < len(ordered):
	        start = end = ordered[i][0]
	        start_name = end_name = ordered[i][1]

	        # Extend the run for as long as the next entry is exactly one above.
	        while i + 1 < len(ordered) and ordered[i + 1][0] == end + 1:
	            i += 1
	            end = ordered[i][0]
	            end_name = ordered[i][1]

	        if start == end:
	            singles.append((start, start_name))
	        else:
	            ranges.append((start, end, start_name, end_name))
	        i += 1

	    return singles, ranges


	def generate_output(output_path, singles, ranges):
	    """Write the generated C++ header to output_path.

	    The file is the `header` template (with this script's file name
	    substituted), followed by one `if (...) return true;` statement per
	    entry in ascending code point order, followed by the `footer` text.

	    Args:
	        output_path: Destination file; it is opened for writing.
	        singles: (code_point, name) tuples for isolated code points.
	        ranges: (start, end, start_name, end_name) tuples for inclusive runs.
	    """
	    script_name = os.path.basename(__file__)

	    with open(output_path, 'w') as f:
	        f.write(header.format(script=script_name))

	        # Merge singles and ranges, then output in code point order
	        entries = []
	        for (start, name) in singles:
	            entries.append((start, start, name, name))
	        for (start, end, start_name, end_name) in ranges:
	            entries.append((start, end, start_name, end_name))
	        entries.sort(key=lambda x: x[0])

	        # NOTE(review): the emitted statements start with a single space, which
	        # looks like collapsed indentation - confirm the intended C++ layout.
	        for (start, end, start_name, end_name) in entries:
	            if start == end:
	                f.write(" if (ch == 0x{:04X})\n".format(start))
	                f.write(" return true; // {}\n".format(start_name))
	            else:
	                f.write(" if (ch >= 0x{:04X} && ch <= 0x{:04X})\n".format(start, end))
	                f.write(" return true; // {} ~ {}\n".format(start_name, end_name))

	        f.write(footer)


	def main():
	    """Command-line entry point: read UnicodeData.txt and write the header.

	    Expects exactly two arguments, the UnicodeData.txt path and the output
	    path. Prints a message to stderr and exits with status 1 when the
	    argument count is wrong or the input path does not exist.
	    """
	    if len(sys.argv) != 3:
	        print("Usage: {} <UnicodeData.txt> <output.h>".format(sys.argv[0]), file=sys.stderr)
	        sys.exit(1)

	    unicode_data_path = sys.argv[1]
	    output_path = sys.argv[2]

	    if not os.path.exists(unicode_data_path):
	        print("Error: UnicodeData.txt not found: {}".format(unicode_data_path), file=sys.stderr)
	        sys.exit(1)

	    # Pipeline: parse -> group into singles/ranges -> emit the C++ header.
	    code_points = parse_unicode_data(unicode_data_path)
	    singles, ranges = group_code_points(code_points)
	    generate_output(output_path, singles, ranges)


	# Run the generator only when this file is executed as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
	    main()