blob: fa3140d2756fb3e9f43fc3d27a74b806892f9798 [file] [log] [blame]
#!/usr/bin/env python3
# Copyright (C) 2026 Anthropic PBC.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Generate LexerUnicodePropertyTables.h from UnicodeData.txt.

This script extracts the non-Latin-1 Zs (Space_Separator) category characters
from UnicodeData.txt and generates a C++ function, isNonLatin1WhiteSpace(),
for ECMAScript WhiteSpace detection. U+FEFF (BOM) is not a Zs character; its
check comes from the fixed header template rather than from the data file.

Usage:
    python3 generateLexerUnicodePropertyTables.py <UnicodeData.txt> <output.h>
"""
import sys
import os
# Fixed leading part of the generated C++ header: license comment, the
# "DO NOT EDIT" banner, the opening of namespace JSC, and the start of
# isNonLatin1WhiteSpace() including the hard-coded U+FEFF check.
#
# This text is passed through str.format(script=...), so literal C++ braces
# are written doubled ("{{" / "}}") and "{script}" is the only placeholder.
#
# NOTE(review): the emitted function definition carries no `inline`/`static`
# specifier; confirm the generated header is only included from a single
# translation unit, otherwise this would produce duplicate definitions.
header = """/*
* Copyright (C) 2026 Anthropic PBC.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// DO NOT EDIT! - This file was generated by {script}
#pragma once
namespace JSC {{
// ECMAScript WhiteSpace: Non-Latin1 Zs (Space_Separator) characters + BOM
// Generated from UnicodeData.txt
// https://tc39.es/ecma262/#sec-white-space
// https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3AGeneral_Category%3DSpace_Separator%3A%5D
bool isNonLatin1WhiteSpace(char16_t ch)
{{
// U+FEFF BOM (ZERO WIDTH NO-BREAK SPACE) - ECMAScript WhiteSpace, not Zs
if (ch == 0xFEFF)
return true;
"""
# Fixed trailing part of the generated C++ header: the fall-through
# `return false;`, the end of the function body, and the end of namespace JSC.
# Unlike `header`, this text is written verbatim (never passed through
# str.format), which is why its braces are NOT doubled.
footer = """ return false;
}
} // namespace JSC
"""
def parse_unicode_data(unicode_data_path):
    """Parse UnicodeData.txt and extract Non-Latin1 Zs characters.

    Each non-empty line is expected to hold semicolon-separated fields, of
    which the first three are used: the code point in hexadecimal, the
    character name, and the general category. Anything after a '#' on a line
    is discarded, and lines with fewer than three fields are skipped.

    Args:
        unicode_data_path: Path of the UnicodeData.txt-style file to read.

    Returns:
        A list of (code_point, name) tuples, in file order, for every entry
        whose general category is "Zs" and whose code point is above 0xFF.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If the first field of a data line is not valid hex.
    """
    zs_code_points = []  # (codePoint, name)
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, so the result is the same on every build machine.
    with open(unicode_data_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.split('#', 1)[0].strip()
            if not line:
                continue
            # Trim each field so padding around ';' cannot make the
            # category comparison below miss a Zs entry.
            fields = [field.strip() for field in line.split(';')]
            if len(fields) < 3:
                continue
            code_point = int(fields[0], 16)
            name = fields[1]
            category = fields[2]
            # Only collect Non-Latin1 Zs (Space_Separator) characters
            if category == "Zs" and code_point > 0xFF:
                zs_code_points.append((code_point, name))
    return zs_code_points
def group_code_points(code_points):
    """Group consecutive code points into ranges.

    Args:
        code_points: Sequence of (code_point, name) entries in any order.
            The sequence itself is left unmodified.

    Returns:
        A (singles, ranges) pair, each ordered by ascending code point:
          * singles: list of (code_point, name) for isolated code points.
          * ranges: list of (start, end, start_name, end_name) for each run
            of two or more consecutive code points (end is inclusive).
        Both lists are empty when code_points is empty.
    """
    if not code_points:
        return [], []
    # Sort a copy: callers should not see their list reordered as a side
    # effect of grouping.
    ordered = sorted(code_points, key=lambda entry: entry[0])
    runs = []  # each run is [start, end, start_name, end_name]
    for entry in ordered:
        code_point, name = entry[0], entry[1]
        if runs and code_point == runs[-1][1] + 1:
            # Extends the current run by exactly one code point.
            runs[-1][1] = code_point
            runs[-1][3] = name
        else:
            runs.append([code_point, code_point, name, name])
    singles = [(start, start_name)
               for start, end, start_name, _ in runs if start == end]
    ranges = [(start, end, start_name, end_name)
              for start, end, start_name, end_name in runs if start != end]
    return singles, ranges
def generate_output(output_path, singles, ranges):
    """Generate the output header file.

    Writes, in order: the module-level `header` template (with this script's
    file name substituted for {script}), one two-line `if (...) return true;`
    check per entry in ascending code point order, and the module-level
    `footer`.

    Args:
        output_path: Path of the header file to create or overwrite.
        singles: Iterable of (code_point, name) tuples.
        ranges: Iterable of (start, end, start_name, end_name) tuples, with
            `end` inclusive.
    """
    generator_name = os.path.basename(__file__)
    with open(output_path, 'w') as out:
        out.write(header.format(script=generator_name))

        # Bring both inputs to one (lo, hi, lo_name, hi_name) shape so they
        # can be emitted together in code point order.
        spans = [(cp, cp, label, label) for (cp, label) in singles]
        spans += [(lo, hi, lo_name, hi_name)
                  for (lo, hi, lo_name, hi_name) in ranges]
        spans.sort(key=lambda span: span[0])

        for lo, hi, lo_name, hi_name in spans:
            if lo != hi:
                check = (
                    " if (ch >= 0x{:04X} && ch <= 0x{:04X})\n".format(lo, hi),
                    " return true; // {} ~ {}\n".format(lo_name, hi_name),
                )
            else:
                check = (
                    " if (ch == 0x{:04X})\n".format(lo),
                    " return true; // {}\n".format(lo_name),
                )
            out.writelines(check)

        out.write(footer)
def main():
    """Command-line entry point: read UnicodeData.txt and write the header.

    Expects exactly two arguments, the input data file and the output header
    path. Prints a message to stderr and exits with status 1 when the
    argument count is wrong or the input path does not exist.
    """
    argv = sys.argv
    if len(argv) != 3:
        print("Usage: {} <UnicodeData.txt> <output.h>".format(argv[0]), file=sys.stderr)
        sys.exit(1)

    unicode_data_path, output_path = argv[1], argv[2]
    input_present = os.path.exists(unicode_data_path)
    if not input_present:
        print("Error: UnicodeData.txt not found: {}".format(unicode_data_path), file=sys.stderr)
        sys.exit(1)

    # Pipeline: parse -> group into singles/ranges -> emit the header.
    singles, ranges = group_code_points(parse_unicode_data(unicode_data_path))
    generate_output(output_path, singles, ranges)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()