| #!/usr/bin/env python3 |
| # Copyright (C) 2026 Anthropic PBC. |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions |
| # are met: |
| # |
| # 1. Redistributions of source code must retain the above copyright |
| # notice, this list of conditions and the following disclaimer. |
| # 2. Redistributions in binary form must reproduce the above copyright |
| # notice, this list of conditions and the following disclaimer in the |
| # documentation and/or other materials provided with the distribution. |
| # |
| # THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY |
| # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| # DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY |
| # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| """ |
| Generate LexerUnicodePropertyTables.h from UnicodeData.txt |
| |
| This script extracts Non-Latin1 Zs (Space_Separator) category characters |
| from UnicodeData.txt and generates an inline function for ECMAScript WhiteSpace detection. |
| |
| Usage: |
| python3 generateLexerUnicodePropertyTables.py <UnicodeData.txt> <output.h> |
| """ |
| |
| import sys |
| import os |
| |
# Template for the top of the generated C++ header. generate_output() renders
# it with str.format(): "{script}" is replaced by the generator's file name,
# and the doubled braces ("{{" / "}}") are format escapes that come out as
# literal C++ braces. The template opens isNonLatin1WhiteSpace() and emits the
# BOM check; the per-code-point checks and the closing footer are appended
# after it.
#
# The function is declared "inline" because it is defined in a header:
# without it, including the header from more than one translation unit
# produces duplicate definitions of the function at link time.
header = """/*
 * Copyright (C) 2026 Anthropic PBC.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// DO NOT EDIT! - This file was generated by {script}

#pragma once

namespace JSC {{

// ECMAScript WhiteSpace: Non-Latin1 Zs (Space_Separator) characters + BOM
// Generated from UnicodeData.txt
// https://tc39.es/ecma262/#sec-white-space
// https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3AGeneral_Category%3DSpace_Separator%3A%5D
inline bool isNonLatin1WhiteSpace(char16_t ch)
{{
    // U+FEFF BOM (ZERO WIDTH NO-BREAK SPACE) - ECMAScript WhiteSpace, not Zs
    if (ch == 0xFEFF)
        return true;
"""
| |
# Closing part of the generated header: the fall-through "return false;",
# the end of the function body, and the end of the JSC namespace. It is
# written out verbatim (it is not passed through str.format()), so its braces
# are single rather than doubled.
footer = (
    "    return false;\n"
    "}\n"
    "\n"
    "} // namespace JSC\n"
)
| |
| |
def parse_unicode_data(unicode_data_path):
    """Collect the Non-Latin1 Space_Separator entries of a UnicodeData.txt file.

    Each line is treated as a ';'-separated record whose first three fields
    are the hexadecimal code point, the character name and the general
    category. Anything from a '#' to the end of a line is discarded, and
    blank lines or lines with fewer than three fields are skipped.

    Args:
        unicode_data_path: Path of the UnicodeData.txt file to read.

    Returns:
        A list of (code_point, name) tuples, in file order, for every record
        whose category is "Zs" and whose code point is above 0xFF.

    Raises:
        ValueError: If the first field of a record is not a hexadecimal
            number.
    """
    collected = []

    with open(unicode_data_path, 'r') as data_file:
        for raw_line in data_file:
            # Keep only the text before any '#', minus trailing whitespace.
            record = raw_line.partition('#')[0].rstrip()
            if not record:
                continue

            columns = record.split(';')
            if len(columns) < 3:
                continue

            # Parse the code point before filtering so that a malformed
            # record is reported regardless of its category.
            value = int(columns[0], 16)
            label = columns[1]
            general_category = columns[2]

            # Latin1 (<= 0xFF) and every category other than Zs are ignored.
            if general_category != "Zs" or value <= 0xFF:
                continue
            collected.append((value, label))

    return collected
| |
| |
def group_code_points(code_points):
    """Group consecutive code points into ranges.

    Args:
        code_points: Iterable of (code_point, name) tuples, in any order.
            The input is not modified.

    Returns:
        A (singles, ranges) tuple. singles is a list of (code_point, name)
        tuples for code points that have no adjacent neighbour. ranges is a
        list of (start, end, start_name, end_name) tuples for runs of two or
        more consecutive code points, with inclusive bounds. Both lists are
        in ascending code point order; both are empty for empty input.
    """
    # Sort a copy rather than calling list.sort() on the argument, so the
    # caller's list keeps its original order.
    ordered = sorted(code_points, key=lambda entry: entry[0])

    # Each run is [start, end, start_name, end_name]; it is kept as a list
    # so the end of the current run can be extended in place.
    runs = []
    for entry in ordered:
        code_point, name = entry[0], entry[1]
        if runs and code_point == runs[-1][1] + 1:
            runs[-1][1] = code_point
            runs[-1][3] = name
        else:
            runs.append([code_point, code_point, name, name])

    singles = [(start, start_name) for start, end, start_name, _ in runs if start == end]
    ranges = [
        (start, end, start_name, end_name)
        for start, end, start_name, end_name in runs
        if start != end
    ]
    return singles, ranges
| |
| |
def generate_output(output_path, singles, ranges):
    """Write the generated C++ header to output_path.

    The file consists of the module-level header template with this script's
    file name filled in, one "if (...) return true;" check per entry, and the
    module-level footer. Checks are emitted in ascending order of their first
    code point, each followed by the character name(s) as a trailing comment.

    Args:
        output_path: Path of the file to create or overwrite.
        singles: Iterable of (code_point, name) tuples for isolated code
            points.
        ranges: Iterable of (start, end, start_name, end_name) tuples for
            inclusive runs of consecutive code points.
    """
    generator_name = os.path.basename(__file__)

    with open(output_path, 'w') as out:
        out.write(header.format(script=generator_name))

        # Bring both inputs to one (first, last, first_name, last_name) shape
        # so a single sorted pass can emit them interleaved.
        spans = [(code_point, code_point, label, label) for (code_point, label) in singles]
        spans += [
            (first, last, first_label, last_label)
            for (first, last, first_label, last_label) in ranges
        ]
        spans.sort(key=lambda span: span[0])

        for first, last, first_label, last_label in spans:
            if first != last:
                out.write(
                    "    if (ch >= 0x{:04X} && ch <= 0x{:04X})\n"
                    "        return true; // {} ~ {}\n".format(first, last, first_label, last_label)
                )
                continue
            # A span covering one code point is an equality test.
            out.write(
                "    if (ch == 0x{:04X})\n"
                "        return true; // {}\n".format(first, first_label)
            )

        out.write(footer)
| |
| |
def main():
    """Command-line entry point: read UnicodeData.txt and write the header.

    Expects exactly two command-line arguments, the path of UnicodeData.txt
    and the path of the header to generate. Prints a message to stderr and
    exits with status 1 when the argument count is wrong or the input path
    does not exist.
    """
    arguments = sys.argv
    if len(arguments) != 3:
        print("Usage: {} <UnicodeData.txt> <output.h>".format(arguments[0]), file=sys.stderr)
        sys.exit(1)

    unicode_data_path, output_path = arguments[1:]

    if not os.path.exists(unicode_data_path):
        print("Error: UnicodeData.txt not found: {}".format(unicode_data_path), file=sys.stderr)
        sys.exit(1)

    # Pipeline: parse -> group into singles/ranges -> emit the header file.
    singles, ranges = group_code_points(parse_unicode_data(unicode_data_path))
    generate_output(output_path, singles, ranges)
| |
| |
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()