#!/usr/bin/env python
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##

# The code is based on
# https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_test_data_gen.py
#
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
| |
| from pathlib import Path |
| from dataclasses import dataclass, field |
| from typing import Optional, TextIO |
| import sys |
| |
| |
@dataclass
class BreakTestItem:
    """Parsed contents of a single line of the grapheme break test data.

    All list fields default to a fresh, empty list per instance.
    """

    # First code point found after each break marker on the line.
    code_points: list[int] = field(default_factory=list)
    # Every code point of the line, concatenated as \UXXXXXXXX escapes.
    encoded: str = ""
    # Running code unit totals recorded at each break, one list per encoding.
    breaks_utf8: list[int] = field(default_factory=list)
    breaks_utf16: list[int] = field(default_factory=list)
    breaks_utf32: list[int] = field(default_factory=list)
| |
| |
class CommentLine:
    """Empty marker type; not referenced by the code visible in this file."""
| |
| |
class EOF:
    """Empty marker type; not referenced by the code visible in this file."""
| |
| |
def _expectWhitespace(input: TextIO, after: str) -> None:
    """Consume one character from ``input`` and require it to be whitespace."""
    following = input.read(1)
    if not following.isspace():
        raise ValueError(
            f"expected whitespace after {after!r}, got {following!r}"
        )


def parseBreakTestLine(input: TextIO) -> Optional[BreakTestItem]:
    """Parse one line of the grapheme break test data from ``input``.

    A data line is a sequence of hexadecimal code points separated by
    DIVISION SIGN (break) or MULTIPLICATION SIGN (no break) markers, each
    followed by whitespace, and terminated by a ``#`` comment or a newline.

    Args:
        input: Text stream to read from, one character at a time.

    Returns:
        The parsed item once the line is terminated by a ``#`` comment or a
        newline; comment-only and empty lines give an item with empty fields.
        ``None`` when the stream ends before the line is terminated.

    Raises:
        ValueError: If the line does not follow the expected format. Plain
            exceptions are used instead of ``assert`` so that the checks, and
            the reads they perform, also happen under ``python -O``.
    """
    result = BreakTestItem()
    # First code point seen since the previous break marker; -1 means none.
    code_point = -1
    # Running number of code units of the line so far, per encoding.
    utf8 = 0
    utf16 = 0
    utf32 = 0

    while True:
        c = input.read(1)
        if c == "\N{DIVISION SIGN}":
            # The line starts with a division sign, don't add it to the output.
            if code_point != -1:
                result.code_points.append(code_point)
                code_point = -1
                result.breaks_utf8.append(utf8)
                result.breaks_utf16.append(utf16)
                result.breaks_utf32.append(utf32)

            _expectWhitespace(input, c)
            continue
        if c == "\N{MULTIPLICATION SIGN}":
            _expectWhitespace(input, c)
            continue
        if c.isalnum():
            # Collect the remaining digits of the hexadecimal code point.
            digits = c
            while following := input.read(1):
                if following.isalnum():
                    digits += following
                    continue
                if not following.isspace():
                    raise ValueError(
                        f"unexpected character {following!r} after code point {digits!r}"
                    )
                break
            value = int(digits, base=16)
            if code_point == -1:
                code_point = value

            result.encoded += f"\\U{value:08x}"
            character = chr(value)
            utf8 += len(character.encode("utf-8"))
            # Since we only care about the number of code units the byte order
            # doesn't matter. The byte order is specified to avoid the BOM
            utf16 += len(character.encode("utf-16-le")) // 2
            utf32 += len(character.encode("utf-32-le")) // 4
            continue
        if c == "#":
            # Skip the rest of the comment, including the newline.
            input.readline()
            return result
        if c == "\n":
            return result
        if c == "":
            return None
        raise ValueError(f"unexpected character {c!r} in break test line")
| |
| |
# Skeleton of the generated C++ header, filled in with str.format():
#   {0} - number of test entries (the size of every std::array)
#   {1} - initializer rows for the UTF-8 data
#   {2} - initializer rows for the UTF-16 data
#   {3} - initializer rows for the UTF-32 data
# Literal C++ braces are doubled so that str.format() leaves them alone.
cpp_template = """// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_extended_grapheme_cluster_test.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H
#define LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H

#include <array>
#include <string_view>
#include <vector>

#include "test_macros.h"

template <class CharT>
struct data {{
/// The input to parse.
std::basic_string_view<CharT> input;

/// The first code point of all extended grapheme clusters in the input.
std::vector<char32_t> code_points;

/// The offset of the last code units of the extended grapheme clusters in the input.
///
/// The vector has the same number of entries as \\ref code_points.
std::vector<std::size_t> breaks;
}};

/// The data for UTF-8.
std::array<data<char>, {0}> data_utf8 = {{{{
{1}}}}};

/// The data for UTF-16.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
std::array<data<wchar_t>, {0}> data_utf16 = {{{{
{2}}}}};

/// The data for UTF-32.
///
/// Note that most of the data for the UTF-16 and UTF-32 are identical. However
/// since the size of the code units differ the breaks can contain different
/// values.
std::array<data<wchar_t>, {0}> data_utf32 = {{{{
{3}}}}};
#endif // TEST_HAS_NO_WIDE_CHARACTERS

#endif // LIBCXX_TEST_STD_UTILITIES_FORMAT_FORMAT_STRING_FORMAT_STRING_STD_EXTENDED_GRAPHEME_CLUSTER_H"""

# One initializer row of a data array: the string literal, the code point
# list and the break offset list, in that order.
cpp_test_data_line_template = " {{{}, {{{}}}, {{{}}}}}"
| |
| |
def lineToCppDataLineUtf8(line: BreakTestItem) -> str:
    """Format ``line`` as one initializer row using its UTF-8 break offsets.

    The encoded text is emitted as a narrow (unprefixed) string literal.
    """
    literal = f'"{line.encoded}"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf8))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
| |
| |
def lineToCppDataLineUtf16(line: BreakTestItem) -> str:
    """Format ``line`` as one initializer row using its UTF-16 break offsets.

    The encoded text is emitted as a wide (``L``-prefixed) string literal.
    """
    literal = f'L"{line.encoded}"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf16))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
| |
| |
def lineToCppDataLineUtf32(line: BreakTestItem) -> str:
    """Format ``line`` as one initializer row using its UTF-32 break offsets.

    The encoded text is emitted as a wide (``L``-prefixed) string literal.
    """
    literal = f'L"{line.encoded}"'
    code_points = ", ".join(map(str, line.code_points))
    breaks = ", ".join(map(str, line.breaks_utf32))
    return cpp_test_data_line_template.format(literal, code_points, breaks)
| |
| |
"""
Generate test data from "GraphemeBreakTest.txt".
This file can be downloaded from:
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
This script looks for GraphemeBreakTest.txt in the "data/unicode" directory
located next to this script.
"""
| |
| |
def generate_all() -> str:
    """Build the complete generated C++ header as a string.

    The input is read from ``data/unicode/GraphemeBreakTest.txt`` below the
    directory that contains this script.

    Returns:
        ``cpp_template`` filled in with the entry count and the UTF-8,
        UTF-16 and UTF-32 initializer rows.
    """
    script_dir = Path(__file__).absolute().parent
    test_data_path = script_dir / "data" / "unicode" / "GraphemeBreakTest.txt"

    items = []
    with open(test_data_path, mode="rt", encoding="utf-8") as file:
        while True:
            item = parseBreakTestLine(file)
            if not item:
                # The parser returns None at the end of the file.
                break
            # Lines without any code point contribute no test entry.
            if item.encoded:
                items.append(item)

    utf8_rows = ",\n".join(lineToCppDataLineUtf8(item) for item in items)
    utf16_rows = ",\n".join(lineToCppDataLineUtf16(item) for item in items)
    utf32_rows = ",\n".join(lineToCppDataLineUtf32(item) for item in items)
    return cpp_template.format(len(items), utf8_rows, utf16_rows, utf32_rows)
| |
| |
if __name__ == "__main__":
    # Usage: script.py [OUTPUT_FILE]; without exactly one argument the header
    # is printed to standard output.
    # Generate first so a failure does not truncate an existing output file.
    generated = generate_all()
    if len(sys.argv) == 2:
        # Write through a context manager so the file is flushed and closed,
        # instead of rebinding sys.stdout to a handle that is never closed.
        with open(sys.argv[1], "w") as output:
            print(generated, file=output)
    else:
        print(generated)