blob: b8155ce9a17a6d7c5294f68598c0787712f877d0 [file] [log] [blame]
#!/usr/bin/env python3
# Copyright 2018 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from __future__ import annotations
import sys
import subprocess
import re
import math
from pathlib import Path
from typing import List, Union
INPUT_PATH = Path("src/parsing/keywords.txt")
OUTPUT_PATH = Path("src/parsing/keywords-gen.h")
# TODO(leszeks): Trimming seems to regress performance, investigate.
TRIM_CHAR_TABLE: bool = False
def next_power_of_2(x):
return 1 if x == 0 else 2**int(math.ceil(math.log(x, 2)))
def call_with_input(cmd: List[Union[str, Path]], input_string: str = "") -> str:
p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
stdout, _ = p.communicate(input_string.encode())
retcode = p.wait()
if retcode != 0:
raise subprocess.CalledProcessError(retcode, cmd)
return stdout.decode()
def checked_sub(pattern: Union[str, re.Pattern[str]],
sub: str,
out: str,
count: int = 1,
flags: int = 0) -> str:
out, n = re.subn(pattern, sub, out, flags=flags)
if n != count:
raise Exception("Expected %d and got %d replacement(s) for pattern: %s" %
(count, n, pattern))
return out
def change_sizet_to_int(out: str) -> str:
# Literal buffer lengths are given as ints, not size_t
return checked_sub(r'\bsize_t\b', 'int', out, count=4)
def drop_line_directives(out: str) -> str:
# #line causes gcov issue, so drop it
return re.sub(r'^#\s*line .*$\n', '', out, flags=re.MULTILINE)
def trim_and_dcheck_char_table(out: str) -> str:
# Potential keyword strings are known to be lowercase ascii, so chop off the
# rest of the table and mask out the char
reads_re = re.compile(
r'asso_values\[static_cast<unsigned char>\((str\[\w+\](\+\d)?)\)\]')
dchecks = []
for str_read in reads_re.finditer(out):
dchecks.append("DCHECK_LT(%s, 129);" % str_read.group(1))
if TRIM_CHAR_TABLE:
out = checked_sub(
r'static const unsigned char asso_values\[\]\s*=\s*\{(\s*\d+\s*,){96}',
"".join(dchecks) + r'static const unsigned char asso_values[32] = {',
out,
flags=re.MULTILINE)
out = checked_sub(
reads_re.pattern,
r'asso_values[static_cast<unsigned char>(str[(\1)]&31)]',
out,
count=len(dchecks),
flags=re.MULTILINE)
else:
out = checked_sub(
r'static const unsigned char asso_values\[\]\s*=\s*\{',
"".join(dchecks) + r'static const unsigned char asso_values[129] = {',
out,
flags=re.MULTILINE)
return out
def use_isinrange(out: str) -> str:
# Our IsInRange method is more efficient than checking for min/max length
return checked_sub(r'if \(len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH\)',
r'if (base::IsInRange(len, MIN_WORD_LENGTH, '
+ r'MAX_WORD_LENGTH))',
out)
def pad_tables(out: str) -> str:
# We don't want to compare against the max hash value, so pad the tables up
# to a power of two and mask the hash.
# First get the new size
max_hash_value = int(re.search(r'MAX_HASH_VALUE\s*=\s*(\d+)', out).group(1))
old_table_length = max_hash_value + 1
new_table_length = next_power_of_2(old_table_length)
table_padding_len = new_table_length - old_table_length
# Pad the length table.
single_lengthtable_entry = r'\d+'
out = checked_sub(
r"""
static\ const\ unsigned\ char\ kPerfectKeywordLengthTable\[\]\s*=\s*\{
(
\s*%(single_lengthtable_entry)s\s*
(?:,\s*%(single_lengthtable_entry)s\s*)*
)
\}
""" % {'single_lengthtable_entry': single_lengthtable_entry},
r'static const unsigned char kPerfectKeywordLengthTable[%d] = { \1 %s }'
% (new_table_length, "".join([',0'] * table_padding_len)),
out,
flags=re.MULTILINE | re.VERBOSE)
# Pad the word list.
single_wordlist_entry = r"""
(?:\#line\ \d+\ ".*"$\s*)?
\{\s*"[a-z]*"\s*,\s*Token::[a-zA-Z_]+\}
"""
out = checked_sub(
r"""
static\ const\ struct\ PerfectKeywordHashTableEntry\ kPerfectKeywordHashTable\[\]\s*=\s*\{
(
\s*%(single_wordlist_entry)s\s*
(?:,\s*%(single_wordlist_entry)s\s*)*
)
\}
""" % {'single_wordlist_entry': single_wordlist_entry},
r'static const struct PerfectKeywordHashTableEntry kPerfectKeywordHashTable[%d] = {\1 %s }'
% (new_table_length, "".join(
[',{"",Token::kIdentifier}'] * table_padding_len)),
out,
flags=re.MULTILINE | re.VERBOSE)
# Mask the hash and replace the range check with DCHECKs.
out = checked_sub(r'Hash\s*\(\s*str,\s*len\s*\)',
r'Hash(str, len)&0x%x' % (new_table_length - 1), out)
out = checked_sub(
r'if \(key <= MAX_HASH_VALUE\)',
r'DCHECK_LT(key, arraysize(kPerfectKeywordLengthTable));DCHECK_LT(key, arraysize(kPerfectKeywordHashTable));',
out)
return out
def return_token(out: str) -> str:
# We want to return the actual token rather than the table entry.
# Change the return type of the function. Make it inline too.
out = checked_sub(
r'const\s*struct\s*PerfectKeywordHashTableEntry\s*\*\s*((?:PerfectKeywordHash::)?GetToken)',
r'inline Token::Value \1',
out,
count=2)
# Change the return value when the keyword is found
out = checked_sub(r'return &kPerfectKeywordHashTable\[key\];',
r'return kPerfectKeywordHashTable[key].value;', out)
# Change the return value when the keyword is not found
out = checked_sub(r'return 0;', r'return Token::kIdentifier;', out)
return out
def memcmp_to_while(out: str) -> str:
# It's faster to loop over the keyword with a while loop than calling memcmp.
# Careful, this replacement is quite flaky, because otherwise the regex is
# unreadable.
return checked_sub(
re.escape("if (*str == *s && !memcmp (str + 1, s + 1, len - 1))") +
r"\s*" + re.escape("return kPerfectKeywordHashTable[key].value;"),
"""
while(*s!=0) {
if (*s++ != *str++) return Token::kIdentifier;
}
return kPerfectKeywordHashTable[key].value;
""",
out,
flags=re.MULTILINE)
def wrap_namespace(out):
return """// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file is automatically generated by gen-keywords-gen-h.py and should not
// be modified manually.
#ifndef V8_PARSING_KEYWORDS_GEN_H_
#define V8_PARSING_KEYWORDS_GEN_H_
#include "src/parsing/token.h"
namespace v8 {
namespace internal {
%s
} // namespace internal
} // namespace v8
#endif // V8_PARSING_KEYWORDS_GEN_H_
""" % (out)
def trim_character_set_warning(out: str) -> str:
# gperf generates an error message that is too large, trim it
return out.replace(
'"gperf generated tables don\'t work with this execution character set. Please report a bug to <bug-gperf@gnu.org>."',
'"gperf generated tables don\'t work with this execution character set."\\\n// If you see this error, please report a bug to <bug-gperf@gnu.org>.'
)
def main():
try:
script_dir = Path(sys.argv[0]).parent
root_dir = script_dir.parent
out: str = subprocess.check_output(["gperf", "-m100", INPUT_PATH],
cwd=root_dir,
encoding="UTF-8")
# And now some munging of the generated file.
out = change_sizet_to_int(out)
out = drop_line_directives(out)
out = trim_and_dcheck_char_table(out)
out = use_isinrange(out)
out = pad_tables(out)
out = return_token(out)
out = memcmp_to_while(out)
out = wrap_namespace(out)
out = trim_character_set_warning(out)
# Final formatting.
clang_format_path = root_dir / 'third_party/depot_tools/clang-format'
out = call_with_input([clang_format_path], out)
with (root_dir / OUTPUT_PATH).open('w') as f:
f.write(out)
return 0
except subprocess.CalledProcessError as e:
sys.stderr.write("Error calling '{}'\n".format(" ".join(e.cmd)))
return e.returncode
if __name__ == '__main__':
sys.exit(main())