tools/gen-keywords-gen-h.py - v8/v8.git - Git at Google

 #!/usr/bin/env python3
 # Copyright 2018 the V8 project authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 from __future__ import annotations

 import sys
 import subprocess
 import re
 import math
 from pathlib import Path
 from typing import List, Union

 INPUT_PATH = Path("src/parsing/keywords.txt")
 OUTPUT_PATH = Path("src/parsing/keywords-gen.h")

 # TODO(leszeks): Trimming seems to regress performance, investigate.
 TRIM_CHAR_TABLE: bool = False


 def next_power_of_2(x):
   return 1 if x == 0 else 2**int(math.ceil(math.log(x, 2)))


 def call_with_input(cmd: List[Union[str, Path]], input_string: str = "") -> str:
   p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
   stdout, _ = p.communicate(input_string.encode())
   retcode = p.wait()
   if retcode != 0:
     raise subprocess.CalledProcessError(retcode, cmd)
   return stdout.decode()


 def checked_sub(pattern: Union[str, re.Pattern[str]],
                 sub: str,
                 out: str,
                 count: int = 1,
                 flags: int = 0) -> str:
   out, n = re.subn(pattern, sub, out, flags=flags)
   if n != count:
     raise Exception("Expected %d and got %d replacement(s) for pattern: %s" %
                     (count, n, pattern))
   return out


 def change_sizet_to_int(out: str) -> str:
   # Literal buffer lengths are given as ints, not size_t
   return checked_sub(r'\bsize_t\b', 'int', out, count=4)


 def drop_line_directives(out: str) -> str:
   # #line causes gcov issue, so drop it
   return re.sub(r'^#\s*line .*$\n', '', out, flags=re.MULTILINE)


 def trim_and_dcheck_char_table(out: str) -> str:
   # Potential keyword strings are known to be lowercase ascii, so chop off the
   # rest of the table and mask out the char

   reads_re = re.compile(
       r'asso_values\[static_cast<unsigned char>\((str\[\w+\](\+\d)?)\)\]')

   dchecks = []
   for str_read in reads_re.finditer(out):
     dchecks.append("DCHECK_LT(%s, 129);" % str_read.group(1))

   if TRIM_CHAR_TABLE:
     out = checked_sub(
         r'static const unsigned char asso_values\[\]\s*=\s*\{(\s*\d+\s*,){96}',
         "".join(dchecks) + r'static const unsigned char asso_values[32] = {',
         out,
         flags=re.MULTILINE)
     out = checked_sub(
         reads_re.pattern,
         r'asso_values[static_cast<unsigned char>(str[(\1)]&31)]',
         out,
         count=len(dchecks),
         flags=re.MULTILINE)
   else:
     out = checked_sub(
         r'static const unsigned char asso_values\[\]\s*=\s*\{',
         "".join(dchecks) + r'static const unsigned char asso_values[129] = {',
         out,
         flags=re.MULTILINE)

   return out


 def use_isinrange(out: str) -> str:
   # Our IsInRange method is more efficient than checking for min/max length
   return checked_sub(r'if \(len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH\)',
                      r'if (base::IsInRange(len, MIN_WORD_LENGTH, '
                      + r'MAX_WORD_LENGTH))',
                      out)


 def pad_tables(out: str) -> str:
   # We don't want to compare against the max hash value, so pad the tables up
   # to a power of two and mask the hash.

   # First get the new size
   max_hash_value = int(re.search(r'MAX_HASH_VALUE\s*=\s*(\d+)', out).group(1))
   old_table_length = max_hash_value + 1
   new_table_length = next_power_of_2(old_table_length)
   table_padding_len = new_table_length - old_table_length

   # Pad the length table.
   single_lengthtable_entry = r'\d+'
   out = checked_sub(
       r"""
       static\ const\ unsigned\ char\ kPerfectKeywordLengthTable\[\]\s*=\s*\{
         (
           \s*%(single_lengthtable_entry)s\s*
           (?:,\s*%(single_lengthtable_entry)s\s*)*
         )
       \}
     """ % {'single_lengthtable_entry': single_lengthtable_entry},
       r'static const unsigned char kPerfectKeywordLengthTable[%d] = { \1 %s }'
       % (new_table_length, "".join([',0'] * table_padding_len)),
       out,
       flags=re.MULTILINE | re.VERBOSE)

   # Pad the word list.
   single_wordlist_entry = r"""
       (?:\#line\ \d+\ ".*"$\s*)?
       \{\s*"[a-z]*"\s*,\s*Token::[a-zA-Z_]+\}
     """
   out = checked_sub(
       r"""
       static\ const\ struct\ PerfectKeywordHashTableEntry\ kPerfectKeywordHashTable\[\]\s*=\s*\{
         (
           \s*%(single_wordlist_entry)s\s*
           (?:,\s*%(single_wordlist_entry)s\s*)*
         )
       \}
     """ % {'single_wordlist_entry': single_wordlist_entry},
       r'static const struct PerfectKeywordHashTableEntry kPerfectKeywordHashTable[%d] = {\1 %s }'
       % (new_table_length, "".join(
           [',{"",Token::kIdentifier}'] * table_padding_len)),
       out,
       flags=re.MULTILINE | re.VERBOSE)

   # Mask the hash and replace the range check with DCHECKs.
   out = checked_sub(r'Hash\s*\(\s*str,\s*len\s*\)',
                     r'Hash(str, len)&0x%x' % (new_table_length - 1), out)
   out = checked_sub(
       r'if \(key <= MAX_HASH_VALUE\)',
       r'DCHECK_LT(key, arraysize(kPerfectKeywordLengthTable));DCHECK_LT(key, arraysize(kPerfectKeywordHashTable));',
       out)

   return out


 def return_token(out: str) -> str:
   # We want to return the actual token rather than the table entry.

   # Change the return type of the function. Make it inline too.
   out = checked_sub(
       r'const\s*struct\s*PerfectKeywordHashTableEntry\s*\*\s*((?:PerfectKeywordHash::)?GetToken)',
       r'inline Token::Value \1',
       out,
       count=2)

   # Change the return value when the keyword is found
   out = checked_sub(r'return &kPerfectKeywordHashTable\[key\];',
                     r'return kPerfectKeywordHashTable[key].value;', out)

   # Change the return value when the keyword is not found
   out = checked_sub(r'return 0;', r'return Token::kIdentifier;', out)

   return out


 def memcmp_to_while(out: str) -> str:
   # It's faster to loop over the keyword with a while loop than calling memcmp.
   # Careful, this replacement is quite flaky, because otherwise the regex is
   # unreadable.
   return checked_sub(
       re.escape("if (*str == *s && !memcmp (str + 1, s + 1, len - 1))") +
       r"\s*" + re.escape("return kPerfectKeywordHashTable[key].value;"),
       """
       while(*s!=0) {
         if (*s++ != *str++) return Token::kIdentifier;
       }
       return kPerfectKeywordHashTable[key].value;
       """,
       out,
       flags=re.MULTILINE)


 def wrap_namespace(out):
   return """// Copyright 2018 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 // This file is automatically generated by gen-keywords-gen-h.py and should not
 // be modified manually.

 #ifndef V8_PARSING_KEYWORDS_GEN_H_
 #define V8_PARSING_KEYWORDS_GEN_H_

 #include "src/parsing/token.h"

 namespace v8 {
 namespace internal {

 %s

 }  // namespace internal
 }  // namespace v8

 #endif  // V8_PARSING_KEYWORDS_GEN_H_
 """ % (out)


 def trim_character_set_warning(out: str) -> str:
   # gperf generates an error message that is too large, trim it

   return out.replace(
       '"gperf generated tables don\'t work with this execution character set. Please report a bug to <bug-gperf@gnu.org>."',
       '"gperf generated tables don\'t work with this execution character set."\\\n// If you see this error, please report a bug to <bug-gperf@gnu.org>.'
   )


 def main():
   try:
     script_dir = Path(sys.argv[0]).parent
     root_dir = script_dir.parent

     out: str = subprocess.check_output(["gperf", "-m100", INPUT_PATH],
                                        cwd=root_dir,
                                        encoding="UTF-8")

     # And now some munging of the generated file.
     out = change_sizet_to_int(out)
     out = drop_line_directives(out)
     out = trim_and_dcheck_char_table(out)
     out = use_isinrange(out)
     out = pad_tables(out)
     out = return_token(out)
     out = memcmp_to_while(out)
     out = wrap_namespace(out)
     out = trim_character_set_warning(out)

     # Final formatting.
     clang_format_path = root_dir / 'third_party/depot_tools/clang-format'
     out = call_with_input([clang_format_path], out)

     with (root_dir / OUTPUT_PATH).open('w') as f:
       f.write(out)

     return 0

   except subprocess.CalledProcessError as e:
     sys.stderr.write("Error calling '{}'\n".format(" ".join(e.cmd)))
     return e.returncode


 if __name__ == '__main__':
   sys.exit(main())
	#!/usr/bin/env python3
	# Copyright 2018 the V8 project authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	from __future__ import annotations

	import sys
	import subprocess
	import re
	import math
	from pathlib import Path
	from typing import List, Union

	INPUT_PATH = Path("src/parsing/keywords.txt")
	OUTPUT_PATH = Path("src/parsing/keywords-gen.h")

	# TODO(leszeks): Trimming seems to regress performance, investigate.
	TRIM_CHAR_TABLE: bool = False


	def next_power_of_2(x):
	return 1 if x == 0 else 2**int(math.ceil(math.log(x, 2)))


	def call_with_input(cmd: List[Union[str, Path]], input_string: str = "") -> str:
	p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
	stdout, _ = p.communicate(input_string.encode())
	retcode = p.wait()
	if retcode != 0:
	raise subprocess.CalledProcessError(retcode, cmd)
	return stdout.decode()


	def checked_sub(pattern: Union[str, re.Pattern[str]],
	sub: str,
	out: str,
	count: int = 1,
	flags: int = 0) -> str:
	out, n = re.subn(pattern, sub, out, flags=flags)
	if n != count:
	raise Exception("Expected %d and got %d replacement(s) for pattern: %s" %
	(count, n, pattern))
	return out


	def change_sizet_to_int(out: str) -> str:
	# Literal buffer lengths are given as ints, not size_t
	return checked_sub(r'\bsize_t\b', 'int', out, count=4)


	def drop_line_directives(out: str) -> str:
	# #line causes gcov issue, so drop it
	return re.sub(r'^#\sline .$\n', '', out, flags=re.MULTILINE)


	def trim_and_dcheck_char_table(out: str) -> str:
	# Potential keyword strings are known to be lowercase ascii, so chop off the
	# rest of the table and mask out the char

	reads_re = re.compile(
	r'asso_values\[static_cast<unsigned char>\((str\[\w+\](\+\d)?)\)\]')

	dchecks = []
	for str_read in reads_re.finditer(out):
	dchecks.append("DCHECK_LT(%s, 129);" % str_read.group(1))

	if TRIM_CHAR_TABLE:
	out = checked_sub(
	r'static const unsigned char asso_values\[\]\s=\s\{(\s\d+\s,){96}',
	"".join(dchecks) + r'static const unsigned char asso_values[32] = {',
	out,
	flags=re.MULTILINE)
	out = checked_sub(
	reads_re.pattern,
	r'asso_values[static_cast<unsigned char>(str[(\1)]&31)]',
	out,
	count=len(dchecks),
	flags=re.MULTILINE)
	else:
	out = checked_sub(
	r'static const unsigned char asso_values\[\]\s=\s\{',
	"".join(dchecks) + r'static const unsigned char asso_values[129] = {',
	out,
	flags=re.MULTILINE)

	return out


	def use_isinrange(out: str) -> str:
	# Our IsInRange method is more efficient than checking for min/max length
	return checked_sub(r'if \(len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH\)',
	r'if (base::IsInRange(len, MIN_WORD_LENGTH, '
	+ r'MAX_WORD_LENGTH))',
	out)


	def pad_tables(out: str) -> str:
	# We don't want to compare against the max hash value, so pad the tables up
	# to a power of two and mask the hash.

	# First get the new size
	max_hash_value = int(re.search(r'MAX_HASH_VALUE\s=\s(\d+)', out).group(1))
	old_table_length = max_hash_value + 1
	new_table_length = next_power_of_2(old_table_length)
	table_padding_len = new_table_length - old_table_length

	# Pad the length table.
	single_lengthtable_entry = r'\d+'
	out = checked_sub(
	r"""
	static\ const\ unsigned\ char\ kPerfectKeywordLengthTable\[\]\s=\s\{
	(
	\s%(single_lengthtable_entry)s\s
	(?:,\s%(single_lengthtable_entry)s\s)*
	)
	\}
	""" % {'single_lengthtable_entry': single_lengthtable_entry},
	r'static const unsigned char kPerfectKeywordLengthTable[%d] = { \1 %s }'
	% (new_table_length, "".join([',0'] * table_padding_len)),
	out,
	flags=re.MULTILINE \| re.VERBOSE)

	# Pad the word list.
	single_wordlist_entry = r"""
	(?:\#line\ \d+\ "."$\s)?
	\{\s"[a-z]"\s,\sToken::[a-zA-Z_]+\}
	"""
	out = checked_sub(
	r"""
	static\ const\ struct\ PerfectKeywordHashTableEntry\ kPerfectKeywordHashTable\[\]\s=\s\{
	(
	\s%(single_wordlist_entry)s\s
	(?:,\s%(single_wordlist_entry)s\s)*
	)
	\}
	""" % {'single_wordlist_entry': single_wordlist_entry},
	r'static const struct PerfectKeywordHashTableEntry kPerfectKeywordHashTable[%d] = {\1 %s }'
	% (new_table_length, "".join(
	[',{"",Token::kIdentifier}'] * table_padding_len)),
	out,
	flags=re.MULTILINE \| re.VERBOSE)

	# Mask the hash and replace the range check with DCHECKs.
	out = checked_sub(r'Hash\s\(\sstr,\slen\s\)',
	r'Hash(str, len)&0x%x' % (new_table_length - 1), out)
	out = checked_sub(
	r'if \(key <= MAX_HASH_VALUE\)',
	r'DCHECK_LT(key, arraysize(kPerfectKeywordLengthTable));DCHECK_LT(key, arraysize(kPerfectKeywordHashTable));',
	out)

	return out


	def return_token(out: str) -> str:
	# We want to return the actual token rather than the table entry.

	# Change the return type of the function. Make it inline too.
	out = checked_sub(
	r'const\sstruct\sPerfectKeywordHashTableEntry\s\\s*((?:PerfectKeywordHash::)?GetToken)',
	r'inline Token::Value \1',
	out,
	count=2)

	# Change the return value when the keyword is found
	out = checked_sub(r'return &kPerfectKeywordHashTable\[key\];',
	r'return kPerfectKeywordHashTable[key].value;', out)

	# Change the return value when the keyword is not found
	out = checked_sub(r'return 0;', r'return Token::kIdentifier;', out)

	return out


	def memcmp_to_while(out: str) -> str:
	# It's faster to loop over the keyword with a while loop than calling memcmp.
	# Careful, this replacement is quite flaky, because otherwise the regex is
	# unreadable.
	return checked_sub(
	re.escape("if (str == s && !memcmp (str + 1, s + 1, len - 1))") +
	r"\s*" + re.escape("return kPerfectKeywordHashTable[key].value;"),
	"""
	while(*s!=0) {
	if (s++ != str++) return Token::kIdentifier;
	}
	return kPerfectKeywordHashTable[key].value;
	""",
	out,
	flags=re.MULTILINE)


	def wrap_namespace(out):
	return """// Copyright 2018 the V8 project authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	// This file is automatically generated by gen-keywords-gen-h.py and should not
	// be modified manually.

	#ifndef V8_PARSING_KEYWORDS_GEN_H_
	#define V8_PARSING_KEYWORDS_GEN_H_

	#include "src/parsing/token.h"

	namespace v8 {
	namespace internal {

	%s

	} // namespace internal
	} // namespace v8

	#endif // V8_PARSING_KEYWORDS_GEN_H_
	""" % (out)


	def trim_character_set_warning(out: str) -> str:
	# gperf generates an error message that is too large, trim it

	return out.replace(
	'"gperf generated tables don\'t work with this execution character set. Please report a bug to <bug-gperf@gnu.org>."',
	'"gperf generated tables don\'t work with this execution character set."\\\n// If you see this error, please report a bug to <bug-gperf@gnu.org>.'
	)


	def main():
	try:
	script_dir = Path(sys.argv[0]).parent
	root_dir = script_dir.parent

	out: str = subprocess.check_output(["gperf", "-m100", INPUT_PATH],
	cwd=root_dir,
	encoding="UTF-8")

	# And now some munging of the generated file.
	out = change_sizet_to_int(out)
	out = drop_line_directives(out)
	out = trim_and_dcheck_char_table(out)
	out = use_isinrange(out)
	out = pad_tables(out)
	out = return_token(out)
	out = memcmp_to_while(out)
	out = wrap_namespace(out)
	out = trim_character_set_warning(out)

	# Final formatting.
	clang_format_path = root_dir / 'third_party/depot_tools/clang-format'
	out = call_with_input([clang_format_path], out)

	with (root_dir / OUTPUT_PATH).open('w') as f:
	f.write(out)

	return 0

	except subprocess.CalledProcessError as e:
	sys.stderr.write("Error calling '{}'\n".format(" ".join(e.cmd)))
	return e.returncode


	if __name__ == '__main__':
	sys.exit(main())