scripts/extract_sqlite_api.py - chromium/src/third_party/sqlite - Git at Google

 #!/usr/bin/env python
 #
 # Copyright 2018 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 '''
 Parses SQLite source code and produces renaming macros for its exported symbols.

 Usage:
     extract_sqlite_api.py sqlite.h rename_macros.h

 For example, the following renaming macro is produced for sqlite3_initialize().

     #define sqlite3_initialize chrome_sqlite3_initialize
 '''

 import re
 import sys

 def ExtractLineTuples(string):
   '''Splits text into numbered lines with surrounding whitespace removed.

   The text is broken on newline characters only. The result is a list of
   (line number, stripped line) tuples, with line numbers starting at 1.
   '''
   return [(number, text.strip())
           for number, text in enumerate(string.split('\n'), start=1)]

 def ExtractPreprocessorDirectives(lines):
   '''Extracts preprocessor directives from lines of C code.

   Each input line should be a tuple of (line number, string). Only the first
   character of the string is checked for #, so the strings are expected to
   have their leading whitespace stripped already.

   A directive starts on a line that begins with # and extends over every
   following line for as long as the previous line ends in a backslash. The
   continuation backslashes are dropped and the pieces of a multi-line
   directive are joined by newlines.

   Returns a tuple of (directives, code_lines). directives is a list of
   strings. code_lines holds the input tuples that are not part of any
   directive, in their original order.
   '''
   code_lines = []
   directives = []
   in_directive = False
   last_directive = []
   for line_tuple in lines:
     line = line_tuple[1]
     # Preprocessor directives start with #.
     if not in_directive and line.startswith('#'):
       in_directive = True
       last_directive = []

     if not in_directive:
       code_lines.append(line_tuple)
       continue

     # Preprocessor directives use \ as a line continuation character.
     # endswith() is used instead of line[-1] because a continuation may be
     # followed by an empty line, which has no last character to index.
     if line.endswith('\\'):
       line = line[:-1]
     else:
       in_directive = False
     last_directive.append(line)

     if not in_directive:
       directives.append('\n'.join(last_directive))

   # The input ended in the middle of a continued directive. Keep what was
   # collected instead of silently dropping it.
   if in_directive:
     directives.append('\n'.join(last_directive))

   return directives, code_lines


 # Pattern for an object-like macro definition; group 1 captures the macro name.
 DEFINITION_RE = re.compile(r'^\#\s*define\s+(\w+)(\s|$)')

 def ExtractDefineMacroName(line):
   '''Returns the name defined by an object-like #define directive.

   The name must be followed by whitespace or the end of the directive, so
   function-like macros do not match. None is returned for them and for any
   directive that is not a #define.
   '''
   definition = DEFINITION_RE.match(line)
   return definition.group(1) if definition is not None else None


 # Matches a C++-style // comment running to the end of the string.
 SINGLE_LINE_COMMENT_RE = re.compile(r'//.*$')
 # Matches a C-style /* comment */ that is opened and closed in the same string.
 MULTI_LINE_COMMENT_RE = re.compile(r'/\*.*?\*/', flags=re.MULTILINE|re.DOTALL)
 def RemoveLineComments(line):
   '''Strips the comments that are fully contained in one line of C code.

   C-style /* comments */ are deleted first, then a C++-style // comment. A
   C-style comment that is opened but not closed within the line is not
   matched by the C-style pattern; callers must deal with it themselves.
   '''
   without_block_comments = MULTI_LINE_COMMENT_RE.sub('', line)
   return SINGLE_LINE_COMMENT_RE.sub('', without_block_comments)


 def RemoveComments(code_tuples):
   '''Strips all C and C++ comments from (line number, line) tuples.

   Comments contained in a single line are removed by RemoveLineComments().
   Lines that fall entirely inside a multi-line /* comment */ are left out of
   the result, so the returned list may be shorter than the input. The line
   numbers of the surviving tuples are unchanged.
   '''
   result = []
   inside_block_comment = False
   for line_number, text in code_tuples:
     if inside_block_comment:
       if '*/' not in text:
         # The whole line belongs to the open comment.
         continue
       # Keep only what follows the end of the comment.
       text = text.split('*/', 1)[1]
       inside_block_comment = False

     text = RemoveLineComments(text)
     if '/*' in text:
       # A comment opens here and continues on the following lines.
       text = text.split('/*', 1)[0]
       inside_block_comment = True
     result.append((line_number, text))
   return result


 # Splits a line of C code into statement pieces.
 STATEMENT_BREAK_RE = re.compile(r'[;{}]')

 def ToStatementTuples(code_tuples):
   '''Converts C code lines into statements.

   The input is tuples of (line number, line code string). The output is
   tuples of (min line, max line, statement), where the statement has its
   surrounding whitespace stripped.

   The function considers ; { and } to be statement separators. This is
   sufficiently correct, given our goal. Text after the last separator in the
   input is an unfinished statement and is not returned.
   '''
   statements = []
   current_statement = ''
   current_start = 0

   for line_number, line in code_tuples:
     pieces = STATEMENT_BREAK_RE.split(line)
     for piece in pieces[:-1]:  # The last piece is an unfinished statement.
       if current_statement != '':
         current_statement = current_statement + '\n' + piece
         statements.append(
             (current_start, line_number, current_statement.strip()))
         current_statement = ''
       else:
         statements.append((line_number, line_number, piece.strip()))

     unfinished = pieces[-1]
     if current_statement == '':
       # Whitespace left behind by a removed comment must not open a statement,
       # or the next statement would be reported as starting on this line.
       if unfinished.strip() == '':
         continue
       current_start = line_number
     if unfinished != '':
       current_statement = current_statement + '\n' + unfinished

   return statements


 # Used to break down a statement into words.
 WHITESPACE_RE = re.compile(r'\s+')

 # Keywords that the extractor refuses to parse.
 #
 # Declarations spelled with struct, enum or union literals are not handled;
 # sqlite typedefs such types before using them in exported symbols. Leaving
 # them out also means curly braces never need to be matched: only typedef
 # names and primitive types have to be understood.
 UNSUPPORTED_KEYWORDS = {'enum', 'struct', 'union', 'typedef'}

 # Storage-class specifiers and type qualifiers.
 #
 # None of them helps to find where the declaration specifiers end, so they
 # are skipped wherever they appear. That includes qualifiers attached to
 # pointers.
 QUALIFIER_KEYWORDS = {
   'extern', 'static', 'auto', 'register', 'const', 'volatile',
 }

 # Keywords that spell primitive types.
 #
 # A single type may use several of them, e.g. "long long int".
 COMPOSITE_TYPE_SPECIFIERS = {
   'char', 'short', 'int', 'long', 'float', 'double', 'signed', 'unsigned',
 }

 # Matches a word made only of identifier characters.
 IDENTIFIER_RE = re.compile(r'^[a-zA-Z_0-9]+$')

 def ExtractApiExport(macro_names, api_export_macro, statement):
   '''Finds the name of the symbol exported by a C declaration.

   A statement only counts as an export when api_export_macro is one of its
   words; for any other statement the result is None. Otherwise the words are
   scanned left to right: storage-class specifiers, type qualifiers and the
   names in macro_names are skipped, the type is consumed (either a run of
   primitive type keywords or one other identifier), and the next word is
   returned as the symbol name.

   Raises ValueError when the statement uses enum, struct, union or typedef,
   when a word that should be a type or symbol name is not an identifier, or
   when no symbol name is found.
   '''
   # The grammar of C declarations is in section 6.7 of
   # http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf. A declaration
   # is a run of declaration-specifiers followed by a list of declarators,
   # each with an optional initializer, as in:
   #
   # int a, b;
   #
   # Functions could be declared in pairs the same way, but nobody writes
   #
   # int foo(int), bar(int, int);
   #
   # so exactly one declarator is assumed. Per section 6.7.5 a declarator may
   # carry pointer specifiers (themselves possibly qualified, e.g. 'int
   # * const * const foo') and grouping, yet the declared name is always the
   # first identifier that is not a type qualifier.
   #
   # The job is therefore to step over the declaration specifiers and stop at
   # the declarator.

   # Pointer stars carry no information needed here; turn them into
   # separators. Opening parentheses and brackets become words of their own so
   # that they do not stick to the identifier in front of them.
   simplified = statement.replace('*', ' ')
   for opener in ('(', '['):
     simplified = simplified.replace(opener, ' %s ' % opener)

   words = WHITESPACE_RE.split(simplified)

   # Statements without the export macro are of no interest.
   if api_export_macro not in words:
     return None

   has_primitive_type = False
   has_named_type = False
   for word in words:
     if word in UNSUPPORTED_KEYWORDS:
       raise ValueError("Unsupported keyword %s" % word)

     if word in QUALIFIER_KEYWORDS:
       continue

     # Section 6.7.2 requires at least one type specifier. Clause 2 implies
     # that a typedef name is the only specifier of its declaration, whereas
     # keywords such as 'int' may come in groups.
     if word in COMPOSITE_TYPE_SPECIFIERS:
       if has_named_type:
         raise ValueError('Mixed simple (struct_name) and composite (int) types')
       has_primitive_type = True
       continue

     # Macros are assumed to stand for qualifiers only, so they are skipped.
     if word == api_export_macro or word in macro_names:
       continue

     is_identifier = IDENTIFIER_RE.match(word) is not None
     if has_primitive_type or has_named_type:
       # The type has been consumed, so this word must be the symbol.
       if not is_identifier:
         raise ValueError(
             "%s parsed as symbol name, which doesn't make sense" % word)
       return word

     # No type seen yet: this word is taken to be a typedef name.
     has_named_type = True
     if not is_identifier:
       raise ValueError(
           "%s parsed as type name, which doesn't make sense" % word)

   raise ValueError('Failed to find symbol name')


 def ExportedSymbolLine(symbol_prefix, symbol, statement_tuple):
   '''Builds the #define that renames one exported symbol.

   statement_tuple is (first line, last line, statement). The line range is
   appended to the macro as a trailing // comment.
   '''
   first_line, last_line = statement_tuple[0], statement_tuple[1]
   if first_line == last_line:
     location = 'Line %d' % first_line
   else:
     location = 'Lines %d-%d' % (first_line, last_line)
   return '#define %s %s%s  // %s' % (symbol, symbol_prefix, symbol, location)


 def ExportedExceptionLine(exception, statement_tuple):
   '''Returns an output line for a parsing failure.

   exception is the error raised while parsing; its string form is used as the
   description. statement_tuple is (first line, last line, statement).
   '''
   # Output a TODO without a name so the broken parsing result doesn't
   # accidentally get checked in.
   #
   # str() is used rather than exception.message, which Python 3 exceptions do
   # not have.
   return '// TODO: Lines %d-%d -- %s' % (
       statement_tuple[0], statement_tuple[1], str(exception))


 def ProcessSource(api_export_macro, symbol_prefix, header_line, footer_line,
                   file_content):
   '''Builds the lines of a header that renames a C file's exported symbols.

   file_content is the text of the C file. Each statement that contains
   api_export_macro yields a #define that prefixes its symbol with
   symbol_prefix; a statement that cannot be parsed yields a // TODO line
   instead.

   Returns a list made of header_line, the generated lines in sorted order,
   and footer_line.
   '''
   directives, code_tuples = ExtractPreprocessorDirectives(
       RemoveComments(ExtractLineTuples(file_content)))

   # Names of the macros defined by the file; ExtractApiExport() skips them.
   macro_names = set()
   for directive in directives:
     macro_name = ExtractDefineMacroName(directive)
     if macro_name is not None:
       macro_names.add(macro_name)

   renames = []
   for statement_tuple in ToStatementTuples(code_tuples):
     try:
       symbol = ExtractApiExport(
           macro_names, api_export_macro, statement_tuple[2])
     except ValueError as error:
       renames.append(ExportedExceptionLine(error, statement_tuple))
       continue
     if symbol:
       renames.append(
           ExportedSymbolLine(symbol_prefix, symbol, statement_tuple))

   return [header_line] + sorted(renames) + [footer_line]


 def ProcessSourceFile(api_export_macro, symbol_prefix, header_line,
                       footer_line, input_file, output_file):
   '''Generates the renaming header for one C file on disk.

   Reads input_file, runs ProcessSource() over its content, and writes the
   resulting lines to output_file, joined by newlines and followed by a final
   newline.
   '''
   with open(input_file, 'r') as source:
     file_content = source.read()

   output_lines = ProcessSource(api_export_macro, symbol_prefix, header_line,
                                footer_line, file_content)
   with open(output_file, 'w') as destination:
     destination.write('\n'.join(output_lines + ['']))

 # Opening of the generated header: license, provenance note and the start of
 # the include guard.
 header_line = '''// Copyright 2018 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 // This file is generated by extract_sqlite_api.py.

 #ifndef THIRD_PARTY_SQLITE_AMALGAMATION_RENAME_EXPORTS_H_
 #define THIRD_PARTY_SQLITE_AMALGAMATION_RENAME_EXPORTS_H_
 '''

 # Closing of the generated header: the end of the include guard.
 footer_line = '''
 #endif  // THIRD_PARTY_SQLITE_AMALGAMATION_RENAME_EXPORTS_H_
 '''

 if __name__ == '__main__':
   # Both file arguments are required; report the usage instead of failing
   # with an IndexError traceback when one is missing.
   if len(sys.argv) < 3:
     sys.exit('Usage: %s sqlite.h rename_macros.h' % sys.argv[0])
   ProcessSourceFile(api_export_macro='SQLITE_API', symbol_prefix='chrome_',
                     header_line=header_line, footer_line=footer_line,
                     input_file=sys.argv[1], output_file=sys.argv[2])
	#!/usr/bin/env python
	#
	# Copyright 2018 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	'''
	Parses SQLite source code and produces renaming macros for its exported symbols.

	Usage:
	extract_sqlite_api.py sqlite.h rename_macros.h

	For example, the following renaming macro is produced for sqlite3_initialize().

	#define sqlite3_initialize chrome_sqlite3_initialize
	'''

	import re
	import sys

	def ExtractLineTuples(string):
	  '''Splits text into numbered lines with surrounding whitespace removed.

	  The text is broken on newline characters only. The result is a list of
	  (line number, stripped line) tuples, with line numbers starting at 1.
	  '''
	  return [(number, text.strip())
	          for number, text in enumerate(string.split('\n'), start=1)]

	def ExtractPreprocessorDirectives(lines):
	  '''Extracts preprocessor directives from lines of C code.

	  Each input line should be a tuple of (line number, string). Only the first
	  character of the string is checked for #, so the strings are expected to
	  have their leading whitespace stripped already.

	  A directive starts on a line that begins with # and extends over every
	  following line for as long as the previous line ends in a backslash. The
	  continuation backslashes are dropped and the pieces of a multi-line
	  directive are joined by newlines.

	  Returns a tuple of (directives, code_lines). directives is a list of
	  strings. code_lines holds the input tuples that are not part of any
	  directive, in their original order.
	  '''
	  code_lines = []
	  directives = []
	  in_directive = False
	  last_directive = []
	  for line_tuple in lines:
	    line = line_tuple[1]
	    # Preprocessor directives start with #.
	    if not in_directive and line.startswith('#'):
	      in_directive = True
	      last_directive = []

	    if not in_directive:
	      code_lines.append(line_tuple)
	      continue

	    # Preprocessor directives use \ as a line continuation character.
	    # endswith() is used instead of line[-1] because a continuation may be
	    # followed by an empty line, which has no last character to index.
	    if line.endswith('\\'):
	      line = line[:-1]
	    else:
	      in_directive = False
	    last_directive.append(line)

	    if not in_directive:
	      directives.append('\n'.join(last_directive))

	  # The input ended in the middle of a continued directive. Keep what was
	  # collected instead of silently dropping it.
	  if in_directive:
	    directives.append('\n'.join(last_directive))

	  return directives, code_lines


	# Pattern for an object-like macro definition; group 1 captures the macro name.
	DEFINITION_RE = re.compile(r'^\#\s*define\s+(\w+)(\s|$)')

	def ExtractDefineMacroName(line):
	  '''Returns the name defined by an object-like #define directive.

	  The name must be followed by whitespace or the end of the directive, so
	  function-like macros do not match. None is returned for them and for any
	  directive that is not a #define.
	  '''
	  definition = DEFINITION_RE.match(line)
	  return definition.group(1) if definition is not None else None


	# Matches a C++-style // comment running to the end of the string.
	SINGLE_LINE_COMMENT_RE = re.compile(r'//.*$')
	# Matches a C-style /* comment */ that is opened and closed in the same string.
	MULTI_LINE_COMMENT_RE = re.compile(r'/\*.*?\*/', flags=re.MULTILINE|re.DOTALL)
	def RemoveLineComments(line):
	  '''Strips the comments that are fully contained in one line of C code.

	  C-style /* comments */ are deleted first, then a C++-style // comment. A
	  C-style comment that is opened but not closed within the line is not
	  matched by the C-style pattern; callers must deal with it themselves.
	  '''
	  without_block_comments = MULTI_LINE_COMMENT_RE.sub('', line)
	  return SINGLE_LINE_COMMENT_RE.sub('', without_block_comments)


	def RemoveComments(code_tuples):
	  '''Strips all C and C++ comments from (line number, line) tuples.

	  Comments contained in a single line are removed by RemoveLineComments().
	  Lines that fall entirely inside a multi-line /* comment */ are left out of
	  the result, so the returned list may be shorter than the input. The line
	  numbers of the surviving tuples are unchanged.
	  '''
	  result = []
	  inside_block_comment = False
	  for line_number, text in code_tuples:
	    if inside_block_comment:
	      if '*/' not in text:
	        # The whole line belongs to the open comment.
	        continue
	      # Keep only what follows the end of the comment.
	      text = text.split('*/', 1)[1]
	      inside_block_comment = False

	    text = RemoveLineComments(text)
	    if '/*' in text:
	      # A comment opens here and continues on the following lines.
	      text = text.split('/*', 1)[0]
	      inside_block_comment = True
	    result.append((line_number, text))
	  return result


	# Splits a line of C code into statement pieces.
	STATEMENT_BREAK_RE = re.compile(r'[;{}]')

	def ToStatementTuples(code_tuples):
	  '''Converts C code lines into statements.

	  The input is tuples of (line number, line code string). The output is
	  tuples of (min line, max line, statement), where the statement has its
	  surrounding whitespace stripped.

	  The function considers ; { and } to be statement separators. This is
	  sufficiently correct, given our goal. Text after the last separator in the
	  input is an unfinished statement and is not returned.
	  '''
	  statements = []
	  current_statement = ''
	  current_start = 0

	  for line_number, line in code_tuples:
	    pieces = STATEMENT_BREAK_RE.split(line)
	    for piece in pieces[:-1]:  # The last piece is an unfinished statement.
	      if current_statement != '':
	        current_statement = current_statement + '\n' + piece
	        statements.append(
	            (current_start, line_number, current_statement.strip()))
	        current_statement = ''
	      else:
	        statements.append((line_number, line_number, piece.strip()))

	    unfinished = pieces[-1]
	    if current_statement == '':
	      # Whitespace left behind by a removed comment must not open a statement,
	      # or the next statement would be reported as starting on this line.
	      if unfinished.strip() == '':
	        continue
	      current_start = line_number
	    if unfinished != '':
	      current_statement = current_statement + '\n' + unfinished

	  return statements


	# Used to break down a statement into words.
	WHITESPACE_RE = re.compile(r'\s+')

	# Keywords that the extractor refuses to parse.
	#
	# Declarations spelled with struct, enum or union literals are not handled;
	# sqlite typedefs such types before using them in exported symbols. Leaving
	# them out also means curly braces never need to be matched: only typedef
	# names and primitive types have to be understood.
	UNSUPPORTED_KEYWORDS = {'enum', 'struct', 'union', 'typedef'}

	# Storage-class specifiers and type qualifiers.
	#
	# None of them helps to find where the declaration specifiers end, so they
	# are skipped wherever they appear. That includes qualifiers attached to
	# pointers.
	QUALIFIER_KEYWORDS = {
	  'extern', 'static', 'auto', 'register', 'const', 'volatile',
	}

	# Keywords that spell primitive types.
	#
	# A single type may use several of them, e.g. "long long int".
	COMPOSITE_TYPE_SPECIFIERS = {
	  'char', 'short', 'int', 'long', 'float', 'double', 'signed', 'unsigned',
	}

	# Matches a word made only of identifier characters.
	IDENTIFIER_RE = re.compile(r'^[a-zA-Z_0-9]+$')

	def ExtractApiExport(macro_names, api_export_macro, statement):
	  '''Finds the name of the symbol exported by a C declaration.

	  A statement only counts as an export when api_export_macro is one of its
	  words; for any other statement the result is None. Otherwise the words are
	  scanned left to right: storage-class specifiers, type qualifiers and the
	  names in macro_names are skipped, the type is consumed (either a run of
	  primitive type keywords or one other identifier), and the next word is
	  returned as the symbol name.

	  Raises ValueError when the statement uses enum, struct, union or typedef,
	  when a word that should be a type or symbol name is not an identifier, or
	  when no symbol name is found.
	  '''
	  # The grammar of C declarations is in section 6.7 of
	  # http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf. A declaration
	  # is a run of declaration-specifiers followed by a list of declarators,
	  # each with an optional initializer, as in:
	  #
	  # int a, b;
	  #
	  # Functions could be declared in pairs the same way, but nobody writes
	  #
	  # int foo(int), bar(int, int);
	  #
	  # so exactly one declarator is assumed. Per section 6.7.5 a declarator may
	  # carry pointer specifiers (themselves possibly qualified, e.g. 'int
	  # * const * const foo') and grouping, yet the declared name is always the
	  # first identifier that is not a type qualifier.
	  #
	  # The job is therefore to step over the declaration specifiers and stop at
	  # the declarator.

	  # Pointer stars carry no information needed here; turn them into
	  # separators. Opening parentheses and brackets become words of their own so
	  # that they do not stick to the identifier in front of them.
	  simplified = statement.replace('*', ' ')
	  for opener in ('(', '['):
	    simplified = simplified.replace(opener, ' %s ' % opener)

	  words = WHITESPACE_RE.split(simplified)

	  # Statements without the export macro are of no interest.
	  if api_export_macro not in words:
	    return None

	  has_primitive_type = False
	  has_named_type = False
	  for word in words:
	    if word in UNSUPPORTED_KEYWORDS:
	      raise ValueError("Unsupported keyword %s" % word)

	    if word in QUALIFIER_KEYWORDS:
	      continue

	    # Section 6.7.2 requires at least one type specifier. Clause 2 implies
	    # that a typedef name is the only specifier of its declaration, whereas
	    # keywords such as 'int' may come in groups.
	    if word in COMPOSITE_TYPE_SPECIFIERS:
	      if has_named_type:
	        raise ValueError('Mixed simple (struct_name) and composite (int) types')
	      has_primitive_type = True
	      continue

	    # Macros are assumed to stand for qualifiers only, so they are skipped.
	    if word == api_export_macro or word in macro_names:
	      continue

	    is_identifier = IDENTIFIER_RE.match(word) is not None
	    if has_primitive_type or has_named_type:
	      # The type has been consumed, so this word must be the symbol.
	      if not is_identifier:
	        raise ValueError(
	            "%s parsed as symbol name, which doesn't make sense" % word)
	      return word

	    # No type seen yet: this word is taken to be a typedef name.
	    has_named_type = True
	    if not is_identifier:
	      raise ValueError(
	          "%s parsed as type name, which doesn't make sense" % word)

	  raise ValueError('Failed to find symbol name')


	def ExportedSymbolLine(symbol_prefix, symbol, statement_tuple):
	  '''Builds the #define that renames one exported symbol.

	  statement_tuple is (first line, last line, statement). The line range is
	  appended to the macro as a trailing // comment, separated from the macro
	  by two spaces.
	  '''
	  first_line, last_line = statement_tuple[0], statement_tuple[1]
	  if first_line == last_line:
	    location = 'Line %d' % first_line
	  else:
	    location = 'Lines %d-%d' % (first_line, last_line)
	  return '#define %s %s%s  // %s' % (symbol, symbol_prefix, symbol, location)


	def ExportedExceptionLine(exception, statement_tuple):
	  '''Returns an output line for a parsing failure.

	  exception is the error raised while parsing; its string form is used as the
	  description. statement_tuple is (first line, last line, statement).
	  '''
	  # Output a TODO without a name so the broken parsing result doesn't
	  # accidentally get checked in.
	  #
	  # str() is used rather than exception.message, which Python 3 exceptions do
	  # not have.
	  return '// TODO: Lines %d-%d -- %s' % (
	      statement_tuple[0], statement_tuple[1], str(exception))


	def ProcessSource(api_export_macro, symbol_prefix, header_line, footer_line,
	                  file_content):
	  '''Builds the lines of a header that renames a C file's exported symbols.

	  file_content is the text of the C file. Each statement that contains
	  api_export_macro yields a #define that prefixes its symbol with
	  symbol_prefix; a statement that cannot be parsed yields a // TODO line
	  instead.

	  Returns a list made of header_line, the generated lines in sorted order,
	  and footer_line.
	  '''
	  directives, code_tuples = ExtractPreprocessorDirectives(
	      RemoveComments(ExtractLineTuples(file_content)))

	  # Names of the macros defined by the file; ExtractApiExport() skips them.
	  macro_names = set()
	  for directive in directives:
	    macro_name = ExtractDefineMacroName(directive)
	    if macro_name is not None:
	      macro_names.add(macro_name)

	  renames = []
	  for statement_tuple in ToStatementTuples(code_tuples):
	    try:
	      symbol = ExtractApiExport(
	          macro_names, api_export_macro, statement_tuple[2])
	    except ValueError as error:
	      renames.append(ExportedExceptionLine(error, statement_tuple))
	      continue
	    if symbol:
	      renames.append(
	          ExportedSymbolLine(symbol_prefix, symbol, statement_tuple))

	  return [header_line] + sorted(renames) + [footer_line]


	def ProcessSourceFile(api_export_macro, symbol_prefix, header_line,
	                      footer_line, input_file, output_file):
	  '''Generates the renaming header for one C file on disk.

	  Reads input_file, runs ProcessSource() over its content, and writes the
	  resulting lines to output_file, joined by newlines and followed by a final
	  newline.
	  '''
	  with open(input_file, 'r') as source:
	    file_content = source.read()

	  output_lines = ProcessSource(api_export_macro, symbol_prefix, header_line,
	                               footer_line, file_content)
	  with open(output_file, 'w') as destination:
	    destination.write('\n'.join(output_lines + ['']))

	# Opening of the generated header: license, provenance note and the start of
	# the include guard.
	header_line = '''// Copyright 2018 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	// This file is generated by extract_sqlite_api.py.

	#ifndef THIRD_PARTY_SQLITE_AMALGAMATION_RENAME_EXPORTS_H_
	#define THIRD_PARTY_SQLITE_AMALGAMATION_RENAME_EXPORTS_H_
	'''

	# Closing of the generated header: the end of the include guard.
	footer_line = '''
	#endif  // THIRD_PARTY_SQLITE_AMALGAMATION_RENAME_EXPORTS_H_
	'''

	if __name__ == '__main__':
	  # Both file arguments are required; report the usage instead of failing
	  # with an IndexError traceback when one is missing.
	  if len(sys.argv) < 3:
	    sys.exit('Usage: %s sqlite.h rename_macros.h' % sys.argv[0])
	  ProcessSourceFile(api_export_macro='SQLITE_API', symbol_prefix='chrome_',
	                    header_line=header_line, footer_line=footer_line,
	                    input_file=sys.argv[1], output_file=sys.argv[2])