tools/cygprofile/symbol_extractor.py - chromium/src - Git at Google

 # Copyright 2015 The Chromium Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Utilities to get and manipulate symbols from a binary."""

 import collections
 import json
 import logging
 import os
 import re
 import subprocess
 import sys

 import cygprofile_utils

 # Name of the symbol that _SymbolInfosFromStream always keeps, regardless of
 # its type, and reports in the '.text' section with size 0.
 START_OF_TEXT_SYMBOL = 'linker_script_start_of_text'

 # Absolute path of the directory two levels above the one holding this file.
 _SRC_PATH = os.path.abspath(os.path.join(
     os.path.dirname(__file__), os.pardir, os.pardir))
 # Path prefix shared by the LLVM tools; callers append the tool name to it
 # (e.g. 'readelf', 'cxxfilt').
 _TOOL_PREFIX = os.path.join(_SRC_PATH, 'third_party', 'llvm-build',
                             'Release+Asserts', 'bin', 'llvm-')

 # Handed to cygprofile_utils.WarningCollector by CreateNameToSymbolInfo;
 # presumably the maximum number of warnings it prints -- confirm in
 # cygprofile_utils.
 _MAX_WARNINGS_TO_PRINT = 200

 # Record describing one symbol: its name, offset, size and section name.
 SymbolInfo = collections.namedtuple('SymbolInfo', ('name', 'offset', 'size',
                                                    'section'))


 def _SymbolInfosFromStream(input_file):
   """Builds SymbolInfo records from llvm-readelf JSON symbol output.

   Only symbols in the '.text' section whose type is not 'None' are kept, plus
   the START_OF_TEXT_SYMBOL label, which is always reported in '.text' with
   size 0. Names seen at more than one offset are summarized in one warning.

   Args:
     input_file: a file handle yielding the JSON written by llvm-readelf.

   Returns:
     A list of SymbolInfo.
   """
   # The top-level JSON value is a list with one dictionary per input file;
   # exactly one input file is expected.
   per_file_entries = json.load(input_file)
   assert len(per_file_entries) == 1
   file_entry = per_file_entries[0]
   # Each per-file dictionary has two sections: FileSummary and Symbols.
   assert 'Symbols' in file_entry

   offsets_by_name = collections.defaultdict(list)
   result = []

   for entry in file_entry['Symbols']:
     fields = entry['Symbol']
     name = fields['Name']['Name']
     offset = fields['Value']
     size = fields['Size']
     section = fields['Section']['Name']
     binding = fields['Binding']['Name']

     if name == START_OF_TEXT_SYMBOL:
       # This label carries the earliest offset; it is needed later for
       # translating offsets from the profile dumps, so it is always emitted.
       result.append(
           SymbolInfo(name=name, offset=offset, size=0, section='.text'))
       continue

     kind = fields['Type']['Name']
     # 'None'-typed entries are local goto labels (v8 builtins such as
     # 'Builtins_.*' cannot be told apart from size-0 labels except by name).
     # Anything outside the primary .text section is ignored as well.
     if kind == 'None' or section != '.text':
       continue

     assert kind in ['Object', 'Function', 'File', 'GNU_IFunc']
     assert binding in ['Local', 'Global', 'Weak']
     # Reject ARM mapping symbols and other unexpected names. '$' is allowed
     # after the first character because Clang can emit it inside mangled
     # names, e.g. the lambda _ZZL11get_globalsvENK3$_1clEv (GCC mangles the
     # same function as _ZZL11get_globalsvENKUlvE_clEv). Disallowing spaces
     # also rejects .internal/.protected style prefixes, which have not
     # appeared in the binaries parsed so far.
     assert re.match('^[a-zA-Z0-9_.][a-zA-Z0-9_.$]*$', name), name

     info = SymbolInfo(name=name, offset=offset, size=size, section=section)
     # On ARM, LLD inserts long-jump thunks that commonly live at several
     # offsets; they are not instrumented and never reach the orderfiles, so
     # they are left out of the repeated-symbol bookkeeping.
     is_thunk = name.startswith('__ThumbV7PILongThunk_')
     if not is_thunk:
       offsets_by_name[name].append(offset)
     result.append(info)

   # Outlined functions are known to repeat often; skip them when counting.
   repeated = [
       name for name, offsets in offsets_by_name.items()
       if not name.startswith('OUTLINED_FUNCTION_') and len(offsets) > 1
   ]
   if repeated:
     # Report the first 5 offsets of the first 10 repeated symbols.
     report_lines = []
     for name in repeated[:10]:
       shown = ' '.join(str(offset) for offset in offsets_by_name[name][:5])
       report_lines.append('{} {}'.format(name, shown))
     logging.warning('%d symbols repeated with multiple offsets:\n %s',
                     len(repeated), '\n '.join(report_lines))

   return result


 def SymbolInfosFromBinary(binary_filename):
   """Runs llvm-readelf to get all the symbols from a binary.

   Args:
     binary_filename: path to the binary.

   Returns:
     A list of SymbolInfo from the binary.

   Raises:
     OSError: if llvm-readelf cannot be started; the failure is logged first.
   """
   command = [
       _TOOL_PREFIX + 'readelf', '--syms', '--elf-output-style=JSON',
       '--pretty-print', binary_filename
   ]
   try:
     p = subprocess.Popen(command,
                          stdout=subprocess.PIPE,
                          universal_newlines=True)
   except OSError:
     logging.error('Failed to execute the command: path=%s, binary_filename=%s',
                   command[0], binary_filename)
     # A bare raise re-raises the active exception with its traceback intact.
     raise

   # Leaving the Popen context closes the stdout pipe and waits for the
   # process, also when parsing raises; previously the pipe was never closed.
   with p:
     return _SymbolInfosFromStream(p.stdout)

 # Matches one line of llvm-nm output: an 8 to 16 character address field made
 # of lowercase hex digits or '-', one symbol-type character, then the symbol
 # name. The pattern is compiled with re.VERBOSE, which ignores whitespace
 # outside character classes, so the separator spaces are spelled as [ ].
 _LLVM_NM_LINE_RE = re.compile(
     r'^[\-0-9a-f]{8,16}[ ](?P<symbol_type>.)[ ](?P<name>.*)$', re.VERBOSE)


 def _SymbolInfosFromLlvmNm(lines):
   """Extracts symbol names from llvm-nm output lines.

   Every line has to match _LLVM_NM_LINE_RE. A name is kept only when the
   symbol type on its line is one of 't', 'T', 'w' or 'W'.

   Args:
     lines: Iterable of lines.

   Returns:
     [str] A list of symbol names, can be empty.
   """
   wanted_types = ['t', 'T', 'w', 'W']
   names = []
   for line in lines:
     parsed = _LLVM_NM_LINE_RE.match(line)
     # An unparseable line means the output format is not what we expect.
     assert parsed is not None, line
     if parsed.group('symbol_type') in wanted_types:
       names.append(parsed.group('name'))
   return names


 # Path of the llvm-nm executable below _SRC_PATH; checked by CheckLlvmNmExists
 # and run by SymbolNamesFromLlvmBitcodeFile.
 _NM_PATH = os.path.join(_SRC_PATH, 'third_party', 'llvm-build',
                         'Release+Asserts', 'bin', 'llvm-nm')


 def CheckLlvmNmExists():
   """Verifies that the llvm-nm binary is present at _NM_PATH.

   Raises:
     AssertionError: if _NM_PATH does not exist. It is raised explicitly
       instead of through an assert statement so that the check is not
       stripped when Python runs with -O.
   """
   if not os.path.exists(_NM_PATH):
     raise AssertionError(
         'llvm-nm not found. Please run '
         '//tools/clang/scripts/update.py --package=objdump to install it.')


 def SymbolNamesFromLlvmBitcodeFile(filename):
   """Extracts all defined symbols names from an LLVM bitcode file.

   Args:
     filename: (str) File to parse.

   Returns:
     [str] A list of symbol names, can be empty.

   Raises:
     AssertionError: if llvm-nm exits with a non-zero status.
   """
   command = (_NM_PATH, '--defined-only', filename)
   # Text mode is required: _SymbolInfosFromLlvmNm matches each line against a
   # str pattern, which raises TypeError on the bytes a binary pipe yields.
   p = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE, universal_newlines=True)
   # communicate() drains stdout and stderr together. Reading only stdout
   # while stderr is piped can block forever once the stderr pipe fills up.
   stdout, stderr = p.communicate()
   if p.returncode != 0:
     # Same exception type as the former assert, but not stripped under -O
     # and no longer raised from a finally clause where it hid other errors.
     raise AssertionError('llvm-nm failed with exit code %d on %s: %s' %
                          (p.returncode, filename, stderr.strip()))

   result = _SymbolInfosFromLlvmNm(stdout.splitlines())
   if not result:
     file_size = os.stat(filename).st_size
     logging.warning('No symbols for %s (size %d)', filename, file_size)
   return result


 def GroupSymbolInfosByOffset(symbol_infos):
   """Buckets symbols by their offset.

   Several symbols may share one offset, so every offset maps to a list that
   keeps the symbols in the order they were supplied.

   Args:
     symbol_infos: iterable of SymbolInfo instances

   Returns:
     a dict {offset: [symbol_info1, ...], ...}
   """
   grouped = {}
   for info in symbol_infos:
     grouped.setdefault(info.offset, []).append(info)
   return grouped


 def GroupSymbolInfosByName(symbol_infos):
   """Buckets symbols by their name.

   One name may occur at several offsets, so every name maps to a list that
   keeps the symbols in the order they were supplied.

   Args:
     symbol_infos: iterable of SymbolInfo instances

   Returns:
     a dict {name: [symbol_info1, ...], ...}
   """
   grouped = {}
   for info in symbol_infos:
     grouped.setdefault(info.name, []).append(info)
   return grouped


 def CreateNameToSymbolInfo(symbol_infos):
   """Maps every symbol name to a single SymbolInfo.

   A warning is handed to a cygprofile_utils.WarningCollector for each name
   that occurs more than once.

   Args:
     symbol_infos: iterable of SymbolInfo instances

   Returns:
     a dict {name: symbol_info, ...}
     When a name has more than one symbol_info, the one with the lowest
     offset is used.
   """
   # TODO(lizeb,pasko): move the functionality in this method into
   # check_orderfile.
   collector = cygprofile_utils.WarningCollector(_MAX_WARNINGS_TO_PRINT)
   by_name = {}
   for candidates in GroupSymbolInfosByName(symbol_infos).values():
     lowest = min(candidates, key=lambda info: info.offset)
     by_name[lowest.name] = lowest
     if len(candidates) <= 1:
       continue
     offsets = ','.join([hex(info.offset) for info in candidates])
     collector.Write('Symbol %s appears at %d offsets: %s' %
                     (lowest.name, len(candidates), offsets))
   collector.WriteEnd('symbols at multiple offsets.')
   return by_name


 def DemangleSymbol(mangled_symbol):
   """Runs llvm-cxxfilt on mangled_symbol; returns its right-stripped output."""
   cxxfilt = _TOOL_PREFIX + 'cxxfilt'
   output = subprocess.check_output([cxxfilt, mangled_symbol],
                                    universal_newlines=True)
   return output.rstrip()
	# Copyright 2015 The Chromium Authors
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""Utilities to get and manipulate symbols from a binary."""

	import collections
	import json
	import logging
	import os
	import re
	import subprocess
	import sys

	import cygprofile_utils

	# Name of the symbol that _SymbolInfosFromStream always keeps, regardless of
	# its type, and reports in the '.text' section with size 0.
	START_OF_TEXT_SYMBOL = 'linker_script_start_of_text'

	# Absolute path of the directory two levels above the one holding this file.
	_SRC_PATH = os.path.abspath(os.path.join(
	os.path.dirname(__file__), os.pardir, os.pardir))
	# Path prefix shared by the LLVM tools; callers append the tool name to it
	# (e.g. 'readelf', 'cxxfilt').
	_TOOL_PREFIX = os.path.join(_SRC_PATH, 'third_party', 'llvm-build',
	'Release+Asserts', 'bin', 'llvm-')

	# Handed to cygprofile_utils.WarningCollector by CreateNameToSymbolInfo;
	# presumably the maximum number of warnings it prints -- confirm in
	# cygprofile_utils.
	_MAX_WARNINGS_TO_PRINT = 200

	# Record describing one symbol: its name, offset, size and section name.
	SymbolInfo = collections.namedtuple('SymbolInfo', ('name', 'offset', 'size',
	'section'))


	def _SymbolInfosFromStream(input_file):
	  """Parses the output of llvm-readelf, and gets all the symbols from a binary.

	  Only symbols in the '.text' section whose type is not 'None' are kept, plus
	  the START_OF_TEXT_SYMBOL label, which is always reported in '.text' with
	  size 0. Names seen at more than one offset are summarized in one warning.

	  Args:
	    input_file: a .json file handle containing the readelf output.

	  Returns:
	    A list of SymbolInfo.
	  """
	  # Load the JSON output. The file is structured as a list containing
	  # dictionaries, one per input file; exactly one is expected.
	  raw_symbols = json.load(input_file)
	  assert len(raw_symbols) == 1
	  raw_symbols = raw_symbols[0]
	  # Next have two sections: FileSummary and Symbols
	  assert 'Symbols' in raw_symbols
	  raw_symbols = raw_symbols['Symbols']

	  name_to_offsets = collections.defaultdict(list)
	  symbol_infos = []

	  for symbol in raw_symbols:
	    symbol = symbol['Symbol']
	    name = symbol['Name']['Name']
	    offset = symbol['Value']
	    size = symbol['Size']
	    section = symbol['Section']['Name']
	    scope = symbol['Binding']['Name']
	    # Output the label that contains the earliest offset. It is needed later
	    # for translating offsets from the profile dumps.
	    if name == START_OF_TEXT_SYMBOL:
	      symbol_infos.append(
	          SymbolInfo(name=name, offset=offset, section='.text', size=0))
	      continue
	    # Check symbol type for validity and ignore some types.
	    symbol_type = symbol['Type']['Name']
	    if symbol_type == 'None':
	      # Ignore local goto labels. Unfortunately, v8 builtins (like
	      # 'Builtins_.*') are indistinguishable from labels of size 0 other
	      # than by name.
	      continue
	    if section != '.text':
	      # Ignore anything that's outside the primary .text section
	      continue
	    assert symbol_type in ['Object', 'Function', 'File', 'GNU_IFunc']
	    assert scope in ['Local', 'Global', 'Weak']
	    # Forbid ARM mapping symbols and other unexpected symbol names, but allow
	    # $ characters in a non-initial position, which can appear as a component
	    # of a mangled name, e.g. Clang can mangle a lambda function to
	    # _ZZL11get_globalsvENK3$_1clEv where GCC emits
	    # _ZZL11get_globalsvENKUlvE_clEv.
	    #
	    # Also disallow .internal and .protected symbols (as well as other
	    # flags), those have not appeared in the binaries we parse. Rejecting
	    # these extra prefixes is done by disallowing spaces in symbol names.
	    assert re.match('^[a-zA-Z0-9_.][a-zA-Z0-9_.$]*$', name), name

	    symbol_info = SymbolInfo(name=name,
	                             offset=offset,
	                             section=section,
	                             size=size)
	    # On ARM the LLD linker inserts pseudo-functions (thunks) that allow
	    # jumping distances farther than 16 MiB. Such thunks are known to often
	    # reside on multiple offsets, they are not instrumented and hence they do
	    # not reach the orderfiles. Exclude the thunk symbols from the warning.
	    if not symbol_info.name.startswith('__ThumbV7PILongThunk_'):
	      name_to_offsets[symbol_info.name].append(symbol_info.offset)
	    symbol_infos.append(symbol_info)

	  # Outlined functions are known to be repeated often, so ignore them in the
	  # repeated symbol count.
	  repeated_symbols = [
	      name for name, offsets in name_to_offsets.items()
	      if not name.startswith('OUTLINED_FUNCTION_') and len(offsets) > 1
	  ]
	  if repeated_symbols:
	    # Log the first 5 repeated offsets of the first 10 repeated symbols.
	    logging.warning('%d symbols repeated with multiple offsets:\n %s',
	                    len(repeated_symbols), '\n '.join(
	                        '{} {}'.format(sym, ' '.join(
	                            str(offset) for offset in name_to_offsets[sym][:5]))
	                        for sym in repeated_symbols[:10]))

	  return symbol_infos


	def SymbolInfosFromBinary(binary_filename):
	  """Runs llvm-readelf to get all the symbols from a binary.

	  Args:
	    binary_filename: path to the binary.

	  Returns:
	    A list of SymbolInfo from the binary.

	  Raises:
	    OSError: if llvm-readelf cannot be started; the failure is logged first.
	  """
	  command = [
	      _TOOL_PREFIX + 'readelf', '--syms', '--elf-output-style=JSON',
	      '--pretty-print', binary_filename
	  ]
	  try:
	    p = subprocess.Popen(command,
	                         stdout=subprocess.PIPE,
	                         universal_newlines=True)
	  except OSError:
	    logging.error('Failed to execute the command: path=%s, binary_filename=%s',
	                  command[0], binary_filename)
	    # A bare raise re-raises the active exception with its traceback intact.
	    raise

	  # Leaving the Popen context closes the stdout pipe and waits for the
	  # process, also when parsing raises; previously the pipe was never closed.
	  with p:
	    return _SymbolInfosFromStream(p.stdout)


	# Matches one line of llvm-nm output: an 8 to 16 character address field made
	# of lowercase hex digits or '-', one symbol-type character, then the symbol
	# name. The pattern is compiled with re.VERBOSE, which ignores whitespace
	# outside character classes, so the separator spaces are spelled as [ ].
	_LLVM_NM_LINE_RE = re.compile(
	r'^[\-0-9a-f]{8,16}[ ](?P<symbol_type>.)[ ](?P<name>.*)$', re.VERBOSE)


	def _SymbolInfosFromLlvmNm(lines):
	  """Extracts all defined symbols names from llvm-nm output.

	  Every line has to match _LLVM_NM_LINE_RE. A name is kept only when the
	  symbol type on its line is one of 't', 'T', 'w' or 'W'.

	  Args:
	    lines: Iterable of lines.

	  Returns:
	    [str] A list of symbol names, can be empty.
	  """
	  symbol_names = []
	  for line in lines:
	    m = _LLVM_NM_LINE_RE.match(line)
	    # An unparseable line means the output format is not what we expect.
	    assert m is not None, line
	    if m.group('symbol_type') not in ['t', 'T', 'w', 'W']:
	      continue
	    symbol_names.append(m.group('name'))
	  return symbol_names


	# Path of the llvm-nm executable below _SRC_PATH; checked by CheckLlvmNmExists
	# and run by SymbolNamesFromLlvmBitcodeFile.
	_NM_PATH = os.path.join(_SRC_PATH, 'third_party', 'llvm-build',
	'Release+Asserts', 'bin', 'llvm-nm')


	def CheckLlvmNmExists():
	  """Verifies that the llvm-nm binary is present at _NM_PATH.

	  Raises:
	    AssertionError: if _NM_PATH does not exist. It is raised explicitly
	      instead of through an assert statement so that the check is not
	      stripped when Python runs with -O.
	  """
	  if not os.path.exists(_NM_PATH):
	    raise AssertionError(
	        'llvm-nm not found. Please run '
	        '//tools/clang/scripts/update.py --package=objdump to install it.')


	def SymbolNamesFromLlvmBitcodeFile(filename):
	  """Extracts all defined symbols names from an LLVM bitcode file.

	  Args:
	    filename: (str) File to parse.

	  Returns:
	    [str] A list of symbol names, can be empty.

	  Raises:
	    AssertionError: if llvm-nm exits with a non-zero status.
	  """
	  command = (_NM_PATH, '--defined-only', filename)
	  # Text mode is required: _SymbolInfosFromLlvmNm matches each line against a
	  # str pattern, which raises TypeError on the bytes a binary pipe yields.
	  p = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE,
	                       stderr=subprocess.PIPE, universal_newlines=True)
	  # communicate() drains stdout and stderr together. Reading only stdout
	  # while stderr is piped can block forever once the stderr pipe fills up.
	  stdout, stderr = p.communicate()
	  if p.returncode != 0:
	    # Same exception type as the former assert, but not stripped under -O
	    # and no longer raised from a finally clause where it hid other errors.
	    raise AssertionError('llvm-nm failed with exit code %d on %s: %s' %
	                         (p.returncode, filename, stderr.strip()))

	  result = _SymbolInfosFromLlvmNm(stdout.splitlines())
	  if not result:
	    file_size = os.stat(filename).st_size
	    logging.warning('No symbols for %s (size %d)', filename, file_size)
	  return result


	def GroupSymbolInfosByOffset(symbol_infos):
	  """Create a dict {offset: [symbol_info1, ...], ...}.

	  As several symbols can be at the same offset, this is a 1-to-many
	  relationship. Symbols keep the order in which they were supplied.

	  Args:
	    symbol_infos: iterable of SymbolInfo instances

	  Returns:
	    a dict {offset: [symbol_info1, ...], ...}
	  """
	  offset_to_symbol_infos = collections.defaultdict(list)
	  for symbol_info in symbol_infos:
	    offset_to_symbol_infos[symbol_info.offset].append(symbol_info)
	  # Hand back a plain dict so lookups of unknown offsets raise KeyError.
	  return dict(offset_to_symbol_infos)


	def GroupSymbolInfosByName(symbol_infos):
	  """Create a dict {name: [symbol_info1, ...], ...}.

	  A symbol can have several offsets, this is a 1-to-many relationship.
	  Symbols keep the order in which they were supplied.

	  Args:
	    symbol_infos: iterable of SymbolInfo instances

	  Returns:
	    a dict {name: [symbol_info1, ...], ...}
	  """
	  name_to_symbol_infos = collections.defaultdict(list)
	  for symbol_info in symbol_infos:
	    name_to_symbol_infos[symbol_info.name].append(symbol_info)
	  # Hand back a plain dict so lookups of unknown names raise KeyError.
	  return dict(name_to_symbol_infos)


	def CreateNameToSymbolInfo(symbol_infos):
	  """Create a dict {name: symbol_info, ...}.

	  A warning is handed to a cygprofile_utils.WarningCollector for each name
	  that occurs more than once.

	  Args:
	    symbol_infos: iterable of SymbolInfo instances

	  Returns:
	    a dict {name: symbol_info, ...}
	    If a symbol name corresponds to more than one symbol_info, the
	    symbol_info with the lowest offset is chosen.
	  """
	  # TODO(lizeb,pasko): move the functionality in this method into
	  # check_orderfile.
	  symbol_infos_by_name = {}
	  warnings = cygprofile_utils.WarningCollector(_MAX_WARNINGS_TO_PRINT)
	  for infos in GroupSymbolInfosByName(symbol_infos).values():
	    first_symbol_info = min(infos, key=lambda x: x.offset)
	    symbol_infos_by_name[first_symbol_info.name] = first_symbol_info
	    if len(infos) > 1:
	      warnings.Write('Symbol %s appears at %d offsets: %s' %
	                     (first_symbol_info.name,
	                      len(infos),
	                      ','.join([hex(x.offset) for x in infos])))
	  warnings.WriteEnd('symbols at multiple offsets.')
	  return symbol_infos_by_name


	def DemangleSymbol(mangled_symbol):
	  """Return the demangled form of mangled_symbol.

	  Runs llvm-cxxfilt with mangled_symbol as its only argument and returns the
	  tool's output with trailing whitespace removed.
	  """
	  cmd = [_TOOL_PREFIX + 'cxxfilt', mangled_symbol]
	  return subprocess.check_output(cmd, universal_newlines=True).rstrip()