tools/cygprofile/symbol_extractor.py - chromium/src.git - Git at Google

 # Copyright 2015 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Utilities to get and manipulate symbols from a binary."""

 import collections
 import logging
 import os
 import re
 import subprocess
 import sys

 import cygprofile_utils

 _SRC_PATH = os.path.abspath(os.path.join(
     os.path.dirname(__file__), os.pardir, os.pardir))

 sys.path.insert(0, os.path.join(_SRC_PATH, 'build', 'android'))
 from pylib.constants import host_paths

 _MAX_WARNINGS_TO_PRINT = 200

 SymbolInfo = collections.namedtuple('SymbolInfo', ('name', 'offset', 'size',
                                                    'section'))

 # Unfortunate global variable :-/
 _arch = 'arm'


 def SetArchitecture(arch):
   """Set the architecture for binaries to be symbolized."""
   global _arch
   _arch = arch


 # Regular expression to match lines printed by 'objdump -t -w'. An example of
 # such line looks like this:
 # 018db2de l     F .text  00000060              .hidden _ZN8SkBitmapC2ERKS_
 #
 # The regex intentionally allows matching more than valid inputs. This gives
 # more protection against potentially incorrectly silently ignoring unmatched
 # input lines. Instead a few assertions early in _FromObjdumpLine() check the
 # validity of a few parts matched as groups.
 _OBJDUMP_LINE_RE = re.compile(r'''
   # The offset of the function, as hex.
   (?P<offset>^[0-9a-f]+)

   # The space character.
   [ ]

   # The 7 groups of flag characters, one character each.
   (
     (?P<assert_scope>.)           # Global, local, unique local, etc.
     (?P<assert_weak_or_strong>.)
     (?P<assert_4spaces>.{4})      # Constructor, warning, indirect ref,
                                   # debugger symbol.
     (?P<symbol_type>.)            # Function, object, file or normal.
   )

   [ ]

   # The section name should start with ".text", can be ".text.foo". With LLD,
   # and especially LTO the traces of input sections are not preserved. Support
   # ".text.foo" for a little longer time because it is easy.
   (?P<section>.text[^0-9a-f]*)

   (?P<assert_tab> \s+)

   # The size of the symbol, as hex.
   (?P<size>[0-9a-f]+)

   # Normally separated out by 14 spaces, but some bits in ELF may theoretically
   # affect this length.
   (?P<assert_14spaces>[ ]+)

   # Hidden symbols should be treated as usual.
   (.hidden [ ])?

   # The symbol name.
   (?P<name>.*)

   $
   ''', re.VERBOSE)


 def _FromObjdumpLine(line):
   """Create a SymbolInfo by parsing a properly formatted objdump output line.

   Args:
     line: line from objdump

   Returns:
     An instance of SymbolInfo if the line represents a symbol, None otherwise.
   """
   m = _OBJDUMP_LINE_RE.match(line)
   if not m:
     return None

   assert m.group('assert_scope') in set(['g', 'l']), line
   assert m.group('assert_weak_or_strong') in set(['w', ' ']), line
   assert m.group('assert_tab') == '\t', line
   assert m.group('assert_4spaces') == ' ' * 4, line
   assert m.group('assert_14spaces') == ' ' * 14, line
   name = m.group('name')
   offset = int(m.group('offset'), 16)

   # Output the label that contains the earliest offset. It is needed later for
   # translating offsets from the profile dumps.
   if name == cygprofile_utils.START_OF_TEXT_SYMBOL:
     return SymbolInfo(name=name, offset=offset, section='.text', size=0)

   # Check symbol type for validity and ignore some types.
   # From objdump manual page: The symbol is the name of a function (F) or a file
   # (f) or an object (O) or just a normal symbol (a space). The 'normal' symbols
   # seens so far has been function-local labels.
   symbol_type = m.group('symbol_type')
   if symbol_type == ' ':
     # Ignore local goto labels. Unfortunately, v8 builtins (like 'Builtins_.*')
     # are indistinguishable from labels of size 0 other than by name.
     return None
   # Guard against file symbols, since they are normally not seen in the
   # binaries we parse.
   assert symbol_type != 'f', line

   # Extract the size from the ELF field. This value sometimes does not reflect
   # the real size of the function. One reason for that is the '.size' directive
   # in the assembler. As a result, a few functions in .S files have the size 0.
   # They are not instrumented (yet), but maintaining their order in the
   # orderfile may be important in some cases.
   size = int(m.group('size'), 16)

   # Forbid ARM mapping symbols and other unexpected symbol names, but allow $
   # characters in a non-initial position, which can appear as a component of a
   # mangled name, e.g. Clang can mangle a lambda function to:
   # 02cd61e0 l     F .text  000000c0 _ZZL11get_globalsvENK3$_1clEv
   # The equivalent objdump line from GCC is:
   # 0325c58c l     F .text  000000d0 _ZZL11get_globalsvENKUlvE_clEv
   #
   # Also disallow .internal and .protected symbols (as well as other flags),
   # those have not appeared in the binaries we parse. Rejecting these extra
   # prefixes is done by disallowing spaces in symbol names.
   assert re.match('^[a-zA-Z0-9_.][a-zA-Z0-9_.$]*$', name), name

   return SymbolInfo(name=name, offset=offset, section=m.group('section'),
                     size=size)


 def _SymbolInfosFromStream(objdump_lines):
   """Parses the output of objdump, and get all the symbols from a binary.

   Args:
     objdump_lines: An iterable of lines

   Returns:
     A list of SymbolInfo.
   """
   name_to_offsets = collections.defaultdict(list)
   symbol_infos = []
   for line in objdump_lines:
     symbol_info = _FromObjdumpLine(line.rstrip('\n'))
     if symbol_info is not None:
       # On ARM the LLD linker inserts pseudo-functions (thunks) that allow
       # jumping distances farther than 16 MiB. Such thunks are known to often
       # reside on multiple offsets, they are not instrumented and hence they do
       # not reach the orderfiles. Exclude the thunk symbols from the warning.
       if not symbol_info.name.startswith('__ThumbV7PILongThunk_'):
         name_to_offsets[symbol_info.name].append(symbol_info.offset)
       symbol_infos.append(symbol_info)

   # Outlined functions are known to be repeated often, so ignore them in the
   # repeated symbol count.
   repeated_symbols = filter(lambda s: len(name_to_offsets[s]) > 1,
                             (k for k in name_to_offsets.keys()
                              if not k.startswith('OUTLINED_FUNCTION_')))
   if repeated_symbols:
     # Log the first 5 repeated offsets of the first 10 repeated symbols.
     logging.warning('%d symbols repeated with multiple offsets:\n %s',
                     len(repeated_symbols), '\n '.join(
                         '{} {}'.format(sym, ' '.join(
                             str(offset) for offset in name_to_offsets[sym][:5]))
                         for sym in repeated_symbols[:10]))

   return symbol_infos


 def SymbolInfosFromBinary(binary_filename):
   """Runs objdump to get all the symbols from a binary.

   Args:
     binary_filename: path to the binary.

   Returns:
     A list of SymbolInfo from the binary.
   """
   command = (host_paths.ToolPath('objdump', _arch), '-t', '-w', binary_filename)
   p = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE)
   try:
     result = _SymbolInfosFromStream(p.stdout)
     return result
   finally:
     p.stdout.close()
     p.wait()


 _LLVM_NM_LINE_RE = re.compile(
     r'^[\-0-9a-f]{8,16}[ ](?P<symbol_type>.)[ ](?P<name>.*)$', re.VERBOSE)


 def _SymbolInfosFromLlvmNm(lines):
   """Extracts all defined symbols names from llvm-nm output.

   Only defined (weak and regular) symbols are extracted.

   Args:
     lines: Iterable of lines.

   Returns:
     [str] A list of symbol names, can be empty.
   """
   symbol_names = []
   for line in lines:
     m = _LLVM_NM_LINE_RE.match(line)
     assert m is not None, line
     if m.group('symbol_type') not in ['t', 'T', 'w', 'W']:
       continue
     symbol_names.append(m.group('name'))
   return symbol_names


 _NM_PATH = os.path.join(_SRC_PATH, 'third_party', 'llvm-build',
                         'Release+Asserts', 'bin', 'llvm-nm')


 def CheckLlvmNmExists():
   assert os.path.exists(_NM_PATH), (
       'llvm-nm not found. Please run //tools/clang/scripts/download_objdump.py'
       ' to install it.')


 def SymbolNamesFromLlvmBitcodeFile(filename):
   """Extracts all defined symbols names from an LLVM bitcode file.

   Args:
     filename: (str) File to parse.

   Returns:
     [str] A list of symbol names, can be empty.
   """
   command = (_NM_PATH, '-defined-only', filename)
   p = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
   try:
     result = _SymbolInfosFromLlvmNm(p.stdout)
     if not result:
       file_size = os.stat(filename).st_size
       logging.warning('No symbols for %s (size %d)', filename, file_size)
     return result
   finally:
     _, _ = p.communicate()
     p.stdout.close()
     assert p.wait() == 0


 def GroupSymbolInfosByOffset(symbol_infos):
   """Create a dict {offset: [symbol_info1, ...], ...}.

   As several symbols can be at the same offset, this is a 1-to-many
   relationship.

   Args:
     symbol_infos: iterable of SymbolInfo instances

   Returns:
     a dict {offset: [symbol_info1, ...], ...}
   """
   offset_to_symbol_infos = collections.defaultdict(list)
   for symbol_info in symbol_infos:
     offset_to_symbol_infos[symbol_info.offset].append(symbol_info)
   return dict(offset_to_symbol_infos)


 def GroupSymbolInfosByName(symbol_infos):
   """Create a dict {name: [symbol_info1, ...], ...}.

   A symbol can have several offsets, this is a 1-to-many relationship.

   Args:
     symbol_infos: iterable of SymbolInfo instances

   Returns:
     a dict {name: [symbol_info1, ...], ...}
   """
   name_to_symbol_infos = collections.defaultdict(list)
   for symbol_info in symbol_infos:
     name_to_symbol_infos[symbol_info.name].append(symbol_info)
   return dict(name_to_symbol_infos)


 def CreateNameToSymbolInfo(symbol_infos):
   """Create a dict {name: symbol_info, ...}.

   Args:
     symbol_infos: iterable of SymbolInfo instances

   Returns:
     a dict {name: symbol_info, ...}
     If a symbol name corresponds to more than one symbol_info, the symbol_info
     with the lowest offset is chosen.
   """
   # TODO(lizeb,pasko): move the functionality in this method into
   # check_orderfile.
   symbol_infos_by_name = {}
   warnings = cygprofile_utils.WarningCollector(_MAX_WARNINGS_TO_PRINT)
   for infos in GroupSymbolInfosByName(symbol_infos).itervalues():
     first_symbol_info = min(infos, key=lambda x: x.offset)
     symbol_infos_by_name[first_symbol_info.name] = first_symbol_info
     if len(infos) > 1:
       warnings.Write('Symbol %s appears at %d offsets: %s' %
                      (first_symbol_info.name,
                       len(infos),
                       ','.join([hex(x.offset) for x in infos])))
   warnings.WriteEnd('symbols at multiple offsets.')
   return symbol_infos_by_name


 def DemangleSymbol(mangled_symbol):
   """Return the demangled form of mangled_symbol."""
   cmd = [host_paths.ToolPath('c++filt', _arch)]
   process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
   demangled_symbol, _ = process.communicate(mangled_symbol + '\n')
   return demangled_symbol
	# Copyright 2015 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""Utilities to get and manipulate symbols from a binary."""

	import collections
	import logging
	import os
	import re
	import subprocess
	import sys

	import cygprofile_utils

	_SRC_PATH = os.path.abspath(os.path.join(
	os.path.dirname(__file__), os.pardir, os.pardir))

	sys.path.insert(0, os.path.join(_SRC_PATH, 'build', 'android'))
	from pylib.constants import host_paths

	_MAX_WARNINGS_TO_PRINT = 200

	SymbolInfo = collections.namedtuple('SymbolInfo', ('name', 'offset', 'size',
	'section'))

	# Unfortunate global variable :-/
	_arch = 'arm'


	def SetArchitecture(arch):
	"""Set the architecture for binaries to be symbolized."""
	global _arch
	_arch = arch


	# Regular expression to match lines printed by 'objdump -t -w'. An example of
	# such line looks like this:
	# 018db2de l F .text 00000060 .hidden _ZN8SkBitmapC2ERKS_
	#
	# The regex intentionally allows matching more than valid inputs. This gives
	# more protection against potentially incorrectly silently ignoring unmatched
	# input lines. Instead a few assertions early in _FromObjdumpLine() check the
	# validity of a few parts matched as groups.
	_OBJDUMP_LINE_RE = re.compile(r'''
	# The offset of the function, as hex.
	(?P<offset>^[0-9a-f]+)

	# The space character.
	[ ]

	# The 7 groups of flag characters, one character each.
	(
	(?P<assert_scope>.) # Global, local, unique local, etc.
	(?P<assert_weak_or_strong>.)
	(?P<assert_4spaces>.{4}) # Constructor, warning, indirect ref,
	# debugger symbol.
	(?P<symbol_type>.) # Function, object, file or normal.
	)

	[ ]

	# The section name should start with ".text", can be ".text.foo". With LLD,
	# and especially LTO the traces of input sections are not preserved. Support
	# ".text.foo" for a little longer time because it is easy.
	(?P<section>.text[^0-9a-f]*)

	(?P<assert_tab> \s+)

	# The size of the symbol, as hex.
	(?P<size>[0-9a-f]+)

	# Normally separated out by 14 spaces, but some bits in ELF may theoretically
	# affect this length.
	(?P<assert_14spaces>[ ]+)

	# Hidden symbols should be treated as usual.
	(.hidden [ ])?

	# The symbol name.
	(?P<name>.*)

	$
	''', re.VERBOSE)


	def _FromObjdumpLine(line):
	"""Create a SymbolInfo by parsing a properly formatted objdump output line.

	Args:
	line: line from objdump

	Returns:
	An instance of SymbolInfo if the line represents a symbol, None otherwise.
	"""
	m = _OBJDUMP_LINE_RE.match(line)
	if not m:
	return None

	assert m.group('assert_scope') in set(['g', 'l']), line
	assert m.group('assert_weak_or_strong') in set(['w', ' ']), line
	assert m.group('assert_tab') == '\t', line
	assert m.group('assert_4spaces') == ' ' * 4, line
	assert m.group('assert_14spaces') == ' ' * 14, line
	name = m.group('name')
	offset = int(m.group('offset'), 16)

	# Output the label that contains the earliest offset. It is needed later for
	# translating offsets from the profile dumps.
	if name == cygprofile_utils.START_OF_TEXT_SYMBOL:
	return SymbolInfo(name=name, offset=offset, section='.text', size=0)

	# Check symbol type for validity and ignore some types.
	# From objdump manual page: The symbol is the name of a function (F) or a file
	# (f) or an object (O) or just a normal symbol (a space). The 'normal' symbols
	# seens so far has been function-local labels.
	symbol_type = m.group('symbol_type')
	if symbol_type == ' ':
	# Ignore local goto labels. Unfortunately, v8 builtins (like 'Builtins_.*')
	# are indistinguishable from labels of size 0 other than by name.
	return None
	# Guard against file symbols, since they are normally not seen in the
	# binaries we parse.
	assert symbol_type != 'f', line

	# Extract the size from the ELF field. This value sometimes does not reflect
	# the real size of the function. One reason for that is the '.size' directive
	# in the assembler. As a result, a few functions in .S files have the size 0.
	# They are not instrumented (yet), but maintaining their order in the
	# orderfile may be important in some cases.
	size = int(m.group('size'), 16)

	# Forbid ARM mapping symbols and other unexpected symbol names, but allow $
	# characters in a non-initial position, which can appear as a component of a
	# mangled name, e.g. Clang can mangle a lambda function to:
	# 02cd61e0 l F .text 000000c0 _ZZL11get_globalsvENK3$_1clEv
	# The equivalent objdump line from GCC is:
	# 0325c58c l F .text 000000d0 _ZZL11get_globalsvENKUlvE_clEv
	#
	# Also disallow .internal and .protected symbols (as well as other flags),
	# those have not appeared in the binaries we parse. Rejecting these extra
	# prefixes is done by disallowing spaces in symbol names.
	assert re.match('^[a-zA-Z0-9_.][a-zA-Z0-9_.$]*$', name), name

	return SymbolInfo(name=name, offset=offset, section=m.group('section'),
	size=size)


	def _SymbolInfosFromStream(objdump_lines):
	"""Parses the output of objdump, and get all the symbols from a binary.

	Args:
	objdump_lines: An iterable of lines

	Returns:
	A list of SymbolInfo.
	"""
	name_to_offsets = collections.defaultdict(list)
	symbol_infos = []
	for line in objdump_lines:
	symbol_info = _FromObjdumpLine(line.rstrip('\n'))
	if symbol_info is not None:
	# On ARM the LLD linker inserts pseudo-functions (thunks) that allow
	# jumping distances farther than 16 MiB. Such thunks are known to often
	# reside on multiple offsets, they are not instrumented and hence they do
	# not reach the orderfiles. Exclude the thunk symbols from the warning.
	if not symbol_info.name.startswith('__ThumbV7PILongThunk_'):
	name_to_offsets[symbol_info.name].append(symbol_info.offset)
	symbol_infos.append(symbol_info)

	# Outlined functions are known to be repeated often, so ignore them in the
	# repeated symbol count.
	repeated_symbols = filter(lambda s: len(name_to_offsets[s]) > 1,
	(k for k in name_to_offsets.keys()
	if not k.startswith('OUTLINED_FUNCTION_')))
	if repeated_symbols:
	# Log the first 5 repeated offsets of the first 10 repeated symbols.
	logging.warning('%d symbols repeated with multiple offsets:\n %s',
	len(repeated_symbols), '\n '.join(
	'{} {}'.format(sym, ' '.join(
	str(offset) for offset in name_to_offsets[sym][:5]))
	for sym in repeated_symbols[:10]))

	return symbol_infos


	def SymbolInfosFromBinary(binary_filename):
	"""Runs objdump to get all the symbols from a binary.

	Args:
	binary_filename: path to the binary.

	Returns:
	A list of SymbolInfo from the binary.
	"""
	command = (host_paths.ToolPath('objdump', _arch), '-t', '-w', binary_filename)
	p = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE)
	try:
	result = _SymbolInfosFromStream(p.stdout)
	return result
	finally:
	p.stdout.close()
	p.wait()


	_LLVM_NM_LINE_RE = re.compile(
	r'^[\-0-9a-f]{8,16}[ ](?P<symbol_type>.)[ ](?P<name>.*)$', re.VERBOSE)


	def _SymbolInfosFromLlvmNm(lines):
	"""Extracts all defined symbols names from llvm-nm output.

	Only defined (weak and regular) symbols are extracted.

	Args:
	lines: Iterable of lines.

	Returns:
	[str] A list of symbol names, can be empty.
	"""
	symbol_names = []
	for line in lines:
	m = _LLVM_NM_LINE_RE.match(line)
	assert m is not None, line
	if m.group('symbol_type') not in ['t', 'T', 'w', 'W']:
	continue
	symbol_names.append(m.group('name'))
	return symbol_names


	_NM_PATH = os.path.join(_SRC_PATH, 'third_party', 'llvm-build',
	'Release+Asserts', 'bin', 'llvm-nm')


	def CheckLlvmNmExists():
	assert os.path.exists(_NM_PATH), (
	'llvm-nm not found. Please run //tools/clang/scripts/download_objdump.py'
	' to install it.')


	def SymbolNamesFromLlvmBitcodeFile(filename):
	"""Extracts all defined symbols names from an LLVM bitcode file.

	Args:
	filename: (str) File to parse.

	Returns:
	[str] A list of symbol names, can be empty.
	"""
	command = (_NM_PATH, '-defined-only', filename)
	p = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE,
	stderr=subprocess.PIPE)
	try:
	result = _SymbolInfosFromLlvmNm(p.stdout)
	if not result:
	file_size = os.stat(filename).st_size
	logging.warning('No symbols for %s (size %d)', filename, file_size)
	return result
	finally:
	_, _ = p.communicate()
	p.stdout.close()
	assert p.wait() == 0


	def GroupSymbolInfosByOffset(symbol_infos):
	"""Create a dict {offset: [symbol_info1, ...], ...}.

	As several symbols can be at the same offset, this is a 1-to-many
	relationship.

	Args:
	symbol_infos: iterable of SymbolInfo instances

	Returns:
	a dict {offset: [symbol_info1, ...], ...}
	"""
	offset_to_symbol_infos = collections.defaultdict(list)
	for symbol_info in symbol_infos:
	offset_to_symbol_infos[symbol_info.offset].append(symbol_info)
	return dict(offset_to_symbol_infos)


	def GroupSymbolInfosByName(symbol_infos):
	"""Create a dict {name: [symbol_info1, ...], ...}.

	A symbol can have several offsets, this is a 1-to-many relationship.

	Args:
	symbol_infos: iterable of SymbolInfo instances

	Returns:
	a dict {name: [symbol_info1, ...], ...}
	"""
	name_to_symbol_infos = collections.defaultdict(list)
	for symbol_info in symbol_infos:
	name_to_symbol_infos[symbol_info.name].append(symbol_info)
	return dict(name_to_symbol_infos)


	def CreateNameToSymbolInfo(symbol_infos):
	"""Create a dict {name: symbol_info, ...}.

	Args:
	symbol_infos: iterable of SymbolInfo instances

	Returns:
	a dict {name: symbol_info, ...}
	If a symbol name corresponds to more than one symbol_info, the symbol_info
	with the lowest offset is chosen.
	"""
	# TODO(lizeb,pasko): move the functionality in this method into
	# check_orderfile.
	symbol_infos_by_name = {}
	warnings = cygprofile_utils.WarningCollector(_MAX_WARNINGS_TO_PRINT)
	for infos in GroupSymbolInfosByName(symbol_infos).itervalues():
	first_symbol_info = min(infos, key=lambda x: x.offset)
	symbol_infos_by_name[first_symbol_info.name] = first_symbol_info
	if len(infos) > 1:
	warnings.Write('Symbol %s appears at %d offsets: %s' %
	(first_symbol_info.name,
	len(infos),
	','.join([hex(x.offset) for x in infos])))
	warnings.WriteEnd('symbols at multiple offsets.')
	return symbol_infos_by_name


	def DemangleSymbol(mangled_symbol):
	"""Return the demangled form of mangled_symbol."""
	cmd = [host_paths.ToolPath('c++filt', _arch)]
	process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
	demangled_symbol, _ = process.communicate(mangled_symbol + '\n')
	return demangled_symbol