| # Copyright 2015 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Utilities to get and manipulate symbols from a binary.""" |
| |
| import collections |
| import logging |
| import os |
| import re |
| import subprocess |
| import sys |
| |
| import cygprofile_utils |
| |
| sys.path.insert( |
| 0, os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, |
| 'build', 'android')) |
| |
| from pylib import constants |
| from pylib.constants import host_paths |
| |
# Limit handed to cygprofile_utils.WarningCollector in
# CreateNameToSymbolInfo() when reporting symbols found at multiple offsets.
_MAX_WARNINGS_TO_PRINT = 200

# One symbol parsed from an objdump line: its name, its offset and size (both
# converted from the hexadecimal fields of the line) and the name of the
# section the symbol belongs to.
SymbolInfo = collections.namedtuple('SymbolInfo', ('name', 'offset', 'size',
                                                   'section'))

# Unfortunate global variable :-/
# Architecture passed to host_paths.ToolPath() when locating objdump and
# c++filt. Starts out as 'arm'; rebind it through SetArchitecture().
_arch = 'arm'
| |
| |
def SetArchitecture(arch):
  """Set the architecture for binaries to be symbolized.

  Rebinds the module-level _arch, which SymbolInfosFromBinary() and
  DemangleSymbol() pass to host_paths.ToolPath() to select the objdump and
  c++filt tools.

  Args:
    arch: the architecture identifier; the module default is 'arm'.
  """
  global _arch
  _arch = arch
| |
| |
# Regular expression to match lines printed by 'objdump -t -w'. An example of
# such line looks like this:
# 018db2de l F .text 00000060 .hidden _ZN8SkBitmapC2ERKS_
#
# The regex intentionally allows matching more than valid inputs. This gives
# more protection against potentially incorrectly silently ignoring unmatched
# input lines. Instead a few assertions early in _FromObjdumpLine() check the
# validity of a few parts matched as groups.
#
# The pattern is compiled with re.VERBOSE, so whitespace and '#' comments
# inside it are ignored; literal spaces are spelled as '[ ]'.
#
# NOTE(review): the 'section' group is '.text' followed by characters that are
# not lowercase hex digits ([^0-9a-f]*). A section name such as '.text.foo'
# therefore stops matching at the 'f', the following \s+ cannot match, and
# _FromObjdumpLine() returns None for the line. Confirm whether rejecting such
# sections is intended, given the in-pattern comment about '.text.foo'.
_OBJDUMP_LINE_RE = re.compile(r'''
  # The offset of the function, as hex.
  (?P<offset>^[0-9a-f]+)

  # The space character.
  [ ]

  # The 7 groups of flag characters, one character each.
  (
    (?P<assert_scope>.) # Global, local, unique local, etc.
    (?P<assert_weak_or_strong>.)
    (?P<assert_4spaces>.{4}) # Constructor, warning, indirect ref,
    # debugger symbol.
    (?P<symbol_type>.) # Function, object, file or normal.
  )

  [ ]

  # The section name should start with ".text", can be ".text.foo". With LLD,
  # and especially LTO the traces of input sections are not preserved. Support
  # ".text.foo" for a little longer time because it is easy.
  (?P<section>.text[^0-9a-f]*)

  (?P<assert_tab> \s+)

  # The size of the symbol, as hex.
  (?P<size>[0-9a-f]+)

  # Normally separated out by 14 spaces, but some bits in ELF may theoretically
  # affect this length.
  (?P<assert_14spaces>[ ]+)

  # Hidden symbols should be treated as usual.
  (.hidden [ ])?

  # The symbol name.
  (?P<name>.*)

  $
  ''', re.VERBOSE)
| |
| |
def _FromObjdumpLine(line):
  """Create a SymbolInfo by parsing a properly formatted objdump output line.

  Validation failures raise AssertionError explicitly instead of relying on
  the assert statement, so the checks stay active when Python runs with -O.

  Args:
    line: line from objdump, without the trailing newline.

  Returns:
    An instance of SymbolInfo if the line represents a symbol, None otherwise
    (the line does not match _OBJDUMP_LINE_RE, or it is a 'normal' symbol
    such as a function-local label).

  Raises:
    AssertionError: the line matches _OBJDUMP_LINE_RE but has unexpected
      flags, separators, symbol type or symbol name. The exception argument
      is the offending line, or the symbol name for a bad name.
  """
  m = _OBJDUMP_LINE_RE.match(line)
  if not m:
    return None

  # The regex is deliberately permissive; the exact field values are checked
  # here so that unexpected input fails loudly instead of being skipped.
  well_formed = (
      m.group('assert_scope') in ('g', 'l') and
      m.group('assert_weak_or_strong') in ('w', ' ') and
      m.group('assert_tab') == '\t' and
      m.group('assert_4spaces') == ' ' * 4 and
      m.group('assert_14spaces') == ' ' * 14)
  if not well_formed:
    raise AssertionError(line)

  name = m.group('name')
  offset = int(m.group('offset'), 16)

  # Output the label that contains the earliest offset. It is needed later for
  # translating offsets from the profile dumps.
  if name == cygprofile_utils.START_OF_TEXT_SYMBOL:
    return SymbolInfo(name=name, offset=offset, section='.text', size=0)

  # Check symbol type for validity and ignore some types.
  # From objdump manual page: The symbol is the name of a function (F) or a file
  # (f) or an object (O) or just a normal symbol (a space). The 'normal' symbols
  # seen so far have been function-local labels.
  symbol_type = m.group('symbol_type')
  if symbol_type == ' ':
    # Ignore local goto labels. Unfortunately, v8 builtins (like 'Builtins_.*')
    # are indistinguishable from labels of size 0 other than by name.
    return None
  # Guard against file symbols, since they are normally not seen in the
  # binaries we parse.
  if symbol_type == 'f':
    raise AssertionError(line)

  # Extract the size from the ELF field. This value sometimes does not reflect
  # the real size of the function. One reason for that is the '.size' directive
  # in the assembler. As a result, a few functions in .S files have the size 0.
  # They are not instrumented (yet), but maintaining their order in the
  # orderfile may be important in some cases.
  size = int(m.group('size'), 16)

  # Forbid ARM mapping symbols and other unexpected symbol names, but allow $
  # characters in a non-initial position, which can appear as a component of a
  # mangled name, e.g. Clang can mangle a lambda function to:
  # 02cd61e0 l F .text 000000c0 _ZZL11get_globalsvENK3$_1clEv
  # The equivalent objdump line from GCC is:
  # 0325c58c l F .text 000000d0 _ZZL11get_globalsvENKUlvE_clEv
  #
  # Also disallow .internal and .protected symbols (as well as other flags),
  # those have not appeared in the binaries we parse. Rejecting these extra
  # prefixes is done by disallowing spaces in symbol names.
  if not re.match('^[a-zA-Z0-9_.][a-zA-Z0-9_.$]*$', name):
    raise AssertionError(name)

  return SymbolInfo(name=name, offset=offset, section=m.group('section'),
                    size=size)
| |
| |
def _SymbolInfosFromStream(objdump_lines):
  """Parses the output of objdump, and get all the symbols from a binary.

  Lines for which _FromObjdumpLine() returns None are skipped. Symbol names
  seen at more than one offset are reported through logging.warning().

  Args:
    objdump_lines: An iterable of text lines; a trailing newline on each line
      is stripped before parsing.

  Returns:
    A list of SymbolInfo, in the order the symbols appear in the input.
  """
  name_to_offsets = collections.defaultdict(list)
  symbol_infos = []
  for line in objdump_lines:
    symbol_info = _FromObjdumpLine(line.rstrip('\n'))
    if symbol_info is None:
      continue
    # On ARM the LLD linker inserts pseudo-functions (thunks) that allow
    # jumping distances farther than 16 MiB. Such thunks are known to often
    # reside on multiple offsets, they are not instrumented and hence they do
    # not reach the orderfiles. Exclude the thunk symbols from the warning.
    if not symbol_info.name.startswith('__ThumbV7PILongThunk_'):
      name_to_offsets[symbol_info.name].append(symbol_info.offset)
    symbol_infos.append(symbol_info)

  # A real list (not filter()/iterkeys()) so that the emptiness test, len()
  # and slicing below behave the same on Python 2 and Python 3.
  repeated_symbols = [name for name in name_to_offsets
                      if len(name_to_offsets[name]) > 1]
  if repeated_symbols:
    # Log the first 5 repeated offsets of the first 10 repeated symbols.
    logging.warning('%d symbols repeated with multiple offsets:\n %s',
                    len(repeated_symbols), '\n '.join(
                        '{} {}'.format(sym, ' '.join(
                            str(offset) for offset in name_to_offsets[sym][:5]))
                        for sym in repeated_symbols[:10]))

  return symbol_infos
| |
| |
def SymbolInfosFromBinary(binary_filename):
  """Runs objdump to get all the symbols from a binary.

  The objdump tool is located with host_paths.ToolPath() for the architecture
  currently stored in _arch and is run as 'objdump -t -w <binary_filename>'.
  A non-zero exit status of objdump is logged as a warning.

  Args:
    binary_filename: path to the binary.

  Returns:
    A list of SymbolInfo from the binary.
  """
  command = (host_paths.ToolPath('objdump', _arch), '-t', '-w', binary_filename)
  # universal_newlines=True makes the pipe yield text lines on Python 3 (it
  # yields bytes otherwise); _SymbolInfosFromStream() strips '\n' from each
  # line and therefore needs text.
  p = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE,
                       universal_newlines=True)
  try:
    symbol_infos = _SymbolInfosFromStream(p.stdout)
  finally:
    # Always release the pipe and reap the child, even if parsing raised.
    p.stdout.close()
    returncode = p.wait()
  # Only reached when parsing succeeded: surface a failing objdump instead of
  # silently returning whatever was parsed from its partial output.
  if returncode:
    logging.warning('%s exited with code %d', command[0], returncode)
  return symbol_infos
| |
| |
def GroupSymbolInfosByOffset(symbol_infos):
  """Bucket symbols by their offset.

  Several symbols may share one offset, so each offset maps to a list that
  keeps the symbols in the order they were encountered.

  Args:
    symbol_infos: iterable of SymbolInfo instances

  Returns:
    a plain dict {offset: [symbol_info1, ...], ...}
  """
  grouped = {}
  for info in symbol_infos:
    # setdefault creates the bucket the first time an offset is seen.
    grouped.setdefault(info.offset, []).append(info)
  return grouped
| |
| |
def GroupSymbolInfosByName(symbol_infos):
  """Bucket symbols by their name.

  One name may be found at several offsets, so each name maps to a list that
  keeps the symbols in the order they were encountered.

  Args:
    symbol_infos: iterable of SymbolInfo instances

  Returns:
    a plain dict {name: [symbol_info1, ...], ...}
  """
  grouped = {}
  for info in symbol_infos:
    # setdefault creates the bucket the first time a name is seen.
    grouped.setdefault(info.name, []).append(info)
  return grouped
| |
| |
def CreateNameToSymbolInfo(symbol_infos):
  """Create a dict {name: symbol_info, ...}.

  Names that occur at more than one offset are reported through a
  cygprofile_utils.WarningCollector limited to _MAX_WARNINGS_TO_PRINT.

  Args:
    symbol_infos: iterable of SymbolInfo instances

  Returns:
    a dict {name: symbol_info, ...}
    If a symbol name corresponds to more than one symbol_info, the symbol_info
    with the lowest offset is chosen.
  """
  # TODO(lizeb,pasko): move the functionality in this method into
  # check_orderfile.
  symbol_infos_by_name = {}
  warnings = cygprofile_utils.WarningCollector(_MAX_WARNINGS_TO_PRINT)
  # .values() rather than .itervalues(): the latter does not exist on
  # Python 3, while .values() works on both Python 2 and Python 3.
  for infos in GroupSymbolInfosByName(symbol_infos).values():
    first_symbol_info = min(infos, key=lambda x: x.offset)
    symbol_infos_by_name[first_symbol_info.name] = first_symbol_info
    if len(infos) > 1:
      warnings.Write('Symbol %s appears at %d offsets: %s' %
                     (first_symbol_info.name,
                      len(infos),
                      ','.join([hex(x.offset) for x in infos])))
  warnings.WriteEnd('symbols at multiple offsets.')
  return symbol_infos_by_name
| |
| |
def DemangleSymbol(mangled_symbol):
  """Return the demangled form of mangled_symbol.

  Pipes the symbol, followed by a newline, through the c++filt tool located
  with host_paths.ToolPath() for the architecture stored in _arch.

  Args:
    mangled_symbol: the mangled symbol name, as a text string.

  Returns:
    Everything c++filt wrote to its standard output, returned unmodified (the
    output is not stripped).
  """
  cmd = [host_paths.ToolPath("c++filt", _arch)]
  # universal_newlines=True opens the pipes in text mode, so communicate()
  # accepts and returns str on Python 3 as well (binary pipes would reject
  # the str input with a TypeError there).
  process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                             universal_newlines=True)
  demangled_symbol, _ = process.communicate(mangled_symbol + '\n')
  return demangled_symbol