# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Main Python API for analyzing binary size."""
import argparse
import calendar
import collections
import datetime
import gzip
import itertools
import logging
import os
import posixpath
import re
import subprocess
import sys
import tempfile
import zipfile
import concurrent
import describe
import file_format
import function_signature
import linker_map_parser
import models
import ninja_parser
import nm
import paths
# Effect of _MAX_SAME_NAME_ALIAS_COUNT (as of Oct 2017, with min_pss = max):
# 1: shared .text symbols = 1772874 bytes, file size = 9.43MiB (645476 symbols).
# 2: shared .text symbols = 1065654 bytes, file size = 9.58MiB (669952 symbols).
# 6: shared .text symbols = 464058 bytes, file size = 10.11MiB (782693 symbols).
# 10: shared .text symbols = 365648 bytes, file size = 10.24MiB (813758 symbols).
# 20: shared .text symbols = 86202 bytes, file size = 10.38MiB (854548 symbols).
# 40: shared .text symbols = 48424 bytes, file size = 10.50MiB (890396 symbols).
# 50: shared .text symbols = 41860 bytes, file size = 10.54MiB (902304 symbols).
# max: shared .text symbols = 0 bytes, file size = 11.10MiB (1235449 symbols).
_MAX_SAME_NAME_ALIAS_COUNT = 40  # ~50KB is basically negligible.
def _OpenMaybeGz(path):
"""Calls `gzip.open()` if |path| ends in ".gz", otherwise calls `open()`."""
if path.endswith('.gz'):
return gzip.open(path, 'rb')
return open(path, 'rb')
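# Usage sketch for _OpenMaybeGz() (hypothetical path): gzipped and plain map
# files are read identically:
#   with _OpenMaybeGz('libchrome.so.map.gz') as map_file:
#     data = map_file.read()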
def _StripLinkerAddedSymbolPrefixes(raw_symbols):
"""Removes prefixes sometimes added to symbol names during link
Removing prefixes make symbol names match up with those found in .o files.
"""
for symbol in raw_symbols:
full_name = symbol.full_name
if full_name.startswith('startup.'):
symbol.flags |= models.FLAG_STARTUP
symbol.full_name = full_name[8:]
elif full_name.startswith('unlikely.'):
symbol.flags |= models.FLAG_UNLIKELY
symbol.full_name = full_name[9:]
elif full_name.startswith('rel.local.'):
symbol.flags |= models.FLAG_REL_LOCAL
symbol.full_name = full_name[10:]
elif full_name.startswith('rel.'):
symbol.flags |= models.FLAG_REL
symbol.full_name = full_name[4:]
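# Illustration for _StripLinkerAddedSymbolPrefixes() (hypothetical name):
# a linker symbol 'unlikely.DoTracing' comes out as full_name='DoTracing'
# with FLAG_UNLIKELY set, matching the name found in the .o file.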
def _UnmangleRemainingSymbols(raw_symbols, tool_prefix):
"""Uses c++filt to unmangle any symbols that need it."""
to_process = [s for s in raw_symbols if s.full_name.startswith('_Z')]
if not to_process:
return
logging.info('Unmangling %d names', len(to_process))
proc = subprocess.Popen([tool_prefix + 'c++filt'], stdin=subprocess.PIPE,
stdout=subprocess.PIPE)
stdout = proc.communicate('\n'.join(s.full_name for s in to_process))[0]
assert proc.returncode == 0
for i, line in enumerate(stdout.splitlines()):
to_process[i].full_name = line
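# Note on _UnmangleRemainingSymbols(): c++filt reads one mangled name per
# line and echoes the demangled form, e.g. '_Z3foov' -> 'foo()'. Only names
# starting with '_Z' are piped through, since anything else is already
# demangled (or is a C symbol).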
def _NormalizeNames(raw_symbols):
"""Ensures that all names are formatted in a useful way.
This includes:
- Deriving |name| and |template_name| from |full_name|.
    - Stripping return types (for functions).
- Moving "vtable for" and the like to be suffixes rather than prefixes.
"""
found_prefixes = set()
for symbol in raw_symbols:
full_name = symbol.full_name
if full_name.startswith('*'):
# See comment in _CalculatePadding() about when this
# can happen.
symbol.template_name = full_name
symbol.name = full_name
continue
# Remove [clone] suffix, and set flag accordingly.
# Search from left-to-right, as multiple [clone]s can exist.
# Example name suffixes:
# [clone .part.322] # GCC
# [clone .isra.322] # GCC
# [clone .constprop.1064] # GCC
# [clone .11064] # clang
# http://unix.stackexchange.com/questions/223013/function-symbol-gets-part-suffix-after-compilation
idx = full_name.find(' [clone ')
if idx != -1:
full_name = full_name[:idx]
symbol.flags |= models.FLAG_CLONE
# Clones for C symbols.
if symbol.section == 't':
idx = full_name.rfind('.')
if idx != -1 and full_name[idx + 1:].isdigit():
new_name = full_name[:idx]
# Generated symbols that end with .123 but are not clones.
# Find these via:
# size_info.symbols.WhereInSection('t').WhereIsGroup().SortedByCount()
if new_name not in ('__tcf_0', 'startup'):
full_name = new_name
symbol.flags |= models.FLAG_CLONE
# Remove .part / .isra / .constprop.
idx = full_name.rfind('.', 0, idx)
if idx != -1:
full_name = full_name[:idx]
# E.g.: vtable for FOO
idx = full_name.find(' for ', 0, 30)
if idx != -1:
found_prefixes.add(full_name[:idx + 4])
full_name = '{} [{}]'.format(full_name[idx + 5:], full_name[:idx])
# E.g.: virtual thunk to FOO
idx = full_name.find(' to ', 0, 30)
if idx != -1:
found_prefixes.add(full_name[:idx + 3])
full_name = '{} [{}]'.format(full_name[idx + 4:], full_name[:idx])
# Strip out return type, and split out name, template_name.
# Function parsing also applies to non-text symbols. E.g. Function statics.
symbol.full_name, symbol.template_name, symbol.name = (
function_signature.Parse(full_name))
# Remove anonymous namespaces (they just harm clustering).
symbol.template_name = symbol.template_name.replace(
'(anonymous namespace)::', '')
symbol.full_name = symbol.full_name.replace(
'(anonymous namespace)::', '')
non_anonymous_name = symbol.name.replace('(anonymous namespace)::', '')
if symbol.name != non_anonymous_name:
symbol.flags |= models.FLAG_ANONYMOUS
symbol.name = non_anonymous_name
# Allow using "is" to compare names (and should help with RAM).
function_signature.InternSameNames(symbol)
logging.debug('Found name prefixes of: %r', found_prefixes)
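# Illustration for _NormalizeNames() (hypothetical symbol): a raw full_name
# of 'int base::Foo<int>(bool) [clone .part.3]' would roughly yield
# full_name='base::Foo<int>(bool)', template_name='base::Foo<int>',
# name='base::Foo', with FLAG_CLONE set and the return type stripped.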
def _NormalizeObjectPath(path):
if path.startswith('obj/'):
# Convert obj/third_party/... -> third_party/...
path = path[4:]
elif path.startswith('../../'):
# Convert ../../third_party/... -> third_party/...
path = path[6:]
if path.endswith(')'):
# Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o
start_idx = path.index('(')
path = os.path.join(path[:start_idx], path[start_idx + 1:-1])
return path
def _NormalizeSourcePath(path):
"""Returns (is_generated, normalized_path)"""
if path.startswith('gen/'):
# Convert gen/third_party/... -> third_party/...
return True, path[4:]
if path.startswith('../../'):
# Convert ../../third_party/... -> third_party/...
return False, path[6:]
return True, path
def _ExtractSourcePathsAndNormalizeObjectPaths(raw_symbols, source_mapper):
"""Fills in the |source_path| attribute and normalizes |object_path|."""
if source_mapper:
logging.info('Looking up source paths from ninja files')
for symbol in raw_symbols:
object_path = symbol.object_path
if object_path:
# We don't have source info for prebuilt .a files.
if not os.path.isabs(object_path) and not object_path.startswith('..'):
source_path = source_mapper.FindSourceForPath(object_path)
if source_path:
symbol.generated_source, symbol.source_path = (
_NormalizeSourcePath(source_path))
symbol.object_path = _NormalizeObjectPath(object_path)
assert source_mapper.unmatched_paths_count == 0, (
'One or more source file paths could not be found. Likely caused by '
'.ninja files being generated at a different time than the .map file.')
else:
logging.info('Normalizing object paths')
for symbol in raw_symbols:
if symbol.object_path:
symbol.object_path = _NormalizeObjectPath(symbol.object_path)
def _ComputeAncestorPath(path_list, symbol_count):
"""Returns the common ancestor of the given paths."""
if not path_list:
return ''
prefix = os.path.commonprefix(path_list)
# Check if all paths were the same.
if prefix == path_list[0]:
return prefix
# Put in buckets to cut down on the number of unique paths.
if symbol_count >= 100:
symbol_count_str = '100+'
elif symbol_count >= 50:
symbol_count_str = '50-99'
elif symbol_count >= 20:
symbol_count_str = '20-49'
elif symbol_count >= 10:
symbol_count_str = '10-19'
else:
symbol_count_str = str(symbol_count)
# Put the path count as a subdirectory so that grouping by path will show
# "{shared}" as a bucket, and the symbol counts as leafs.
if not prefix:
return os.path.join('{shared}', symbol_count_str)
return os.path.join(os.path.dirname(prefix), '{shared}', symbol_count_str)
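# Illustration: _ComputeAncestorPath(['a/b/foo.cc', 'a/c/bar.cc'], 25)
# returns 'a/{shared}/20-49': the common prefix is 'a/', its dirname is 'a',
# and a count of 25 lands in the '20-49' bucket.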
def _CompactLargeAliasesIntoSharedSymbols(raw_symbols):
"""Converts symbols with large number of aliases into single symbols.
The merged symbol's path fields are changed to common-ancestor paths in
the form: common/dir/{shared}/$SYMBOL_COUNT
Assumes aliases differ only by path (not by name).
"""
num_raw_symbols = len(raw_symbols)
num_shared_symbols = 0
src_cursor = 0
dst_cursor = 0
while src_cursor < num_raw_symbols:
symbol = raw_symbols[src_cursor]
raw_symbols[dst_cursor] = symbol
dst_cursor += 1
aliases = symbol.aliases
if aliases and len(aliases) > _MAX_SAME_NAME_ALIAS_COUNT:
symbol.source_path = _ComputeAncestorPath(
[s.source_path for s in aliases if s.source_path], len(aliases))
symbol.object_path = _ComputeAncestorPath(
[s.object_path for s in aliases if s.object_path], len(aliases))
symbol.generated_source = all(s.generated_source for s in aliases)
symbol.aliases = None
num_shared_symbols += 1
src_cursor += len(aliases)
else:
src_cursor += 1
raw_symbols[dst_cursor:] = []
num_removed = src_cursor - dst_cursor
logging.debug('Converted %d aliases into %d shared-path symbols',
num_removed, num_shared_symbols)
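# Illustration for _CompactLargeAliasesIntoSharedSymbols(): a group of 100
# aliases (over _MAX_SAME_NAME_ALIAS_COUNT) keeps only its first symbol,
# with paths rewritten to something like 'third_party/{shared}/100+', and
# the other 99 entries dropped from the list.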
def _ConnectNmAliases(raw_symbols):
"""Ensures |aliases| is set correctly for all symbols."""
prev_sym = raw_symbols[0]
for sym in raw_symbols[1:]:
# Don't merge bss symbols.
if sym.address > 0 and prev_sym.address == sym.address:
# Don't merge padding-only symbols (** symbol gaps).
if prev_sym.size > 0:
# Don't merge if already merged.
if prev_sym.aliases is None or prev_sym.aliases is not sym.aliases:
if prev_sym.aliases:
prev_sym.aliases.append(sym)
else:
prev_sym.aliases = [prev_sym, sym]
sym.aliases = prev_sym.aliases
prev_sym = sym
def _AssignNmAliasPathsAndCreatePathAliases(raw_symbols, object_paths_by_name):
num_found_paths = 0
num_unknown_names = 0
num_path_mismatches = 0
num_aliases_created = 0
ret = []
for symbol in raw_symbols:
ret.append(symbol)
full_name = symbol.full_name
if (symbol.IsBss() or
not full_name or
full_name[0] in '*.' or # e.g. ** merge symbols, .Lswitch.table
full_name == 'startup'):
continue
object_paths = object_paths_by_name.get(full_name)
if object_paths:
num_found_paths += 1
else:
if num_unknown_names < 10:
logging.warning('Symbol not found in any .o files: %r', symbol)
num_unknown_names += 1
continue
if symbol.object_path and symbol.object_path not in object_paths:
if num_path_mismatches < 10:
logging.warning('Symbol path reported by .map not found by nm.')
logging.warning('sym=%r', symbol)
logging.warning('paths=%r', object_paths)
object_paths.append(symbol.object_path)
object_paths.sort()
num_path_mismatches += 1
symbol.object_path = object_paths[0]
if len(object_paths) > 1:
# Create one symbol for each object_path.
aliases = symbol.aliases or [symbol]
symbol.aliases = aliases
num_aliases_created += len(object_paths) - 1
for object_path in object_paths[1:]:
new_sym = models.Symbol(
symbol.section_name, symbol.size, address=symbol.address,
full_name=full_name, object_path=object_path, aliases=aliases)
aliases.append(new_sym)
ret.append(new_sym)
logging.debug('Cross-referenced %d symbols with nm output. '
'num_unknown_names=%d num_path_mismatches=%d '
'num_aliases_created=%d',
num_found_paths, num_unknown_names, num_path_mismatches,
num_aliases_created)
return ret
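# Illustration for _AssignNmAliasPathsAndCreatePathAliases() (hypothetical
# paths): if .o analysis shows 'Foo()' in both obj/a/foo.o and obj/b/foo.o
# but the .map file credited only obj/a/foo.o, a second Symbol for
# obj/b/foo.o is appended, and the two become aliases that share one address
# and size.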
def _DiscoverMissedObjectPaths(raw_symbols, elf_object_paths):
# Missing object paths are caused by .a files added by -l flags, which are not
# listed as explicit inputs within .ninja rules.
parsed_inputs = set(elf_object_paths)
missed_inputs = set()
for symbol in raw_symbols:
path = symbol.object_path
if path.endswith(')'):
# Convert foo/bar.a(baz.o) -> foo/bar.a
path = path[:path.index('(')]
if path and path not in parsed_inputs:
missed_inputs.add(path)
return missed_inputs
def _CreateMergeStringsReplacements(merge_string_syms,
list_of_positions_by_object_path):
"""Creates replacement symbols for |merge_syms|."""
ret = []
STRING_LITERAL_NAME = models.STRING_LITERAL_NAME
assert len(merge_string_syms) == len(list_of_positions_by_object_path)
tups = itertools.izip(merge_string_syms, list_of_positions_by_object_path)
for merge_sym, positions_by_object_path in tups:
merge_sym_address = merge_sym.address
new_symbols = []
ret.append(new_symbols)
for object_path, positions in positions_by_object_path.iteritems():
for offset, size in positions:
address = merge_sym_address + offset
symbol = models.Symbol(
'.rodata', size, address, STRING_LITERAL_NAME,
object_path=object_path)
new_symbols.append(symbol)
logging.debug('Created %d string literal symbols', sum(len(x) for x in ret))
logging.debug('Sorting string literals')
for symbols in ret:
    # In order to achieve a total ordering in the presence of aliases, need to
# include both |address| and |object_path|.
# In order to achieve consistent deduping, need to include |size|.
symbols.sort(key=lambda x: (x.address, -x.size, x.object_path))
logging.debug('Deduping string literals')
num_removed = 0
size_removed = 0
num_aliases = 0
for i, symbols in enumerate(ret):
if not symbols:
continue
prev_symbol = symbols[0]
new_symbols = [prev_symbol]
for symbol in symbols[1:]:
padding = symbol.address - prev_symbol.end_address
if (prev_symbol.address == symbol.address and
prev_symbol.size == symbol.size):
# String is an alias.
num_aliases += 1
aliases = prev_symbol.aliases
if aliases:
aliases.append(symbol)
symbol.aliases = aliases
else:
aliases = [prev_symbol, symbol]
prev_symbol.aliases = aliases
symbol.aliases = aliases
elif padding + symbol.size <= 0:
# String is a substring of prior one.
num_removed += 1
size_removed += symbol.size
continue
elif padding < 0:
# String overlaps previous one. Adjust to not overlap.
symbol.address -= padding
symbol.size += padding
new_symbols.append(symbol)
prev_symbol = symbol
ret[i] = new_symbols
# Aliases come out in random order, so sort to be deterministic.
ret[i].sort(key=lambda s: (s.address, s.object_path))
logging.debug(
'Removed %d overlapping string literals (%d bytes) & created %d aliases',
num_removed, size_removed, num_aliases)
return ret
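# Illustration of the dedupe rules above (hypothetical addresses): after
# sorting, two literals at (address=0x100, size=5) become aliases; one at
# (0x100, size=3) is dropped as a substring of the first; one at
# (0x104, size=4) overlaps by one byte and is trimmed to (0x105, size=3).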
def _CalculatePadding(raw_symbols):
"""Populates the |padding| field based on symbol addresses.
Symbols must already be sorted by |address|.
"""
seen_sections = []
for i, symbol in enumerate(raw_symbols[1:]):
prev_symbol = raw_symbols[i]
if prev_symbol.section_name != symbol.section_name:
assert symbol.section_name not in seen_sections, (
'Input symbols must be sorted by section, then address.')
seen_sections.append(symbol.section_name)
continue
if symbol.address <= 0 or prev_symbol.address <= 0:
continue
if symbol.address == prev_symbol.address:
if symbol.aliases and symbol.aliases is prev_symbol.aliases:
symbol.padding = prev_symbol.padding
symbol.size = prev_symbol.size
continue
# Padding-only symbols happen for ** symbol gaps.
assert prev_symbol.size_without_padding == 0, (
'Found duplicate symbols:\n%r\n%r' % (prev_symbol, symbol))
padding = symbol.address - prev_symbol.end_address
# These thresholds were found by experimenting with arm32 Chrome.
# E.g.: Set them to 0 and see what warnings get logged, then take max value.
# TODO(agrieve): See if these thresholds make sense for architectures
# other than arm32.
if (not symbol.full_name.startswith('*') and
not symbol.IsStringLiteral() and (
symbol.section in 'rd' and padding >= 256 or
symbol.section in 't' and padding >= 64)):
# Should not happen.
logging.warning('Large padding of %d between:\n A) %r\n B) %r' % (
padding, prev_symbol, symbol))
symbol.padding = padding
symbol.size += padding
assert symbol.size >= 0, (
        'Symbol has negative size (likely not sorted properly): '
'%r\nprev symbol: %r' % (symbol, prev_symbol))
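# Illustration for _CalculatePadding(): if prev_symbol ends at 0x108 and
# symbol starts at 0x110, then symbol gets padding=8 and its size grows by
# those 8 bytes, so per-section symbol sizes still sum to the section sizes.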
def _AddNmAliases(raw_symbols, names_by_address):
"""Adds symbols that were removed by identical code folding."""
# Step 1: Create list of (index_of_symbol, name_list).
logging.debug('Creating alias list')
replacements = []
num_new_symbols = 0
for i, s in enumerate(raw_symbols):
# Don't alias padding-only symbols (e.g. ** symbol gap)
if s.size_without_padding == 0:
continue
name_list = names_by_address.get(s.address)
if name_list:
if s.full_name not in name_list:
logging.warning('Name missing from aliases: %s %s', s.full_name,
name_list)
continue
replacements.append((i, name_list))
num_new_symbols += len(name_list) - 1
if float(num_new_symbols) / len(raw_symbols) < .05:
logging.warning('Number of aliases is oddly low (%.0f%%). It should '
'usually be around 25%%. Ensure --tool-prefix is correct. ',
float(num_new_symbols) / len(raw_symbols) * 100)
# Step 2: Create new symbols as siblings to each existing one.
logging.debug('Creating %d new symbols from nm output', num_new_symbols)
src_cursor_end = len(raw_symbols)
raw_symbols += [None] * num_new_symbols
dst_cursor_end = len(raw_symbols)
for src_index, name_list in reversed(replacements):
# Copy over symbols that come after the current one.
chunk_size = src_cursor_end - src_index - 1
dst_cursor_end -= chunk_size
src_cursor_end -= chunk_size
raw_symbols[dst_cursor_end:dst_cursor_end + chunk_size] = (
raw_symbols[src_cursor_end:src_cursor_end + chunk_size])
sym = raw_symbols[src_index]
src_cursor_end -= 1
# Create symbols (does not bother reusing the existing symbol).
for i, full_name in enumerate(name_list):
dst_cursor_end -= 1
# Do not set |aliases| in order to avoid being pruned by
# _CompactLargeAliasesIntoSharedSymbols(), which assumes aliases differ
# only by path. The field will be set afterwards by _ConnectNmAliases().
raw_symbols[dst_cursor_end] = models.Symbol(
sym.section_name, sym.size, address=sym.address, full_name=full_name)
assert dst_cursor_end == src_cursor_end
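# Illustration of the backwards in-place expansion above (hypothetical
# names): with raw_symbols=[A, B, C] and nm reporting names ('B1', 'B2') at
# B's address, the list is extended by one slot and filled from the end,
# producing [A, B2, B1, C]. B1 and B2 are fresh Symbols that copy B's
# address and size; _ConnectNmAliases() later marks them as aliases.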
def LoadAndPostProcessSizeInfo(path):
"""Returns a SizeInfo for the given |path|."""
logging.debug('Loading results from: %s', path)
size_info = file_format.LoadSizeInfo(path)
logging.info('Normalizing symbol names')
_NormalizeNames(size_info.raw_symbols)
logging.info('Calculating padding')
_CalculatePadding(size_info.raw_symbols)
logging.info('Loaded %d symbols', len(size_info.raw_symbols))
return size_info
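# Usage sketch (hypothetical path):
#   size_info = LoadAndPostProcessSizeInfo('chrome.size')
#   logging.info('Have %d symbols', len(size_info.raw_symbols))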
def CreateMetadata(map_path, elf_path, apk_path, tool_prefix, output_directory):
metadata = None
if elf_path:
logging.debug('Constructing metadata')
git_rev = _DetectGitRevision(os.path.dirname(elf_path))
architecture = _ArchFromElf(elf_path, tool_prefix)
build_id = BuildIdFromElf(elf_path, tool_prefix)
timestamp_obj = datetime.datetime.utcfromtimestamp(os.path.getmtime(
elf_path))
timestamp = calendar.timegm(timestamp_obj.timetuple())
relative_tool_prefix = paths.ToSrcRootRelative(tool_prefix)
metadata = {
models.METADATA_GIT_REVISION: git_rev,
models.METADATA_ELF_ARCHITECTURE: architecture,
models.METADATA_ELF_MTIME: timestamp,
models.METADATA_ELF_BUILD_ID: build_id,
models.METADATA_TOOL_PREFIX: relative_tool_prefix,
}
if output_directory:
relative_to_out = lambda path: os.path.relpath(path, output_directory)
gn_args = _ParseGnArgs(os.path.join(output_directory, 'args.gn'))
metadata[models.METADATA_MAP_FILENAME] = relative_to_out(map_path)
metadata[models.METADATA_ELF_FILENAME] = relative_to_out(elf_path)
metadata[models.METADATA_GN_ARGS] = gn_args
if apk_path:
metadata[models.METADATA_APK_FILENAME] = relative_to_out(apk_path)
return metadata
def CreateSizeInfo(map_path, elf_path, tool_prefix, output_directory,
normalize_names=True, track_string_literals=True):
"""Creates a SizeInfo.
Args:
map_path: Path to the linker .map(.gz) file to parse.
elf_path: Path to the corresponding unstripped ELF file. Used to find symbol
aliases and inlined functions. Can be None.
tool_prefix: Prefix for c++filt & nm (required).
output_directory: Build output directory. If None, source_paths and symbol
alias information will not be recorded.
normalize_names: Whether to normalize symbol names.
track_string_literals: Whether to break down "** merge string" sections into
smaller symbols (requires output_directory).
"""
source_mapper = None
if output_directory:
# Start by finding the elf_object_paths, so that nm can run on them while
# the linker .map is being parsed.
logging.info('Parsing ninja files.')
source_mapper, elf_object_paths = ninja_parser.Parse(
output_directory, elf_path)
logging.debug('Parsed %d .ninja files.', source_mapper.parsed_file_count)
assert not elf_path or elf_object_paths, (
'Failed to find link command in ninja files for ' +
os.path.relpath(elf_path, output_directory))
if elf_path:
# Run nm on the elf file to retrieve the list of symbol names per-address.
# This list is required because the .map file contains only a single name
# for each address, yet multiple symbols are often coalesced when they are
# identical. This coalescing happens mainly for small symbols and for C++
# templates. Such symbols make up ~500kb of libchrome.so on Android.
elf_nm_result = nm.CollectAliasesByAddressAsync(elf_path, tool_prefix)
# Run nm on all .o/.a files to retrieve the symbol names within them.
  # The list is used to detect when multiple .o files contain the same symbol
# (e.g. inline functions), and to update the object_path / source_path
# fields accordingly.
  # Looking in object files is required because the .map file chooses a
# single path for these symbols.
# Rather than record all paths for each symbol, set the paths to be the
# common ancestor of all paths.
if output_directory:
bulk_analyzer = nm.BulkObjectFileAnalyzer(tool_prefix, output_directory)
bulk_analyzer.AnalyzePaths(elf_object_paths)
logging.info('Parsing Linker Map')
with _OpenMaybeGz(map_path) as map_file:
section_sizes, raw_symbols = (
linker_map_parser.MapFileParser().Parse(map_file))
if elf_path:
logging.debug('Validating section sizes')
elf_section_sizes = _SectionSizesFromElf(elf_path, tool_prefix)
for k, v in elf_section_sizes.iteritems():
if v != section_sizes.get(k):
logging.error('ELF file and .map file do not agree on section sizes.')
logging.error('.map file: %r', section_sizes)
logging.error('readelf: %r', elf_section_sizes)
sys.exit(1)
if elf_path and output_directory:
missed_object_paths = _DiscoverMissedObjectPaths(
raw_symbols, elf_object_paths)
bulk_analyzer.AnalyzePaths(missed_object_paths)
bulk_analyzer.SortPaths()
if track_string_literals:
merge_string_syms = [
s for s in raw_symbols if s.full_name == '** merge strings']
      # It is more likely that there is a bug in supersize than that an ELF
      # contains no string literals at all.
assert merge_string_syms
string_positions = [(s.address, s.size) for s in merge_string_syms]
bulk_analyzer.AnalyzeStringLiterals(elf_path, string_positions)
logging.info('Stripping linker prefixes from symbol names')
_StripLinkerAddedSymbolPrefixes(raw_symbols)
  # The .map file for some reason does not contain demangled names for all
  # symbols. _UnmangleRemainingSymbols() logs its own progress.
_UnmangleRemainingSymbols(raw_symbols, tool_prefix)
if elf_path:
logging.info(
'Adding symbols removed by identical code folding (as reported by nm)')
# This normally does not block (it's finished by this time).
names_by_address = elf_nm_result.get()
_AddNmAliases(raw_symbols, names_by_address)
if output_directory:
object_paths_by_name = bulk_analyzer.GetSymbolNames()
logging.debug('Fetched path information for %d symbols from %d files',
len(object_paths_by_name),
len(elf_object_paths) + len(missed_object_paths))
# For aliases, this provides path information where there wasn't any.
logging.info('Creating aliases for symbols shared by multiple paths')
raw_symbols = _AssignNmAliasPathsAndCreatePathAliases(
raw_symbols, object_paths_by_name)
if track_string_literals:
logging.info('Waiting for string literal extraction to complete.')
list_of_positions_by_object_path = bulk_analyzer.GetStringPositions()
bulk_analyzer.Close()
if track_string_literals:
logging.info('Deconstructing ** merge strings into literals')
replacements = _CreateMergeStringsReplacements(merge_string_syms,
list_of_positions_by_object_path)
for merge_sym, literal_syms in itertools.izip(
merge_string_syms, replacements):
# Don't replace if no literals were found.
if literal_syms:
# Re-find the symbols since aliases cause their indices to change.
idx = raw_symbols.index(merge_sym)
# This assignment is a bit slow (causes array to be shifted), but
# is fast enough since len(merge_string_syms) < 10.
raw_symbols[idx:idx + 1] = literal_syms
_ExtractSourcePathsAndNormalizeObjectPaths(raw_symbols, source_mapper)
logging.info('Converting excessive aliases into shared-path symbols')
_CompactLargeAliasesIntoSharedSymbols(raw_symbols)
logging.debug('Connecting nm aliases')
_ConnectNmAliases(raw_symbols)
# Padding not really required, but it is useful to check for large padding and
# log a warning.
logging.info('Calculating padding')
_CalculatePadding(raw_symbols)
# Do not call _NormalizeNames() during archive since that method tends to need
# tweaks over time. Calling it only when loading .size files allows for more
  # flexibility.
if normalize_names:
_NormalizeNames(raw_symbols)
logging.info('Processed %d symbols', len(raw_symbols))
size_info = models.SizeInfo(section_sizes, raw_symbols)
if logging.getLogger().isEnabledFor(logging.INFO):
for line in describe.DescribeSizeInfoCoverage(size_info):
logging.info(line)
logging.info('Recorded info for %d symbols', len(size_info.raw_symbols))
return size_info
def _DetectGitRevision(directory):
try:
git_rev = subprocess.check_output(
['git', '-C', directory, 'rev-parse', 'HEAD'])
return git_rev.rstrip()
except Exception:
logging.warning('Failed to detect git revision for file metadata.')
return None
def BuildIdFromElf(elf_path, tool_prefix):
args = [tool_prefix + 'readelf', '-n', elf_path]
stdout = subprocess.check_output(args)
match = re.search(r'Build ID: (\w+)', stdout)
assert match, 'Build ID not found from running: ' + ' '.join(args)
return match.group(1)
def _SectionSizesFromElf(elf_path, tool_prefix):
args = [tool_prefix + 'readelf', '-S', '--wide', elf_path]
stdout = subprocess.check_output(args)
section_sizes = {}
# Matches [ 2] .hash HASH 00000000006681f0 0001f0 003154 04 A 3 0 8
for match in re.finditer(r'\[[\s\d]+\] (\..*)$', stdout, re.MULTILINE):
items = match.group(1).split()
section_sizes[items[0]] = int(items[4], 16)
return section_sizes
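# E.g. for the '.hash' row shown in the comment above, match.group(1) splits
# so that items[4] == '003154', giving
# section_sizes['.hash'] == 0x3154 == 12628 bytes.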
def _ArchFromElf(elf_path, tool_prefix):
args = [tool_prefix + 'readelf', '-h', elf_path]
stdout = subprocess.check_output(args)
  machine = re.search(r'Machine:\s*(.+)', stdout).group(1)
  if machine == 'Intel 80386':
    return 'x86'
  if machine == 'Advanced Micro Devices X86-64':
    return 'x64'
  if machine == 'ARM':
    return 'arm'
  if machine == 'AArch64':
    return 'arm64'
  return machine
def _ParseGnArgs(args_path):
"""Returns a list of normalized "key=value" strings."""
args = {}
with open(args_path) as f:
for l in f:
# Strips #s even if within string literal. Not a problem in practice.
parts = l.split('#')[0].split('=')
if len(parts) != 2:
continue
args[parts[0].strip()] = parts[1].strip()
return ["%s=%s" % x for x in sorted(args.iteritems())]
def _ElfInfoFromApk(apk_path, apk_so_path, tool_prefix):
"""Returns a tuple of (build_id, section_sizes)."""
with zipfile.ZipFile(apk_path) as apk, \
tempfile.NamedTemporaryFile() as f:
f.write(apk.read(apk_so_path))
f.flush()
build_id = BuildIdFromElf(f.name, tool_prefix)
section_sizes = _SectionSizesFromElf(f.name, tool_prefix)
return build_id, section_sizes
def AddArguments(parser):
parser.add_argument('size_file', help='Path to output .size file.')
parser.add_argument('--apk-file',
                      help='.apk file to measure. When set, --elf-file will be '
                           'derived (if unset). Providing the .apk allows '
                           'for the size of packed relocations to be recorded.')
parser.add_argument('--elf-file',
help='Path to input ELF file. Currently used for '
'capturing metadata.')
parser.add_argument('--map-file',
help='Path to input .map(.gz) file. Defaults to '
'{{elf_file}}.map(.gz)?. If given without '
'--elf-file, no size metadata will be recorded.')
parser.add_argument('--no-source-paths', action='store_true',
help='Do not use .ninja files to map '
'object_path -> source_path')
parser.add_argument('--tool-prefix',
help='Path prefix for c++filt, nm, readelf.')
parser.add_argument('--output-directory',
help='Path to the root build directory.')
parser.add_argument('--no-string-literals', dest='track_string_literals',
default=True, action='store_false',
help='Disable breaking down "** merge strings" into more '
'granular symbols.')
def Run(args, parser):
if not args.size_file.endswith('.size'):
parser.error('size_file must end with .size')
elf_path = args.elf_file
map_path = args.map_file
apk_path = args.apk_file
any_input = apk_path or elf_path or map_path
if not any_input:
    parser.error('Must pass at least one of --apk-file, --elf-file, --map-file')
lazy_paths = paths.LazyPaths(tool_prefix=args.tool_prefix,
output_directory=args.output_directory,
any_path_within_output_directory=any_input)
if apk_path:
with zipfile.ZipFile(apk_path) as z:
lib_infos = [f for f in z.infolist()
if f.filename.endswith('.so') and f.file_size > 0]
assert lib_infos, 'APK has no .so files.'
# TODO(agrieve): Add support for multiple .so files, and take into account
# secondary architectures.
    apk_so_path = max(lib_infos, key=lambda x: x.file_size).filename
logging.debug('Sub-apk path=%s', apk_so_path)
if not elf_path and lazy_paths.output_directory:
elf_path = os.path.join(
lazy_paths.output_directory, 'lib.unstripped',
os.path.basename(apk_so_path.replace('crazy.', '')))
logging.debug('Detected --elf-file=%s', elf_path)
if map_path:
if not map_path.endswith('.map') and not map_path.endswith('.map.gz'):
parser.error('Expected --map-file to end with .map or .map.gz')
else:
map_path = elf_path + '.map'
if not os.path.exists(map_path):
map_path += '.gz'
if not os.path.exists(map_path):
parser.error('Could not find .map(.gz)? file. Ensure you have built with '
                   'is_official_build=true, or use --map-file to point to a '
'linker map file.')
tool_prefix = lazy_paths.VerifyToolPrefix()
output_directory = None
if not args.no_source_paths:
output_directory = lazy_paths.VerifyOutputDirectory()
metadata = CreateMetadata(map_path, elf_path, apk_path, tool_prefix,
output_directory)
if apk_path and elf_path:
# Extraction takes around 1 second, so do it in parallel.
apk_elf_result = concurrent.ForkAndCall(
_ElfInfoFromApk, (apk_path, apk_so_path, tool_prefix))
size_info = CreateSizeInfo(map_path, elf_path, tool_prefix, output_directory,
normalize_names=False,
track_string_literals=args.track_string_literals)
if metadata:
size_info.metadata = metadata
if apk_path:
logging.debug('Extracting section sizes from .so within .apk')
unstripped_section_sizes = size_info.section_sizes
apk_build_id, size_info.section_sizes = apk_elf_result.get()
assert apk_build_id == metadata[models.METADATA_ELF_BUILD_ID], (
'BuildID for %s within %s did not match the one at %s' %
(apk_so_path, apk_path, elf_path))
packed_section_name = None
architecture = metadata[models.METADATA_ELF_ARCHITECTURE]
    # Relocation packing is enabled only for arm32 & arm64.
if architecture == 'arm':
packed_section_name = '.rel.dyn'
elif architecture == 'arm64':
packed_section_name = '.rela.dyn'
if packed_section_name:
logging.debug('Recording size of unpacked relocations')
if packed_section_name not in size_info.section_sizes:
logging.warning('Packed section not present: %s', packed_section_name)
else:
size_info.section_sizes['%s (unpacked)' % packed_section_name] = (
unstripped_section_sizes.get(packed_section_name))
logging.info('Recording metadata: \n %s',
'\n '.join(describe.DescribeMetadata(size_info.metadata)))
logging.info('Saving result to %s', args.size_file)
file_format.SaveSizeInfo(size_info, args.size_file)
size_in_mb = os.path.getsize(args.size_file) / 1024.0 / 1024.0
logging.info('Done. File size is %.2fMiB.', size_in_mb)