# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Main Python API for analyzing binary size."""
import argparse
import bisect
import calendar
import collections
import copy
import datetime
import functools
import gzip
import itertools
import logging
import os
import posixpath
import re
import shlex
import string
import subprocess
import sys
import tempfile
import time
import zipfile
import zlib
import apkanalyzer
import ar
import data_quality
import demangle
import describe
import file_format
import function_signature
import linker_map_parser
import models
import ninja_parser
import nm
import obj_analyzer
import parallel
import path_util
import readelf
import string_extract
import zip_util
sys.path.insert(1, os.path.join(path_util.TOOLS_SRC_ROOT, 'tools', 'grit'))
from grit.format import data_pack
_METADATA_FILENAME = 'DIR_METADATA'
_METADATA_COMPONENT_REGEX = re.compile(r'^\s*component:\s*"(.*?)"',
re.MULTILINE)
_OWNERS_FILENAME = 'OWNERS'
_OWNERS_COMPONENT_REGEX = re.compile(r'^\s*#\s*COMPONENT:\s*(\S+)',
re.MULTILINE)
_OWNERS_FILE_PATH_REGEX = re.compile(r'^\s*file://(\S+)', re.MULTILINE)
# Paths that are missing metadata, and where it's hard to add (e.g. code in
# other repositories).
_COMPONENT_DEFAULTS = {
os.path.join('third_party', 'webrtc'): 'Blink>WebRTC',
os.path.join('logging', 'rtc_event_log'): 'Blink>WebRTC',
os.path.join('modules', 'audio_codec'): 'Blink>WebRTC',
os.path.join('modules', 'audio_processing'): 'Blink>WebRTC',
}
_UNCOMPRESSED_COMPRESSION_RATIO_THRESHOLD = 0.9
# Holds computation state that is live only when an output directory exists.
_OutputDirectoryContext = collections.namedtuple('_OutputDirectoryContext', [
'elf_object_paths', # Only when elf_path is also provided.
'known_inputs', # Only when elf_path is also provided.
'output_directory',
'thin_archives',
])
# When ensuring matching section sizes between .elf and .map files, these
# sections should be ignored. When lld creates a combined library with
# partitions, some sections (like .text) exist in each partition, but the ones
# below are common. At library splitting time, llvm-objcopy pulls what's needed
# from these sections into the new libraries. Hence, the ELF sections will end
# up smaller than the combined .map file sections.
_SECTION_SIZE_BLOCKLIST = ['.symtab', '.shstrtab', '.strtab']
# Tunable constant "knobs" for CreateContainerAndSymbols().
class SectionSizeKnobs:
def __init__(self):
    # A limit on the number of symbols an address can have before these
    # symbols are compacted into shared symbols. Increasing this value causes
    # more data to be stored in .size files, but is also more expensive.
# Effect of max_same_name_alias_count (as of Oct 2017, with min_pss = max):
# 1: shared .text syms = 1772874 bytes, file size = 9.43MiB (645476 syms).
# 2: shared .text syms = 1065654 bytes, file size = 9.58MiB (669952 syms).
# 6: shared .text syms = 464058 bytes, file size = 10.11MiB (782693 syms).
# 10: shared .text syms = 365648 bytes, file size = 10.24MiB (813758 syms).
# 20: shared .text syms = 86202 bytes, file size = 10.38MiB (854548 syms).
# 40: shared .text syms = 48424 bytes, file size = 10.50MiB (890396 syms).
# 50: shared .text syms = 41860 bytes, file size = 10.54MiB (902304 syms).
# max: shared .text syms = 0 bytes, file size = 11.10MiB (1235449 syms).
    self.max_same_name_alias_count = 40  # 50kb is basically negligible.
# File name: Source file.
self.apk_other_files = {
'assets/icudtl.dat':
'../../third_party/icu/android/icudtl.dat',
'assets/snapshot_blob_32.bin':
'../../v8/snapshot_blob_32.bin',
'assets/snapshot_blob_64.bin':
'../../v8/snapshot_blob_64.bin',
'assets/unwind_cfi_32':
'../../base/trace_event/cfi_backtrace_android.cc',
'assets/webapk_dex_version.txt':
'../../chrome/android/webapk/libs/runtime_library_version.gni',
'lib/armeabi-v7a/libarcore_sdk_c_minimal.so':
'../../third_party/arcore-android-sdk/BUILD.gn',
'lib/armeabi-v7a/libarcore_sdk_c.so':
'../../third_party/arcore-android-sdk/BUILD.gn',
'lib/armeabi-v7a/libcrashpad_handler_trampoline.so':
'../../third_party/crashpad/BUILD.gn',
'lib/armeabi-v7a/libyoga.so':
'../../chrome/android/feed/BUILD.gn',
'lib/armeabi-v7a/libelements.so':
'../../chrome/android/feed/BUILD.gn',
'lib/arm64-v8a/libarcore_sdk_c_minimal.so':
'../../third_party/arcore-android-sdk/BUILD.gn',
'lib/arm64-v8a/libarcore_sdk_c.so':
'../../third_party/arcore-android-sdk/BUILD.gn',
'lib/arm64-v8a/libcrashpad_handler_trampoline.so':
'../../third_party/crashpad/BUILD.gn',
'lib/arm64-v8a/libyoga.so':
'../../chrome/android/feed/BUILD.gn',
'lib/arm64-v8a/libelements.so':
'../../chrome/android/feed/BUILD.gn',
}
# Parameters and states for archiving a container.
class ContainerArchiveOptions:
def __init__(self, top_args, sub_args):
    # An estimate of pak translation compression ratio to make comparisons
    # between .size files reasonable. Otherwise this can differ with every
    # pak change.
self.pak_compression_ratio = 0.38 if sub_args.minimal_apks_file else 0.33
# Whether to count number of relative relocations instead of binary size.
self.relocations_mode = top_args.relocations
self.analyze_java = not (sub_args.native_only or sub_args.no_java
or top_args.native_only or top_args.no_java
or self.relocations_mode)
# This may be further disabled downstream, e.g., for the case where an APK
# is specified, but it contains no .so files.
self.analyze_native = not (sub_args.java_only or sub_args.no_native
or top_args.java_only or top_args.no_native)
self.track_string_literals = sub_args.track_string_literals
def _OpenMaybeGzAsText(path):
"""Calls `gzip.open()` if |path| ends in ".gz", otherwise calls `open()`."""
if path.endswith('.gz'):
return gzip.open(path, 'rt')
return open(path, 'rt')
def _NormalizeNames(raw_symbols):
"""Ensures that all names are formatted in a useful way.
This includes:
- Deriving |name| and |template_name| from |full_name|.
- Stripping of return types (for functions).
- Moving "vtable for" and the like to be suffixes rather than prefixes.
"""
found_prefixes = set()
for symbol in raw_symbols:
full_name = symbol.full_name
# See comment in _CalculatePadding() about when this can happen. Don't
# process names for non-native sections.
if symbol.IsPak():
# full_name: "about_ui_resources.grdp: IDR_ABOUT_UI_CREDITS_HTML".
space_idx = full_name.rindex(' ')
name = full_name[space_idx + 1:]
symbol.template_name = name
symbol.name = name
elif (full_name.startswith('**') or symbol.IsOverhead()
or symbol.IsOther()):
symbol.template_name = full_name
symbol.name = full_name
elif symbol.IsDex():
symbol.full_name, symbol.template_name, symbol.name = (
function_signature.ParseJava(full_name))
elif symbol.IsStringLiteral():
symbol.full_name = full_name
symbol.template_name = full_name
symbol.name = full_name
elif symbol.IsNative():
# Remove [clone] suffix, and set flag accordingly.
# Search from left-to-right, as multiple [clone]s can exist.
# Example name suffixes:
# [clone .part.322] # GCC
# [clone .isra.322] # GCC
# [clone .constprop.1064] # GCC
# [clone .11064] # clang
# http://unix.stackexchange.com/questions/223013/function-symbol-gets-part-suffix-after-compilation
idx = full_name.find(' [clone ')
if idx != -1:
full_name = full_name[:idx]
symbol.flags |= models.FLAG_CLONE
# Clones for C symbols.
if symbol.section == 't':
idx = full_name.rfind('.')
if idx != -1 and full_name[idx + 1:].isdigit():
new_name = full_name[:idx]
# Generated symbols that end with .123 but are not clones.
# Find these via:
# size_info.symbols.WhereInSection('t').WhereIsGroup().SortedByCount()
if new_name not in ('__tcf_0', 'startup'):
full_name = new_name
symbol.flags |= models.FLAG_CLONE
# Remove .part / .isra / .constprop.
idx = full_name.rfind('.', 0, idx)
if idx != -1:
full_name = full_name[:idx]
# E.g.: vtable for FOO
idx = full_name.find(' for ', 0, 30)
if idx != -1:
found_prefixes.add(full_name[:idx + 4])
full_name = '{} [{}]'.format(full_name[idx + 5:], full_name[:idx])
# E.g.: virtual thunk to FOO
idx = full_name.find(' to ', 0, 30)
if idx != -1:
found_prefixes.add(full_name[:idx + 3])
full_name = '{} [{}]'.format(full_name[idx + 4:], full_name[:idx])
# Strip out return type, and split out name, template_name.
# Function parsing also applies to non-text symbols.
# E.g. Function statics.
symbol.full_name, symbol.template_name, symbol.name = (
function_signature.Parse(full_name))
# Remove anonymous namespaces (they just harm clustering).
symbol.template_name = symbol.template_name.replace(
'(anonymous namespace)::', '')
symbol.full_name = symbol.full_name.replace(
'(anonymous namespace)::', '')
non_anonymous_name = symbol.name.replace('(anonymous namespace)::', '')
if symbol.name != non_anonymous_name:
symbol.flags |= models.FLAG_ANONYMOUS
symbol.name = non_anonymous_name
# Allow using "is" to compare names (and should help with RAM). This applies
# to all symbols.
function_signature.InternSameNames(symbol)
logging.debug('Found name prefixes of: %r', found_prefixes)
def _NormalizeObjectPath(path):
"""Normalizes object paths.
Prefixes are removed: obj/, ../../
Archive names made more pathy: foo/bar.a(baz.o) -> foo/bar.a/baz.o
"""
if path.startswith('obj/'):
# Convert obj/third_party/... -> third_party/...
path = path[4:]
elif path.startswith('../../'):
# Convert ../../third_party/... -> third_party/...
path = path[6:]
if path.endswith(')'):
# Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o so that hierarchical
# breakdowns consider the .o part to be a separate node.
start_idx = path.rindex('(')
path = os.path.join(path[:start_idx], path[start_idx + 1:-1])
return path
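# Illustrative examples of the normalizations above (hypothetical paths):
#   _NormalizeObjectPath('obj/third_party/foo/bar.a(baz.o)')
#       -> 'third_party/foo/bar.a/baz.o'
#   _NormalizeObjectPath('../../third_party/foo/qux.o')
#       -> 'third_party/foo/qux.o'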
def _NormalizeSourcePath(path):
"""Returns (is_generated, normalized_path)"""
if path.startswith('gen/'):
# Convert gen/third_party/... -> third_party/...
return True, path[4:]
if path.startswith('../../'):
# Convert ../../third_party/... -> third_party/...
return False, path[6:]
return True, path
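# Illustrative examples (hypothetical paths):
#   _NormalizeSourcePath('gen/foo/bar.cc') -> (True, 'foo/bar.cc')
#   _NormalizeSourcePath('../../foo/bar.cc') -> (False, 'foo/bar.cc')
#   _NormalizeSourcePath('foo/bar.cc') -> (True, 'foo/bar.cc')  # In out dir.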
def _ExtractSourcePathsAndNormalizeObjectPaths(raw_symbols, source_mapper):
"""Fills in the |source_path| attribute and normalizes |object_path|."""
logging.info('Normalizing dex symbol paths')
dex_and_other = models.DEX_SECTIONS + (models.SECTION_OTHER, )
for symbol in raw_symbols:
if symbol.source_path and symbol.section_name in dex_and_other:
symbol.generated_source, symbol.source_path = _NormalizeSourcePath(
symbol.source_path)
if source_mapper:
logging.info('Looking up source paths from ninja files')
for symbol in raw_symbols:
if symbol.IsDex() or symbol.IsOther():
continue
# Native symbols and pak symbols use object paths.
object_path = symbol.object_path
if object_path:
# We don't have source info for prebuilt .a files.
if not os.path.isabs(object_path) and not object_path.startswith('..'):
source_path = source_mapper.FindSourceForPath(object_path)
if source_path:
symbol.generated_source, symbol.source_path = (
_NormalizeSourcePath(source_path))
symbol.object_path = _NormalizeObjectPath(object_path)
assert source_mapper.unmatched_paths_count == 0, (
'One or more source file paths could not be found. Likely caused by '
'.ninja files being generated at a different time than the .map file.')
else:
logging.info('Normalizing object paths')
for symbol in raw_symbols:
if symbol.object_path:
symbol.object_path = _NormalizeObjectPath(symbol.object_path)
def _ComputeAncestorPath(path_list, symbol_count):
"""Returns the common ancestor of the given paths."""
if not path_list:
return ''
prefix = os.path.commonprefix(path_list)
# Check if all paths were the same.
if prefix == path_list[0]:
return prefix
# Put in buckets to cut down on the number of unique paths.
if symbol_count >= 100:
symbol_count_str = '100+'
elif symbol_count >= 50:
symbol_count_str = '50-99'
elif symbol_count >= 20:
symbol_count_str = '20-49'
elif symbol_count >= 10:
symbol_count_str = '10-19'
else:
symbol_count_str = str(symbol_count)
  # Put the symbol count as a subdirectory so that grouping by path will show
  # "{shared}" as a bucket, and the symbol counts as leaves.
if not prefix:
return os.path.join('{shared}', symbol_count_str)
return os.path.join(os.path.dirname(prefix), '{shared}', symbol_count_str)
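# A sketch of the bucketing (hypothetical paths): two alias paths under base/
# shared by 25 symbols land in the '20-49' bucket:
#   _ComputeAncestorPath(['base/foo/a.cc', 'base/bar/b.cc'], 25)
#       -> 'base/{shared}/20-49'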
def _CompactLargeAliasesIntoSharedSymbols(raw_symbols, knobs):
"""Converts symbols with large number of aliases into single symbols.
The merged symbol's path fields are changed to common-ancestor paths in
the form: common/dir/{shared}/$SYMBOL_COUNT
Assumes aliases differ only by path (not by name).
"""
num_raw_symbols = len(raw_symbols)
num_shared_symbols = 0
src_cursor = 0
dst_cursor = 0
while src_cursor < num_raw_symbols:
symbol = raw_symbols[src_cursor]
raw_symbols[dst_cursor] = symbol
dst_cursor += 1
aliases = symbol.aliases
if aliases and len(aliases) > knobs.max_same_name_alias_count:
symbol.source_path = _ComputeAncestorPath(
[s.source_path for s in aliases if s.source_path], len(aliases))
symbol.object_path = _ComputeAncestorPath(
[s.object_path for s in aliases if s.object_path], len(aliases))
symbol.generated_source = all(s.generated_source for s in aliases)
symbol.aliases = None
num_shared_symbols += 1
src_cursor += len(aliases)
else:
src_cursor += 1
raw_symbols[dst_cursor:] = []
num_removed = src_cursor - dst_cursor
logging.debug('Converted %d aliases into %d shared-path symbols',
num_removed, num_shared_symbols)
def _ConnectNmAliases(raw_symbols):
"""Ensures |aliases| is set correctly for all symbols."""
prev_sym = raw_symbols[0]
for sym in raw_symbols[1:]:
# Don't merge bss symbols.
if sym.address > 0 and prev_sym.address == sym.address:
# Don't merge padding-only symbols (** symbol gaps).
if prev_sym.size > 0:
# Don't merge if already merged.
if prev_sym.aliases is None or prev_sym.aliases is not sym.aliases:
if prev_sym.aliases:
prev_sym.aliases.append(sym)
else:
prev_sym.aliases = [prev_sym, sym]
sym.aliases = prev_sym.aliases
prev_sym = sym
def _AssignNmAliasPathsAndCreatePathAliases(raw_symbols, object_paths_by_name):
num_found_paths = 0
num_unknown_names = 0
num_path_mismatches = 0
num_aliases_created = 0
ret = []
for symbol in raw_symbols:
ret.append(symbol)
full_name = symbol.full_name
    # '__typeid_' symbols appear in the linker .map only, not in nm output.
if full_name.startswith('__typeid_'):
if object_paths_by_name.get(full_name):
logging.warning('Found unexpected __typeid_ symbol in nm output: %s',
full_name)
continue
# Don't skip if symbol.IsBss(). This is needed for LLD-LTO to work, since
# .bss object_path data are unavailable for linker_map_parser, and need to
# be extracted here. For regular LLD flow, incorrect aliased symbols can
# arise. But that's a lesser evil compared to having LLD-LTO .bss missing
# object_path and source_path.
# TODO(huangs): Fix aliased symbols for the LLD case.
if (symbol.IsStringLiteral() or
not full_name or
full_name[0] in '*.' or # e.g. ** merge symbols, .Lswitch.table
full_name == 'startup'):
continue
object_paths = object_paths_by_name.get(full_name)
if object_paths:
num_found_paths += 1
else:
# Happens a lot with code that has LTO enabled (linker creates symbols).
num_unknown_names += 1
continue
if symbol.object_path and symbol.object_path not in object_paths:
if num_path_mismatches < 10:
logging.warning('Symbol path reported by .map not found by nm.')
logging.warning('sym=%r', symbol)
logging.warning('paths=%r', object_paths)
object_paths.append(symbol.object_path)
object_paths.sort()
num_path_mismatches += 1
symbol.object_path = object_paths[0]
if len(object_paths) > 1:
# Create one symbol for each object_path.
aliases = symbol.aliases or [symbol]
symbol.aliases = aliases
num_aliases_created += len(object_paths) - 1
for object_path in object_paths[1:]:
new_sym = models.Symbol(
symbol.section_name, symbol.size, address=symbol.address,
full_name=full_name, object_path=object_path, aliases=aliases)
aliases.append(new_sym)
ret.append(new_sym)
logging.debug('Cross-referenced %d symbols with nm output. '
'num_unknown_names=%d num_path_mismatches=%d '
'num_aliases_created=%d',
num_found_paths, num_unknown_names, num_path_mismatches,
num_aliases_created)
# Currently: num_unknown_names=1246 out of 591206 (0.2%).
if num_unknown_names > len(raw_symbols) * 0.01:
logging.warning('Abnormal number of symbols not found in .o files (%d)',
num_unknown_names)
return ret
def _DiscoverMissedObjectPaths(raw_symbols, known_inputs):
# Missing object paths are caused by .a files added by -l flags, which are not
# listed as explicit inputs within .ninja rules.
missed_inputs = set()
for symbol in raw_symbols:
path = symbol.object_path
if path.endswith(')'):
# Convert foo/bar.a(baz.o) -> foo/bar.a
path = path[:path.rindex('(')]
if path and path not in known_inputs:
missed_inputs.add(path)
return missed_inputs
def _CreateMergeStringsReplacements(merge_string_syms,
list_of_positions_by_object_path):
"""Creates replacement symbols for |merge_syms|."""
ret = []
STRING_LITERAL_NAME = models.STRING_LITERAL_NAME
assert len(merge_string_syms) == len(list_of_positions_by_object_path)
tups = zip(merge_string_syms, list_of_positions_by_object_path)
for merge_sym, positions_by_object_path in tups:
merge_sym_address = merge_sym.address
new_symbols = []
ret.append(new_symbols)
for object_path, positions in positions_by_object_path.items():
for offset, size in positions:
address = merge_sym_address + offset
symbol = models.Symbol(
models.SECTION_RODATA,
size,
address=address,
full_name=STRING_LITERAL_NAME,
object_path=object_path)
new_symbols.append(symbol)
logging.debug('Created %d string literal symbols', sum(len(x) for x in ret))
logging.debug('Sorting string literals')
for symbols in ret:
# For de-duping & alias creation, order by address & size.
# For alias symbol ordering, sort by object_path.
symbols.sort(key=lambda x: (x.address, -x.size, x.object_path))
logging.debug('Deduping string literals')
num_removed = 0
size_removed = 0
num_aliases = 0
for i, symbols in enumerate(ret):
if not symbols:
continue
prev_symbol = symbols[0]
new_symbols = [prev_symbol]
for symbol in symbols[1:]:
padding = symbol.address - prev_symbol.end_address
if (prev_symbol.address == symbol.address and
prev_symbol.size == symbol.size):
# String is an alias.
num_aliases += 1
aliases = prev_symbol.aliases
if aliases:
aliases.append(symbol)
symbol.aliases = aliases
else:
aliases = [prev_symbol, symbol]
prev_symbol.aliases = aliases
symbol.aliases = aliases
elif padding + symbol.size <= 0:
# String is a substring of prior one.
num_removed += 1
size_removed += symbol.size
continue
elif padding < 0:
# String overlaps previous one. Adjust to not overlap.
symbol.address -= padding
symbol.size += padding
new_symbols.append(symbol)
prev_symbol = symbol
ret[i] = new_symbols
logging.debug(
'Removed %d overlapping string literals (%d bytes) & created %d aliases',
num_removed, size_removed, num_aliases)
return ret
def _ParseComponentFromMetadata(path):
"""Extracts Component from DIR_METADATA."""
try:
with open(path) as f:
data = f.read()
m = _METADATA_COMPONENT_REGEX.search(data)
if m:
return m.group(1)
except IOError:
# Need to catch both FileNotFoundError and NotADirectoryError since
# source_paths for .aar files look like: /path/to/lib.aar/path/within/zip
pass
return ''
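# For reference, a DIR_METADATA file matched by _METADATA_COMPONENT_REGEX
# contains a line like (illustrative value):
#   component: "UI>Browser>Mobile"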
def _ParseComponentFromOwners(path):
"""Extracts COMPONENT and file:// from an OWNERS file.
Args:
path: Path to the file to parse.
Returns:
(component, None) if COMPONENT: line was found.
('', path) if a single file:// was found.
('', None) if neither was found.
"""
try:
with open(path) as f:
data = f.read()
m = _OWNERS_COMPONENT_REGEX.search(data)
if m:
return m.group(1), None
aliases = _OWNERS_FILE_PATH_REGEX.findall(data)
if len(aliases) == 1:
return '', aliases[0]
except IOError:
# Need to catch both FileNotFoundError and NotADirectoryError since
# source_paths for .aar files look like: /path/to/lib.aar/path/within/zip
pass
return '', None
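# For reference, an OWNERS file matched above contains either a component line
# (illustrative value):
#   # COMPONENT: UI>Browser
# or a redirect to another OWNERS file:
#   file://other/dir/OWNERS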
def _FindComponentRoot(path, cache, source_directory):
"""Searches all parent directories for COMPONENT in OWNERS files.
Args:
path: Path of directory to start searching from. Must be relative to
|source_directory|.
cache: Dict of OWNERS paths. Used instead of filesystem if paths are present
in the dict.
source_directory: Directory to use as the root.
Returns:
COMPONENT belonging to |path|, or empty string if not found.
"""
assert not os.path.isabs(path)
component = cache.get(path)
if component is not None:
return component
metadata_path = os.path.join(source_directory, path, _METADATA_FILENAME)
component = _ParseComponentFromMetadata(metadata_path)
if not component:
owners_path = os.path.join(source_directory, path, _OWNERS_FILENAME)
component, path_alias = _ParseComponentFromOwners(owners_path)
if not component:
# Store in cache before recursing to prevent cycles.
cache[path] = ''
if path_alias:
alias_dir = os.path.dirname(path_alias)
component = _FindComponentRoot(alias_dir, cache, source_directory)
if not component:
parent_path = os.path.dirname(path)
if parent_path:
component = _FindComponentRoot(parent_path, cache, source_directory)
cache[path] = component
return component
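# Illustrative lookup for path='base/foo' (hypothetical directory):
#   1. Check base/foo/DIR_METADATA, then base/foo/OWNERS (COMPONENT: line or
#      file:// alias).
#   2. On a file:// alias, recurse into the aliased file's directory.
#   3. Otherwise recurse into base/, and so on up to the source root,
#      caching results (and caching '' first to guard against cycles).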
def _PopulateComponents(raw_symbols, source_directory):
"""Populates the |component| field based on |source_path|.
Symbols without a |source_path| are skipped.
Args:
raw_symbols: list of Symbol objects.
source_directory: Directory to use as the root.
"""
seen_paths = _COMPONENT_DEFAULTS.copy()
for symbol in raw_symbols:
if symbol.source_path:
folder_path = os.path.dirname(symbol.source_path)
symbol.component = _FindComponentRoot(folder_path, seen_paths,
source_directory)
def _UpdateSymbolNamesFromNm(raw_symbols, names_by_address):
"""Updates raw_symbols names with extra information from nm."""
logging.debug('Update symbol names')
# linker_map_parser extracts '** outlined function' without knowing how many
# such symbols exist at each address. nm has this information, and stores the
# value as, e.g., '** outlined function * 5'. Copy the information over.
for s in raw_symbols:
if s.full_name.startswith('** outlined function'):
name_list = names_by_address.get(s.address)
if name_list:
for name in name_list:
if name.startswith('** outlined function'):
s.full_name = name
break
def _AddNmAliases(raw_symbols, names_by_address):
"""Adds symbols that were removed by identical code folding."""
# Step 1: Create list of (index_of_symbol, name_list).
logging.debug('Creating alias list')
replacements = []
num_new_symbols = 0
num_missing = 0
missing_names = collections.defaultdict(list)
for i, s in enumerate(raw_symbols):
# Don't alias padding-only symbols (e.g. ** symbol gap)
if s.size_without_padding == 0:
continue
# Also skip artificial symbols that won't appear in nm output.
if s.full_name.startswith('** CFI jump table'):
continue
name_list = names_by_address.get(s.address)
if name_list:
if s.full_name not in name_list:
num_missing += 1
missing_names[s.full_name].append(s.address)
# Sometimes happens for symbols from assembly files.
if num_missing < 10:
logging.debug('Name missing from aliases: %s %s (addr=%x)',
s.full_name, name_list, s.address)
continue
replacements.append((i, name_list))
num_new_symbols += len(name_list) - 1
if missing_names and logging.getLogger().isEnabledFor(logging.INFO):
for address, names in names_by_address.items():
for name in names:
if name in missing_names:
logging.info('Missing name %s is at address %x instead of [%s]' %
(name, address, ','.join('%x' % a for a in missing_names[name])))
if float(num_new_symbols) / len(raw_symbols) < .05:
logging.warning('Number of aliases is oddly low (%.0f%%). It should '
'usually be around 25%%. Ensure --tool-prefix is correct. ',
float(num_new_symbols) / len(raw_symbols) * 100)
# Step 2: Create new symbols as siblings to each existing one.
logging.debug('Creating %d new symbols from nm output', num_new_symbols)
expected_num_symbols = len(raw_symbols) + num_new_symbols
ret = []
prev_src = 0
for cur_src, name_list in replacements:
ret += raw_symbols[prev_src:cur_src]
prev_src = cur_src + 1
sym = raw_symbols[cur_src]
# Create symbols (|sym| gets recreated and discarded).
new_syms = []
for full_name in name_list:
# Do not set |aliases| in order to avoid being pruned by
# _CompactLargeAliasesIntoSharedSymbols(), which assumes aliases differ
# only by path. The field will be set afterwards by _ConnectNmAliases().
new_syms.append(models.Symbol(
sym.section_name, sym.size, address=sym.address, full_name=full_name))
ret += new_syms
ret += raw_symbols[prev_src:]
assert expected_num_symbols == len(ret)
return ret
def LoadAndPostProcessSizeInfo(path, file_obj=None):
"""Returns a SizeInfo for the given |path|."""
logging.debug('Loading results from: %s', path)
size_info = file_format.LoadSizeInfo(path, file_obj=file_obj)
logging.info('Normalizing symbol names')
_NormalizeNames(size_info.raw_symbols)
logging.info('Loaded %d symbols', len(size_info.raw_symbols))
return size_info
def LoadAndPostProcessDeltaSizeInfo(path, file_obj=None):
"""Returns a tuple of SizeInfos for the given |path|."""
logging.debug('Loading results from: %s', path)
before_size_info, after_size_info = file_format.LoadDeltaSizeInfo(
path, file_obj=file_obj)
logging.info('Normalizing symbol names')
_NormalizeNames(before_size_info.raw_symbols)
_NormalizeNames(after_size_info.raw_symbols)
logging.info('Loaded %d + %d symbols', len(before_size_info.raw_symbols),
len(after_size_info.raw_symbols))
return before_size_info, after_size_info
def _GetModuleInfoList(minimal_apks_path):
module_info_list = []
with zipfile.ZipFile(minimal_apks_path) as z:
for info in z.infolist():
# E.g.:
# splits/base-master.apk
# splits/base-en.apk
# splits/vr-master.apk
# splits/vr-en.apk
# TODO(agrieve): Might be worth measuring a non-en locale as well.
m = re.match(r'splits/(.*)-master\.apk', info.filename)
if m:
module_info_list.append((m.group(1), info.file_size))
return sorted(module_info_list)
def _CollectModuleSizes(minimal_apks_path):
sizes_by_module = collections.defaultdict(int)
for module_name, file_size in _GetModuleInfoList(minimal_apks_path):
sizes_by_module[module_name] += file_size
return sizes_by_module
def _ExtendSectionRange(section_range_by_name, section_name, delta_size):
(prev_address, prev_size) = section_range_by_name.get(section_name, (0, 0))
section_range_by_name[section_name] = (prev_address, prev_size + delta_size)
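# E.g. (illustrative): if section_range_by_name['.other'] == (0, 100), then
# _ExtendSectionRange(section_range_by_name, '.other', 10) updates the entry
# to (0, 110); a missing section is created as (0, delta_size).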
def CreateMetadata(args, linker_name, build_config):
"""Creates metadata dict while updating |build_config|.
Args:
args: Resolved command-line args.
linker_name: A coded linker name (see linker_map_parser.py).
build_config: Common build configurations to update or to undergo
consistency checks.
Returns:
A dict of models.METADATA_* -> values. Performs "best effort" extraction
using available data.
"""
logging.debug('Constructing metadata')
def update_build_config(key, value):
if key in build_config:
old_value = build_config[key]
if value != old_value:
raise ValueError('Inconsistent {}: {} (was {})'.format(
key, value, old_value))
else:
build_config[key] = value
metadata = {}
# Ensure all paths are relative to output directory to make them hermetic.
if args.output_directory:
shorten_path = lambda path: os.path.relpath(path, args.output_directory)
gn_args = _ParseGnArgs(os.path.join(args.output_directory, 'args.gn'))
update_build_config(models.BUILD_CONFIG_GN_ARGS, gn_args)
else:
# If output directory is unavailable, just store basenames.
shorten_path = os.path.basename
if args.tool_prefix:
relative_tool_prefix = path_util.ToToolsSrcRootRelative(args.tool_prefix)
update_build_config(models.BUILD_CONFIG_TOOL_PREFIX, relative_tool_prefix)
if linker_name:
update_build_config(models.BUILD_CONFIG_LINKER_NAME, linker_name)
# Deduce GIT revision (cached via @lru_cache).
git_rev = _DetectGitRevision(args.source_directory)
if git_rev:
update_build_config(models.BUILD_CONFIG_GIT_REVISION, git_rev)
if args.elf_file:
metadata[models.METADATA_ELF_FILENAME] = shorten_path(args.elf_file)
architecture = readelf.ArchFromElf(args.elf_file, args.tool_prefix)
metadata[models.METADATA_ELF_ARCHITECTURE] = architecture
timestamp_obj = datetime.datetime.utcfromtimestamp(
os.path.getmtime(args.elf_file))
timestamp = calendar.timegm(timestamp_obj.timetuple())
metadata[models.METADATA_ELF_MTIME] = timestamp
build_id = readelf.BuildIdFromElf(args.elf_file, args.tool_prefix)
metadata[models.METADATA_ELF_BUILD_ID] = build_id
relocations_count = _CountRelocationsFromElf(args.elf_file,
args.tool_prefix)
metadata[models.METADATA_ELF_RELOCATIONS_COUNT] = relocations_count
if args.map_file:
metadata[models.METADATA_MAP_FILENAME] = shorten_path(args.map_file)
if args.minimal_apks_file:
metadata[models.METADATA_APK_FILENAME] = shorten_path(
args.minimal_apks_file)
if args.split_name and args.split_name != 'base':
metadata[models.METADATA_APK_SIZE] = os.path.getsize(args.apk_file)
metadata[models.METADATA_APK_SPLIT_NAME] = args.split_name
else:
sizes_by_module = _CollectModuleSizes(args.minimal_apks_file)
for name, size in sizes_by_module.items():
key = models.METADATA_APK_SIZE
if name != 'base':
key += '-' + name
metadata[key] = size
elif args.apk_file:
metadata[models.METADATA_APK_FILENAME] = shorten_path(args.apk_file)
metadata[models.METADATA_APK_SIZE] = os.path.getsize(args.apk_file)
return metadata
def _ResolveThinArchivePaths(raw_symbols, thin_archives):
"""Converts object_paths for thin archives to external .o paths."""
for symbol in raw_symbols:
object_path = symbol.object_path
if object_path.endswith(')'):
start_idx = object_path.rindex('(')
archive_path = object_path[:start_idx]
if archive_path in thin_archives:
subpath = object_path[start_idx + 1:-1]
symbol.object_path = ar.CreateThinObjectPath(archive_path, subpath)
def _DeduceObjectPathForSwitchTables(raw_symbols, object_paths_by_name):
strip_num_suffix_regexp = re.compile(r'\s+\(\.\d+\)$')
num_switch_tables = 0
num_unassigned = 0
num_deduced = 0
num_arbitrations = 0
for s in raw_symbols:
if s.full_name.startswith('Switch table for '):
num_switch_tables += 1
# Strip 'Switch table for ' prefix.
name = s.full_name[17:]
# Strip, e.g., ' (.123)' suffix.
name = re.sub(strip_num_suffix_regexp, '', name)
object_paths = object_paths_by_name.get(name, None)
if not s.object_path:
if object_paths is None:
num_unassigned += 1
else:
num_deduced += 1
# If ambiguity arises, arbitrate by taking the first.
s.object_path = object_paths[0]
if len(object_paths) > 1:
num_arbitrations += 1
else:
assert object_paths and s.object_path in object_paths
if num_switch_tables > 0:
logging.info(
'Found %d switch tables: Deduced %d object paths with ' +
'%d arbitrations. %d remain unassigned.', num_switch_tables,
num_deduced, num_arbitrations, num_unassigned)
def _NameStringLiterals(raw_symbols, elf_path, tool_prefix):
  # Assigns names like "string contents" to ASCII-readable string literals.
STRING_LENGTH_CUTOFF = 30
PRINTABLE_TBL = [False] * 256
for ch in string.printable:
PRINTABLE_TBL[ord(ch)] = True
for sym, name in string_extract.ReadStringLiterals(raw_symbols, elf_path,
tool_prefix):
# Newlines and tabs are used as delimiters in file_format.py
# At this point, names still have a terminating null byte.
name = name.replace(b'\n', b'').replace(b'\t', b'').strip(b'\00')
is_printable = all(PRINTABLE_TBL[c] for c in name)
if is_printable:
name = name.decode('ascii')
if len(name) > STRING_LENGTH_CUTOFF:
sym.full_name = '"{}[...]"'.format(name[:STRING_LENGTH_CUTOFF])
else:
sym.full_name = '"{}"'.format(name)
else:
sym.full_name = models.STRING_LITERAL_NAME
def _ParseElfInfo(map_path, elf_path, tool_prefix, track_string_literals,
outdir_context=None, linker_name=None):
"""Adds ELF section ranges and symbols."""
assert map_path or elf_path, 'Need a linker map or an ELF file.'
assert map_path or not track_string_literals, (
'track_string_literals not yet implemented without map file')
if elf_path:
elf_section_ranges = readelf.SectionInfoFromElf(elf_path, tool_prefix)
# Run nm on the elf file to retrieve the list of symbol names per-address.
# This list is required because the .map file contains only a single name
# for each address, yet multiple symbols are often coalesced when they are
# identical. This coalescing happens mainly for small symbols and for C++
# templates. Such symbols make up ~500kb of libchrome.so on Android.
elf_nm_result = nm.CollectAliasesByAddressAsync(elf_path, tool_prefix)
# Run nm on all .o/.a files to retrieve the symbol names within them.
    # The list is used to detect when multiple .o files contain the same symbol
# (e.g. inline functions), and to update the object_path / source_path
# fields accordingly.
    # Looking in object files is required because the .map file chooses a
    # single path for these symbols.
# Rather than record all paths for each symbol, set the paths to be the
# common ancestor of all paths.
if outdir_context:
bulk_analyzer = obj_analyzer.BulkObjectFileAnalyzer(
tool_prefix, outdir_context.output_directory,
track_string_literals=track_string_literals)
bulk_analyzer.AnalyzePaths(outdir_context.elf_object_paths)
if map_path:
logging.info('Parsing Linker Map')
with _OpenMaybeGzAsText(map_path) as f:
map_section_ranges, raw_symbols, linker_map_extras = (
linker_map_parser.MapFileParser().Parse(linker_name, f))
if outdir_context and outdir_context.thin_archives:
_ResolveThinArchivePaths(raw_symbols, outdir_context.thin_archives)
else:
logging.info('Collecting symbols from nm')
raw_symbols = nm.CreateUniqueSymbols(elf_path, tool_prefix,
elf_section_ranges)
if map_path and elf_path:
logging.debug('Validating section sizes')
differing_elf_section_sizes = {}
differing_map_section_sizes = {}
for k, (_, elf_size) in elf_section_ranges.items():
if k in _SECTION_SIZE_BLOCKLIST:
continue
      (_, map_size) = map_section_ranges.get(k, (None, None))
if map_size != elf_size:
differing_map_section_sizes[k] = map_size
differing_elf_section_sizes[k] = elf_size
if differing_map_section_sizes:
logging.error('ELF file and .map file do not agree on section sizes.')
logging.error('readelf: %r', differing_elf_section_sizes)
logging.error('.map file: %r', differing_map_section_sizes)
sys.exit(1)
if elf_path and outdir_context:
missed_object_paths = _DiscoverMissedObjectPaths(
raw_symbols, outdir_context.known_inputs)
missed_object_paths = ar.ExpandThinArchives(
missed_object_paths, outdir_context.output_directory)[0]
bulk_analyzer.AnalyzePaths(missed_object_paths)
bulk_analyzer.SortPaths()
if track_string_literals and map_path:
merge_string_syms = [s for s in raw_symbols if
s.full_name == '** merge strings' or
s.full_name == '** lld merge strings']
# More likely for there to be a bug in supersize than an ELF to not have a
# single string literal.
assert merge_string_syms
string_ranges = [(s.address, s.size) for s in merge_string_syms]
bulk_analyzer.AnalyzeStringLiterals(elf_path, string_ranges)
  # The .map file, for some reason, doesn't demangle all names.
  # Demangle prints its own log statement.
demangle.DemangleRemainingSymbols(raw_symbols, tool_prefix)
object_paths_by_name = {}
if elf_path:
logging.info(
'Adding symbols removed by identical code folding (as reported by nm)')
# This normally does not block (it's finished by this time).
names_by_address = elf_nm_result.get()
_UpdateSymbolNamesFromNm(raw_symbols, names_by_address)
raw_symbols = _AddNmAliases(raw_symbols, names_by_address)
if outdir_context:
object_paths_by_name = bulk_analyzer.GetSymbolNames()
logging.debug(
'Fetched path information for %d symbols from %d files',
len(object_paths_by_name),
len(outdir_context.elf_object_paths) + len(missed_object_paths))
_DeduceObjectPathForSwitchTables(raw_symbols, object_paths_by_name)
# For aliases, this provides path information where there wasn't any.
logging.info('Creating aliases for symbols shared by multiple paths')
raw_symbols = _AssignNmAliasPathsAndCreatePathAliases(
raw_symbols, object_paths_by_name)
if track_string_literals:
logging.info('Waiting for string literal extraction to complete.')
list_of_positions_by_object_path = bulk_analyzer.GetStringPositions()
bulk_analyzer.Close()
if track_string_literals:
logging.info('Deconstructing ** merge strings into literals')
replacements = _CreateMergeStringsReplacements(merge_string_syms,
list_of_positions_by_object_path)
for merge_sym, literal_syms in zip(merge_string_syms, replacements):
# Don't replace if no literals were found.
if literal_syms:
# Re-find the symbols since aliases cause their indices to change.
idx = raw_symbols.index(merge_sym)
# This assignment is a bit slow (causes array to be shifted), but
# is fast enough since len(merge_string_syms) < 10.
raw_symbols[idx:idx + 1] = literal_syms
if map_path:
linker_map_parser.DeduceObjectPathsFromThinMap(raw_symbols,
linker_map_extras)
if elf_path and track_string_literals:
_NameStringLiterals(raw_symbols, elf_path, tool_prefix)
# If we have an ELF file, use its ranges as the source of truth, since some
# sections can differ from the .map.
return (elf_section_ranges if elf_path else map_section_ranges, raw_symbols,
object_paths_by_name)
def _ComputePakFileSymbols(
file_name, contents, res_info, symbols_by_id, compression_ratio=1):
id_map = {
id(v): k
for k, v in sorted(list(contents.resources.items()), reverse=True)
}
alias_map = {
k: id_map[id(v)]
for k, v in contents.resources.items() if id_map[id(v)] != k
}
name = posixpath.basename(file_name)
# Hyphens used for language regions. E.g.: en-GB.pak, sr-Latn.pak, ...
# Longest translated .pak file without hyphen: fil.pak
if '-' in name or len(name) <= 7:
section_name = models.SECTION_PAK_TRANSLATIONS
else:
# E.g.: resources.pak, chrome_100_percent.pak.
section_name = models.SECTION_PAK_NONTRANSLATED
overhead = (12 + 6) * compression_ratio # Header size plus extra offset
# Key just needs to be unique from other IDs and pak overhead symbols.
symbols_by_id[-len(symbols_by_id) - 1] = models.Symbol(
section_name, overhead, full_name='Overhead: {}'.format(file_name))
for resource_id in sorted(contents.resources):
if resource_id in alias_map:
# 4 extra bytes of metadata (2 16-bit ints)
size = 4
resource_id = alias_map[resource_id]
else:
resource_data = contents.resources[resource_id]
# 6 extra bytes of metadata (1 32-bit int, 1 16-bit int)
size = len(resource_data) + 6
name, source_path = res_info[resource_id]
if resource_id not in symbols_by_id:
full_name = '{}: {}'.format(source_path, name)
new_symbol = models.Symbol(
section_name, 0, address=resource_id, full_name=full_name)
if (section_name == models.SECTION_PAK_NONTRANSLATED and
_IsPakContentUncompressed(resource_data)):
new_symbol.flags |= models.FLAG_UNCOMPRESSED
symbols_by_id[resource_id] = new_symbol
size *= compression_ratio
symbols_by_id[resource_id].size += size
return section_name
def _IsPakContentUncompressed(content):
raw_size = len(content)
# Assume anything less than 100 bytes cannot be compressed.
if raw_size < 100:
return False
compressed_size = len(zlib.compress(content, 1))
compression_ratio = compressed_size / float(raw_size)
return compression_ratio < _UNCOMPRESSED_COMPRESSION_RATIO_THRESHOLD
class _ResourceSourceMapper:
def __init__(self, size_info_prefix, knobs):
self._knobs = knobs
self._res_info = self._LoadResInfo(size_info_prefix)
self._pattern_dollar_underscore = re.compile(r'\$+(.*?)(?:__\d)+')
self._pattern_version_suffix = re.compile(r'-v\d+/')
@staticmethod
def _ParseResInfoFile(res_info_path):
with open(res_info_path, 'r') as info_file:
return dict(l.rstrip().split('\t') for l in info_file)
def _LoadResInfo(self, size_info_prefix):
apk_res_info_path = size_info_prefix + '.res.info'
res_info_without_root = self._ParseResInfoFile(apk_res_info_path)
# We package resources in the res/ folder only in the apk.
res_info = {
os.path.join('res', dest): source
for dest, source in res_info_without_root.items()
}
res_info.update(self._knobs.apk_other_files)
return res_info
def FindSourceForPath(self, path):
    # Sometimes Android adds $ in front and __# before the extension.
path = self._pattern_dollar_underscore.sub(r'\1', path)
ret = self._res_info.get(path)
if ret:
return ret
# Android build tools may append extra -v flags for the root dir.
path = self._pattern_version_suffix.sub('/', path)
ret = self._res_info.get(path)
if ret:
return ret
return None
def _ParsePakInfoFile(pak_info_path):
with open(pak_info_path, 'r') as info_file:
res_info = {}
for line in info_file.readlines():
name, res_id, path = line.split(',')
res_info[int(res_id)] = (name, path.strip())
return res_info
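# For reference, each .pak.info line has the form (illustrative values):
#   IDR_SOME_RESOURCE,12345,path/to/source.ext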
def _ParsePakSymbols(symbols_by_id, object_paths_by_pak_id):
raw_symbols = []
for resource_id, symbol in symbols_by_id.items():
raw_symbols.append(symbol)
paths = object_paths_by_pak_id.get(resource_id)
if not paths:
continue
symbol.object_path = paths[0]
if len(paths) == 1:
continue
aliases = symbol.aliases or [symbol]
symbol.aliases = aliases
for path in paths[1:]:
new_sym = models.Symbol(
symbol.section_name, symbol.size, address=symbol.address,
full_name=symbol.full_name, object_path=path, aliases=aliases)
aliases.append(new_sym)
raw_symbols.append(new_sym)
raw_total = 0.0
int_total = 0
for symbol in raw_symbols:
raw_total += symbol.size
    # We truncate rather than round to ensure that we do not over-attribute.
    # It is easier to add another symbol to make up the difference.
symbol.size = int(symbol.size)
int_total += symbol.size
# Attribute excess to translations since only those are compressed.
overhead_size = round(raw_total - int_total)
if overhead_size:
raw_symbols.append(
models.Symbol(models.SECTION_PAK_TRANSLATIONS,
overhead_size,
address=raw_symbols[-1].end_address,
full_name='Overhead: Pak compression artifacts'))
# Pre-sort to make final sort faster.
# Note: _SECTION_SORT_ORDER[] for pak symbols matches section_name ordering.
raw_symbols.sort(
key=lambda s: (s.section_name, s.IsOverhead(), s.address, s.object_path))
return raw_symbols
def _ParseApkElfSectionRanges(section_ranges, metadata, apk_elf_result):
if metadata:
logging.debug('Extracting section sizes from .so within .apk')
apk_build_id, apk_section_ranges, elf_overhead_size = apk_elf_result.get()
assert apk_build_id == metadata[models.METADATA_ELF_BUILD_ID], (
'BuildID from apk_elf_result did not match')
packed_section_name = None
architecture = metadata[models.METADATA_ELF_ARCHITECTURE]
    # Packing is enabled only for arm32 & arm64.
if architecture == 'arm':
packed_section_name = '.rel.dyn'
elif architecture == 'arm64':
packed_section_name = '.rela.dyn'
if packed_section_name:
unpacked_range = section_ranges.get(packed_section_name)
if unpacked_range is None:
logging.warning('Packed section not present: %s', packed_section_name)
elif unpacked_range != apk_section_ranges.get(packed_section_name):
# These ranges are different only when using relocation_packer, which
# hasn't been used since switching from gold -> lld.
apk_section_ranges['%s (unpacked)' %
packed_section_name] = unpacked_range
else:
_, apk_section_ranges, elf_overhead_size = apk_elf_result.get()
return apk_section_ranges, elf_overhead_size
class _ResourcePathDeobfuscator:
def __init__(self, pathmap_path):
self._pathmap = self._LoadResourcesPathmap(pathmap_path)
def _LoadResourcesPathmap(self, pathmap_path):
"""Load the pathmap of obfuscated resource paths.
Returns: A dict mapping from obfuscated paths to original paths or an
empty dict if passed a None |pathmap_path|.
"""
if pathmap_path is None:
return {}
pathmap = {}
with open(pathmap_path, 'r') as f:
for line in f:
line = line.strip()
if line.startswith('--') or line == '':
continue
original, renamed = line.split(' -> ')
pathmap[renamed] = original
return pathmap
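  # For reference, non-comment pathmap lines have the form (illustrative):
  #   res/drawable/original_name.xml -> res/a.xml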
def MaybeRemapPath(self, path):
long_path = self._pathmap.get(path)
if long_path:
return long_path
    # If processing a .minimal.apks, we are actually just processing the base
    # module.
long_path = self._pathmap.get('base/{}'.format(path))
if long_path:
# The first 5 chars are 'base/', which we don't need because we are
# looking directly inside the base module apk.
return long_path[5:]
return path
def _ParseApkOtherSymbols(section_ranges, apk_path, apk_so_path,
resources_pathmap_path, size_info_prefix, metadata,
knobs):
res_source_mapper = _ResourceSourceMapper(size_info_prefix, knobs)
resource_deobfuscator = _ResourcePathDeobfuscator(resources_pathmap_path)
apk_symbols = []
dex_size = 0
zip_info_total = 0
zipalign_total = 0
with zipfile.ZipFile(apk_path) as z:
signing_block_size = zip_util.MeasureApkSignatureBlock(z)
for zip_info in z.infolist():
zip_info_total += zip_info.compress_size
# Account for zipalign overhead that exists in local file header.
zipalign_total += zip_util.ReadZipInfoExtraFieldLength(z, zip_info)
# Account for zipalign overhead that exists in central directory header.
# Happens when python aligns entries in apkbuilder.py, but does not
# exist when using Android's zipalign. E.g. for bundle .apks files.
zipalign_total += len(zip_info.extra)
      # Skip the main shared library, pak, and dex files since they are
      # accounted for elsewhere.
if (zip_info.filename == apk_so_path
or zip_info.filename.endswith('.pak')):
continue
if zip_info.filename.endswith('.dex'):
dex_size += zip_info.file_size
continue
resource_filename = resource_deobfuscator.MaybeRemapPath(
zip_info.filename)
source_path = res_source_mapper.FindSourceForPath(resource_filename)
if source_path is None:
source_path = os.path.join(models.APK_PREFIX_PATH, resource_filename)
apk_symbols.append(
models.Symbol(
models.SECTION_OTHER,
zip_info.compress_size,
source_path=source_path,
full_name=resource_filename)) # Full name must disambiguate
# Store zipalign overhead and signing block size as metadata rather than an
# "Overhead:" symbol because they fluctuate in size, and would be a source of
# noise in symbol diffs if included as symbols (http://crbug.com/1130754).
# Might be even better if we had an option in Tiger Viewer to ignore certain
# symbols, but taking this as a short-cut for now.
metadata[models.METADATA_ZIPALIGN_OVERHEAD] = zipalign_total
metadata[models.METADATA_SIGNING_BLOCK_SIZE] = signing_block_size
# Overhead includes:
# * Size of all local zip headers (minus zipalign padding).
# * Size of central directory & end of central directory.
overhead_size = (os.path.getsize(apk_path) - zip_info_total - zipalign_total -
signing_block_size)
assert overhead_size >= 0, 'Apk overhead must be non-negative'
zip_overhead_symbol = models.Symbol(
models.SECTION_OTHER, overhead_size, full_name='Overhead: APK file')
apk_symbols.append(zip_overhead_symbol)
_ExtendSectionRange(section_ranges, models.SECTION_OTHER,
sum(s.size for s in apk_symbols))
return dex_size, apk_symbols
def _CreatePakObjectMap(object_paths_by_name):
# IDS_ macro usages result in templated function calls that contain the
# resource ID in them. These names are collected along with all other symbols
# by running "nm" on them. We just need to extract the values from them.
object_paths_by_pak_id = {}
PREFIX = 'void ui::AllowlistedResource<'
id_start_idx = len(PREFIX)
id_end_idx = -len('>()')
for name in object_paths_by_name:
if name.startswith(PREFIX):
pak_id = int(name[id_start_idx:id_end_idx])
object_paths_by_pak_id[pak_id] = object_paths_by_name[name]
return object_paths_by_pak_id
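# Illustrative mapping (hypothetical values): an nm symbol named
# 'void ui::AllowlistedResource<12345>()' defined in obj/ui/foo.o yields
# object_paths_by_pak_id[12345] == ['obj/ui/foo.o'].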
def _FindPakSymbolsFromApk(opts, section_ranges, apk_path, size_info_prefix):
with zipfile.ZipFile(apk_path) as z:
pak_zip_infos = (f for f in z.infolist() if f.filename.endswith('.pak'))
pak_info_path = size_info_prefix + '.pak.info'
res_info = _ParsePakInfoFile(pak_info_path)
symbols_by_id = {}
total_compressed_size = 0
total_uncompressed_size = 0
for zip_info in pak_zip_infos:
contents = data_pack.ReadDataPackFromString(z.read(zip_info))
compression_ratio = 1.0
if zip_info.compress_size < zip_info.file_size:
total_compressed_size += zip_info.compress_size
total_uncompressed_size += zip_info.file_size
compression_ratio = opts.pak_compression_ratio
section_name = _ComputePakFileSymbols(
zip_info.filename, contents,
res_info, symbols_by_id, compression_ratio=compression_ratio)
_ExtendSectionRange(section_ranges, section_name, zip_info.compress_size)
if total_uncompressed_size > 0:
actual_ratio = (
float(total_compressed_size) / total_uncompressed_size)
logging.info(
'Pak Compression Ratio: %f Actual: %f Diff: %.0f',
opts.pak_compression_ratio, actual_ratio,
(opts.pak_compression_ratio - actual_ratio) * total_uncompressed_size)
return symbols_by_id
def _FindPakSymbolsFromFiles(section_ranges, pak_files, pak_info_path,
output_directory):
"""Uses files from args to find and add pak symbols."""
res_info = _ParsePakInfoFile(pak_info_path)
symbols_by_id = {}
for pak_file_path in pak_files:
with open(pak_file_path, 'rb') as f:
contents = data_pack.ReadDataPackFromString(f.read())
section_name = _ComputePakFileSymbols(
os.path.relpath(pak_file_path, output_directory), contents, res_info,
symbols_by_id)
_ExtendSectionRange(section_ranges, section_name,
os.path.getsize(pak_file_path))
return symbols_by_id
def _CalculateElfOverhead(section_ranges, elf_path):
if elf_path:
section_sizes_total_without_bss = sum(
size for k, (address, size) in section_ranges.items()
if k not in models.BSS_SECTIONS)
elf_overhead_size = (
os.path.getsize(elf_path) - section_sizes_total_without_bss)
assert elf_overhead_size >= 0, (
'Negative ELF overhead {}'.format(elf_overhead_size))
return elf_overhead_size
return 0
def _OverwriteSymbolSizesWithRelocationCount(raw_symbols, tool_prefix,
elf_path):
logging.info('Removing non-native symbols')
raw_symbols = [sym for sym in raw_symbols if sym.IsNative()]
logging.info('Overwriting symbol sizes with relocation count')
# Last symbol address is the end of the last symbol, so we don't misattribute
# all relros after the last symbol to that symbol.
symbol_addresses = [s.address for s in raw_symbols]
symbol_addresses.append(raw_symbols[-1].end_address)
for symbol in raw_symbols:
symbol.address = 0
symbol.size = 0
symbol.padding = 0
relro_addresses = readelf.CollectRelocationAddresses(elf_path, tool_prefix)
# More likely for there to be a bug in supersize than an ELF to have any
# relative relocations.
assert relro_addresses
logging.info('Adding %d relocations', len(relro_addresses))
for addr in relro_addresses:
    # Attribute relros to the largest symbol start address that precedes them.
idx = bisect.bisect_right(symbol_addresses, addr) - 1
if 0 <= idx < len(raw_symbols):
symbol = raw_symbols[idx]
for alias in symbol.aliases or [symbol]:
alias.size += 1
raw_symbols = [sym for sym in raw_symbols if sym.size]
return raw_symbols
def _AddUnattributedSectionSymbols(raw_symbols, section_ranges):
# Create symbols for ELF sections not covered by existing symbols.
logging.info('Searching for symbol gaps...')
new_syms_by_section = collections.defaultdict(list)
for section_name, group in itertools.groupby(
raw_symbols, lambda s: s.section_name):
    # Get the last symbol in the group.
for sym in group:
pass
end_address = sym.end_address # pylint: disable=undefined-loop-variable
size_from_syms = end_address - section_ranges[section_name][0]
overhead = section_ranges[section_name][1] - size_from_syms
assert overhead >= 0, (
('End of last symbol (%x) in section %s is %d bytes after the end of '
'section from readelf (%x).') % (end_address, section_name, -overhead,
sum(section_ranges[section_name])))
if overhead > 0 and section_name not in models.BSS_SECTIONS:
new_syms_by_section[section_name].append(
models.Symbol(section_name,
overhead,
address=end_address,
full_name='** {} (unattributed)'.format(section_name)))
logging.info('Last symbol in %s does not reach end of section, gap=%d',
section_name, overhead)
# Sections that should not bundle into ".other".
unsummed_sections, summed_sections = models.ClassifySections(
section_ranges.keys())
other_elf_symbols = []
  # Sort keys to ensure consistent order (> 1 sections may have address = 0).
  for section_name, (_, section_size) in sorted(section_ranges.items()):
# Handle sections that don't appear in |raw_symbols|.
if (section_name not in unsummed_sections
and section_name not in summed_sections):
other_elf_symbols.append(
models.Symbol(models.SECTION_OTHER,
section_size,
full_name='** ELF Section: {}'.format(section_name)))
_ExtendSectionRange(section_ranges, models.SECTION_OTHER, section_size)
other_elf_symbols.sort(key=lambda s: (s.address, s.full_name))
# TODO(agrieve): It would probably simplify things to use a dict of
# section_name->raw_symbols while creating symbols.
# Merge |new_syms_by_section| into |raw_symbols| while maintaining ordering.
ret = []
for section_name, group in itertools.groupby(
raw_symbols, lambda s: s.section_name):
ret.extend(group)
ret.extend(new_syms_by_section[section_name])
return ret, other_elf_symbols
def _ParseNinjaFiles(output_directory, elf_path=None):
linker_elf_path = elf_path
if elf_path:
# For partitioned libraries, the actual link command outputs __combined.so.
partitioned_elf_path = elf_path.replace('.so', '__combined.so')
if os.path.exists(partitioned_elf_path):
linker_elf_path = partitioned_elf_path
logging.info('Parsing ninja files, looking for %s.',
(linker_elf_path or 'source mapping only (elf_path=None)'))
source_mapper, ninja_elf_object_paths = ninja_parser.Parse(
output_directory, linker_elf_path)
logging.debug('Parsed %d .ninja files.', source_mapper.parsed_file_count)
if elf_path:
assert ninja_elf_object_paths, (
'Failed to find link command in ninja files for ' +
os.path.relpath(linker_elf_path, output_directory))
return source_mapper, ninja_elf_object_paths
def CreateContainerAndSymbols(knobs=None,
opts=None,
container_name=None,
metadata=None,
map_path=None,
tool_prefix=None,
output_directory=None,
source_directory=None,
elf_path=None,
apk_path=None,
mapping_path=None,
resources_pathmap_path=None,
apk_so_path=None,
pak_files=None,
pak_info_file=None,
linker_name=None,
size_info_prefix=None):
"""Creates a Container (with sections sizes) and symbols for a SizeInfo.
Args:
knobs: Instance of SectionSizeKnobs.
opts: Instance of ContainerArchiveOptions.
container_name: Name for the created Container. May be '' if only one
Container exists.
metadata: Metadata dict from CreateMetadata().
map_path: Path to the linker .map(.gz) file to parse.
tool_prefix: Prefix for c++filt & nm (required).
output_directory: Build output directory. If None, source_paths and symbol
alias information will not be recorded.
source_directory: Path to source root.
elf_path: Path to the corresponding unstripped ELF file. Used to find symbol
aliases and inlined functions. Can be None.
apk_path: Path to the .apk file to measure.
mapping_path: Path to the .mapping file for DEX symbol processing.
resources_pathmap_path: Path to the pathmap file that maps original
resource paths to shortened resource paths.
apk_so_path: Path to an .so file within an APK file.
pak_files: List of paths to .pak files.
pak_info_file: Path to a .pak.info file.
linker_name: A coded linker name (see linker_map_parser.py).
size_info_prefix: Path to $out/size-info/$ApkName.
Returns:
A tuple of (container, raw_symbols).
    container is a Container instance that stores metadata and section_sizes
(section_sizes maps section names to respective sizes).
raw_symbols is a list of Symbol objects.
"""
assert elf_path or not opts.relocations_mode, (
      '--relocations-mode requires an ELF file')
knobs = knobs or SectionSizeKnobs()
if apk_path and apk_so_path:
# Extraction takes around 1 second, so do it in parallel.
apk_elf_result = parallel.ForkAndCall(_ElfInfoFromApk,
(apk_path, apk_so_path, tool_prefix))
else:
apk_elf_result = None
outdir_context = None
source_mapper = None
section_ranges = {}
raw_symbols = []
if opts.analyze_native and output_directory:
# Finds all objects passed to the linker and creates a map of .o -> .cc.
source_mapper, ninja_elf_object_paths = _ParseNinjaFiles(
output_directory, elf_path)
# Start by finding elf_object_paths so that nm can run on them while the
# linker .map is being parsed.
if ninja_elf_object_paths:
elf_object_paths, thin_archives = ar.ExpandThinArchives(
ninja_elf_object_paths, output_directory)
known_inputs = set(elf_object_paths)
known_inputs.update(ninja_elf_object_paths)
else:
elf_object_paths = None
known_inputs = None
# When we don't know which elf file is used, just search all paths.
if opts.analyze_native:
thin_archives = set(
p for p in source_mapper.IterAllPaths() if p.endswith('.a')
and ar.IsThinArchive(os.path.join(output_directory, p)))
else:
thin_archives = None
outdir_context = _OutputDirectoryContext(
elf_object_paths=elf_object_paths,
known_inputs=known_inputs,
output_directory=output_directory,
thin_archives=thin_archives)
if opts.analyze_native:
section_ranges, raw_symbols, object_paths_by_name = _ParseElfInfo(
map_path,
elf_path,
tool_prefix,
opts.track_string_literals,
outdir_context=outdir_context,
linker_name=linker_name)
if apk_elf_result:
section_ranges, elf_overhead_size = _ParseApkElfSectionRanges(
section_ranges, metadata, apk_elf_result)
elif elf_path:
# Strip ELF before capturing section information to avoid recording
# debug sections.
with tempfile.NamedTemporaryFile(suffix=os.path.basename(elf_path)) as f:
strip_path = path_util.GetStripPath(tool_prefix)
subprocess.run([strip_path, '-o', f.name, elf_path], check=True)
section_ranges = readelf.SectionInfoFromElf(f.name, tool_prefix)
elf_overhead_size = _CalculateElfOverhead(section_ranges, f.name)
if elf_path:
raw_symbols, other_elf_symbols = _AddUnattributedSectionSymbols(
raw_symbols, section_ranges)
pak_symbols_by_id = None
other_symbols = []
if apk_path and size_info_prefix and not opts.relocations_mode:
# Can modify |section_ranges|.
pak_symbols_by_id = _FindPakSymbolsFromApk(opts, section_ranges, apk_path,
size_info_prefix)
# Can modify |section_ranges|.
dex_size, other_symbols = _ParseApkOtherSymbols(section_ranges, apk_path,
apk_so_path,
resources_pathmap_path,
size_info_prefix, metadata,
knobs)
if opts.analyze_java:
dex_symbols = apkanalyzer.CreateDexSymbols(apk_path, mapping_path,
size_info_prefix)
# We can't meaningfully track section size of dex methods vs other, so
# just fake the size of dex methods as the sum of symbols, and make
# "dex other" responsible for any unattributed bytes.
dex_method_size = int(
round(
sum(s.pss for s in dex_symbols
if s.section_name == models.SECTION_DEX_METHOD)))
section_ranges[models.SECTION_DEX_METHOD] = (0, dex_method_size)
section_ranges[models.SECTION_DEX] = (0, dex_size - dex_method_size)
dex_other_size = int(
round(
sum(s.pss for s in dex_symbols
if s.section_name == models.SECTION_DEX)))
unattributed_dex = section_ranges[models.SECTION_DEX][1] - dex_other_size
# Compare against -5 instead of 0 to guard against round-off errors.
assert unattributed_dex >= -5, ('Dex symbols take up more space than '
'the dex sections have available')
if unattributed_dex > 0:
dex_symbols.append(
models.Symbol(
models.SECTION_DEX,
unattributed_dex,
full_name='** .dex (unattributed - includes string literals)'))
raw_symbols.extend(dex_symbols)
elif pak_files and pak_info_file:
# Can modify |section_ranges|.
pak_symbols_by_id = _FindPakSymbolsFromFiles(
section_ranges, pak_files, pak_info_file, output_directory)
if elf_path:
elf_overhead_symbol = models.Symbol(
models.SECTION_OTHER, elf_overhead_size, full_name='Overhead: ELF file')
_ExtendSectionRange(section_ranges, models.SECTION_OTHER, elf_overhead_size)
other_symbols.append(elf_overhead_symbol)
other_symbols.extend(other_elf_symbols)
if pak_symbols_by_id:
logging.debug('Extracting pak IDs from symbol names, and creating symbols')
object_paths_by_pak_id = {}
if opts.analyze_native:
object_paths_by_pak_id = _CreatePakObjectMap(object_paths_by_name)
pak_raw_symbols = _ParsePakSymbols(
pak_symbols_by_id, object_paths_by_pak_id)
raw_symbols.extend(pak_raw_symbols)
# Always have .other come last.
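# Sort key: regular symbols first, then '**'-prefixed ones, with overhead
# symbols last, breaking ties by address and name.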
other_symbols.sort(key=lambda s: (s.IsOverhead(), s.full_name.startswith(
'**'), s.address, s.full_name))
raw_symbols.extend(other_symbols)
_ExtractSourcePathsAndNormalizeObjectPaths(raw_symbols, source_mapper)
_PopulateComponents(raw_symbols, source_directory)
logging.info('Converting excessive aliases into shared-path symbols')
_CompactLargeAliasesIntoSharedSymbols(raw_symbols, knobs)
logging.debug('Connecting nm aliases')
_ConnectNmAliases(raw_symbols)
if opts.relocations_mode:
raw_symbols = _OverwriteSymbolSizesWithRelocationCount(
raw_symbols, tool_prefix, elf_path)
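# section_ranges maps section name -> (address, size); Containers record only
# the sizes.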
section_sizes = {k: size for k, (address, size) in section_ranges.items()}
container = models.Container(name=container_name,
metadata=metadata,
section_sizes=section_sizes)
for symbol in raw_symbols:
symbol.container = container
# Sorting for relocations mode causes .data and .data.rel.ro to be interleaved
# due to setting all addresses to 0.
if not opts.relocations_mode:
file_format.SortSymbols(raw_symbols, check_already_mostly_sorted=True)
return container, raw_symbols
def CreateSizeInfo(build_config,
container_list,
raw_symbols_list,
normalize_names=True):
"""Performs operations on all symbols and creates a SizeInfo object."""
assert len(container_list) == len(raw_symbols_list)
all_raw_symbols = []
for raw_symbols in raw_symbols_list:
file_format.CalculatePadding(raw_symbols)
# Do not call _NormalizeNames() during archive since that method tends to
# need tweaks over time. Calling it only when loading .size files allows for
# more flexibility.
if normalize_names:
_NormalizeNames(raw_symbols)
all_raw_symbols += raw_symbols
return models.SizeInfo(build_config, container_list, all_raw_symbols)
@functools.lru_cache
def _DetectGitRevision(directory):
"""Runs git rev-parse to get the SHA1 hash of the current revision.
Args:
directory: Path to directory where rev-parse command will be run.
Returns:
A string with the SHA1 hash, or None if an error occurred.
"""
try:
git_rev = subprocess.check_output(
['git', '-C', directory, 'rev-parse', 'HEAD']).decode('ascii')
return git_rev.rstrip()
except Exception:
logging.warning('Failed to detect git revision for file metadata.')
return None
def _ElfIsMainPartition(elf_path, tool_prefix):
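# Only the main partition of a partitioned library contains the
# end-of-partitions marker section.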
section_ranges = readelf.SectionInfoFromElf(elf_path, tool_prefix)
return models.SECTION_PART_END in section_ranges
def _CountRelocationsFromElf(elf_path, tool_prefix):
args = [path_util.GetObjDumpPath(tool_prefix), '--private-headers', elf_path]
stdout = subprocess.check_output(args).decode('ascii')
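# The dynamic section of the output contains a line such as (illustrative):
#   RELACOUNT            0x1a2
# The regex below also accepts the RELCOUNT and RELRCOUNT spellings.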
relocations = re.search(r'REL[AR]?COUNT\s*(.+)', stdout).group(1)
return int(relocations, 16)
@functools.lru_cache
def _ParseGnArgs(args_path):
"""Returns a list of normalized "key=value" strings."""
args = {}
with open(args_path) as f:
for l in f:
# Strips #s even if within string literal. Not a problem in practice.
parts = l.split('#')[0].split('=')
if len(parts) != 2:
continue
args[parts[0].strip()] = parts[1].strip()
return ["%s=%s" % x for x in sorted(args.items())]
def _DetectLinkerName(map_path):
with _OpenMaybeGzAsText(map_path) as f:
return linker_map_parser.DetectLinkerNameFromMapFile(f)
def _ElfInfoFromApk(apk_path, apk_so_path, tool_prefix):
"""Returns a tuple of (build_id, section_ranges, elf_overhead_size)."""
with zip_util.UnzipToTemp(apk_path, apk_so_path) as temp:
build_id = readelf.BuildIdFromElf(temp, tool_prefix)
section_ranges = readelf.SectionInfoFromElf(temp, tool_prefix)
elf_overhead_size = _CalculateElfOverhead(section_ranges, temp)
return build_id, section_ranges, elf_overhead_size
def _AddContainerArguments(parser):
"""Add arguments applicable to a single container."""
# Special: Use _IdentifyInputFile() to detect main file argument.
parser.add_argument('-f', metavar='FILE',
help='Auto-identify input file type.')
# Main file argument: Exactly one should be specified (perhaps via -f), with
# the exception that --map-file can be specified in addition.
# _IdentifyInputFile() should be kept updated.
parser.add_argument('--apk-file',
help='.apk file to measure. Other flags can generally be '
'derived when this is used.')
parser.add_argument('--minimal-apks-file',
help='.minimal.apks file to measure. Other flags can '
'generally be derived when this is used.')
parser.add_argument('--elf-file', help='Path to input ELF file.')
parser.add_argument('--map-file',
help='Path to input .map(.gz) file. Defaults to '
'{{elf_file}}.map(.gz)?. If given without '
'--elf-file, no size metadata will be recorded.')
# Auxiliary file arguments.
parser.add_argument('--mapping-file',
help='Proguard .mapping file for deobfuscation.')
parser.add_argument('--resources-pathmap-file',
help='.pathmap.txt file that contains a mapping from '
'original resource paths to shortened resource paths.')
parser.add_argument('--pak-file', action='append',
help='Paths to pak files.')
parser.add_argument('--pak-info-file',
help='Path to a .pak.info file listing all the resource '
'IDs found in the pak files passed in.')
parser.add_argument('--aux-elf-file',
help='Path to auxiliary ELF if the main file is APK, '
'useful for capturing metadata.')
# Non-file argument.
parser.add_argument('--no-string-literals', dest='track_string_literals',
default=True, action='store_false',
help='Disable breaking down "** merge strings" into more '
'granular symbols.')
parser.add_argument('--no-map-file',
dest='ignore_linker_map',
action='store_true',
help='Use debug information to capture symbol sizes '
'instead of the linker map file.')
parser.add_argument(
'--relocations',
action='store_true',
help='Instead of counting binary size, count number of relative '
'relocation instructions in ELF code.')
parser.add_argument(
'--java-only', action='store_true', help='Run on only Java symbols')
parser.add_argument(
'--native-only', action='store_true', help='Run on only native symbols')
parser.add_argument(
'--no-java', action='store_true', help='Do not run on Java symbols')
parser.add_argument(
'--no-native', action='store_true', help='Do not run on native symbols')
parser.add_argument(
'--include-padding',
action='store_true',
help='Include a padding field for each symbol, instead of rederiving '
'from consecutive symbols on file load.')
parser.add_argument(
'--check-data-quality',
action='store_true',
help='Perform sanity checks to ensure there is no missing data.')
# The split_name arg is used for bundles to identify DFMs.
parser.set_defaults(split_name=None)
def AddArguments(parser):
parser.add_argument('size_file', help='Path to output .size file.')
parser.add_argument('--source-directory',
help='Custom path to the root source directory.')
parser.add_argument('--output-directory',
help='Path to the root build directory.')
parser.add_argument('--tool-prefix',
help='Path prefix for c++filt, nm, readelf.')
parser.add_argument(
'--no-output-directory',
action='store_true',
help='Skips all data collection that requires build intermediates.')
parser.add_argument('--ssargs-file',
help='Path to SuperSize multi-container arguments file.')
_AddContainerArguments(parser)
def _IdentifyInputFile(args, on_config_error):
"""Identifies main input file type from |args.f|, and updates |args|.
Identification is performed on filename alone, i.e., the file need not exist.
The result is written to a field in |args|. If the field exists then it
simply gets overwritten.
If '.' is missing from |args.f| then --elf-file is assumed.
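E.g. (illustrative names): 'ChromePublic.minimal.apks' sets
|args.minimal_apks_file|, while 'libchrome.so' or the extensionless 'chrome'
sets |args.elf_file|.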
Returns:
The primary input file.
"""
if args.f:
if args.f.endswith('.minimal.apks'):
args.minimal_apks_file = args.f
elif args.f.endswith('.apk'):
args.apk_file = args.f
elif args.f.endswith('.so') or '.' not in os.path.basename(args.f):
args.elf_file = args.f
elif args.f.endswith('.map') or args.f.endswith('.map.gz'):
args.map_file = args.f
elif args.f.endswith('.ssargs'):
# Fails if trying to nest them, which should never happen.
args.ssargs_file = args.f
else:
on_config_error('Cannot identify file ' + args.f)
args.f = None
ret = [
args.apk_file, args.elf_file, args.minimal_apks_file,
args.__dict__.get('ssargs_file')
]
ret = [v for v in ret if v]
# --map-file can be a main file, or used with another main file.
if not ret and args.map_file:
ret.append(args.map_file)
elif not ret:
on_config_error(
'Must pass at least one of --apk-file, --minimal-apks-file, '
'--elf-file, --map-file, --ssargs-file')
elif len(ret) > 1:
on_config_error(
'Found colliding --apk-file, --minimal-apks-file, --elf-file, '
'--ssargs-file')
return ret[0]
def ParseSsargs(lines):
"""Parses .ssargs data.
An .ssargs file is a text file that specifies multiple containers as input to
SuperSize-archive. After '#'-based comments, leading / trailing whitespace,
and empty lines are stripped, each remaining line specifies a distinct
container. Format:
* Positional argument: |name| for the container.
* Main input file specified by -f, --apk-file, --elf-file, etc.:
* Can be an absolute path.
* Can be a relative path. In this case, it's up to the caller to supply the
base directory.
* -f switch must not specify another .ssargs file.
* For supported switches: See _AddContainerArguments().
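Example (illustrative names):
Base -f out/Release/ChromePublic.apk
vr --apk-file vr.apk --pak-file vr.pak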
Args:
lines: An iterator containing lines of .ssargs data.
Returns:
A list of arguments, one for each container.
Raises:
ValueError: Parse error, including input line number.
"""
sub_args_list = []
parser = argparse.ArgumentParser(add_help=False)
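# parse_args() reports errors via parser.error(), which normally prints usage
# and exits. Override it to raise ValueError instead, so that errors can be
# reported along with the offending line number below.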
parser.error = lambda msg: (_ for _ in ()).throw(ValueError(msg))
parser.add_argument('name')
_AddContainerArguments(parser)
try:
for lineno, line in enumerate(lines, 1):
toks = shlex.split(line, comments=True)
if not toks: # Skip if line is empty after stripping comments.
continue
sub_args_list.append(parser.parse_args(toks))
except ValueError as e:
e.args = ('Line %d: %s' % (lineno, e.args[0]), )
raise e
return sub_args_list
def _DeduceNativeInfo(tentative_output_dir, apk_path, elf_path, map_path,
ignore_linker_map, on_config_error):
apk_so_path = None
if apk_path:
with zipfile.ZipFile(apk_path) as z:
lib_infos = [
f for f in z.infolist()
if f.filename.endswith('.so') and f.file_size > 0
]
if not lib_infos:
return None, map_path, None
# TODO(agrieve): Add support for multiple .so files, and take into account
# secondary architectures.
apk_so_path = max(lib_infos, key=lambda x: x.file_size).filename
logging.debug('Sub-apk path=%s', apk_so_path)
if not elf_path and tentative_output_dir:
elf_path = os.path.join(
tentative_output_dir, 'lib.unstripped',
os.path.basename(apk_so_path.replace('crazy.', '')))
logging.debug('Detected --elf-file=%s', elf_path)
if map_path:
if not map_path.endswith('.map') and not map_path.endswith('.map.gz'):
on_config_error('Expected --map-file to end with .map or .map.gz')
elif elf_path:
# TODO(agrieve): Support breaking down partitions.
is_partition = elf_path.endswith('_partition.so')
if is_partition:
on_config_error('Found unexpected _partition.so: ' + elf_path)
if not ignore_linker_map:
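# Derive the map path from the ELF path, e.g. (illustrative names):
# libmonochrome.so -> libmonochrome.so.map(.gz), or
# libmonochrome__combined.so.map(.gz) when partitions are in use.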
if _ElfIsMainPartition(elf_path, ''):
map_path = elf_path.replace('.so', '__combined.so') + '.map'
else:
map_path = elf_path + '.map'
if not os.path.exists(map_path):
map_path += '.gz'
if not ignore_linker_map and not os.path.exists(map_path):
# Consider a missing linker map fatal only for the base module. For .so
# files in feature modules, allow skipping breakdowns.
on_config_error(
'Could not find .map(.gz)? file. Ensure you have built with '
'is_official_build=true and generate_linker_map=true, or use '
'--map-file to point to a linker map file, or use --no-map-file.')
return elf_path, map_path, apk_so_path
def _DeduceAuxPaths(args, apk_prefix):
mapping_path = args.mapping_file
resources_pathmap_path = args.resources_pathmap_file
if apk_prefix:
if not mapping_path:
mapping_path = apk_prefix + '.mapping'
logging.debug('Detected --mapping-file=%s', mapping_path)
if not resources_pathmap_path:
possible_pathmap_path = apk_prefix + '.pathmap.txt'
# This could point to a stale pathmap file if path shortening was previously
# enabled but is disabled for the current build. However, since the current
# apk/aab will have unshortened paths, looking them up in the stale pathmap
# (which is keyed by shortened paths) will find no mapping, and thus should
# not cause any issues.
if os.path.exists(possible_pathmap_path):
resources_pathmap_path = possible_pathmap_path
logging.debug('Detected --resources-pathmap-file=%s',
resources_pathmap_path)
return mapping_path, resources_pathmap_path
def _ReadMultipleArgsFromStream(lines, base_dir, err_prefix, on_config_error):
try:
ret = ParseSsargs(lines)
except ValueError as e:
on_config_error('%s: %s' % (err_prefix, e.args[0]))
for sub_args in ret:
for k, v in sub_args.__dict__.items():
# Translate file arguments to be relative to |base_dir|.
if (k.endswith('_file') or k == 'f') and isinstance(v, str):
sub_args.__dict__[k] = os.path.join(base_dir, v)
return ret
def _ReadMultipleArgsFromFile(ssargs_file, on_config_error):
with open(ssargs_file, 'r') as fh:
lines = list(fh)
err_prefix = 'In file ' + ssargs_file
# Supply |base_dir| as the directory containing the .ssargs file, to ensure
# consistent behavior wherever SuperSize-archive runs.
base_dir = os.path.dirname(os.path.abspath(ssargs_file))
return _ReadMultipleArgsFromStream(lines, base_dir, err_prefix,
on_config_error)
def _ProcessContainerArgs(top_args, sub_args, container_name, on_config_error):
# Copy output_directory, tool_prefix, etc. into sub_args.
for k, v in top_args.__dict__.items():
sub_args.__dict__.setdefault(k, v)
opts = ContainerArchiveOptions(top_args, sub_args)
apk_prefix = sub_args.minimal_apks_file or sub_args.apk_file
if apk_prefix:
# Allow either .minimal.apks or just .apks.
apk_prefix = apk_prefix.replace('.minimal.apks', '.aab')
apk_prefix = apk_prefix.replace('.apks', '.aab')
sub_args.mapping_path, resources_pathmap_path = _DeduceAuxPaths(
sub_args, apk_prefix)
linker_name = None
if opts.analyze_native:
is_base_module = sub_args.split_name in (None, 'base')
# We don't yet support analyzing .so files outside of base modules.
if not is_base_module:
opts.analyze_native = False
else:
sub_args.elf_file, sub_args.map_file, apk_so_path = _DeduceNativeInfo(
tentative_output_dir=top_args.output_directory,
apk_path=sub_args.apk_file,
elf_path=sub_args.elf_file or sub_args.aux_elf_file,
map_path=sub_args.map_file,
ignore_linker_map=sub_args.ignore_linker_map,
on_config_error=on_config_error)
if sub_args.ignore_linker_map:
sub_args.map_file = None
if opts.analyze_native:
if sub_args.map_file:
linker_name = _DetectLinkerName(sub_args.map_file)
logging.info('Linker name: %s', linker_name)
else:
# TODO(crbug.com/1193507): Remove when we implement string literal
# tracking without map files.
# nm emits some string literal symbols, but most exist in symbol gaps.
opts.track_string_literals = False
tool_prefix_finder = path_util.ToolPrefixFinder(
value=sub_args.tool_prefix,
output_directory=top_args.output_directory,
linker_name=linker_name)
sub_args.tool_prefix = tool_prefix_finder.Finalized()
else:
# Trust that these values will not be used, and set to None.
sub_args.elf_file = None
sub_args.map_file = None
apk_so_path = None
size_info_prefix = None
if top_args.output_directory and apk_prefix:
size_info_prefix = os.path.join(top_args.output_directory, 'size-info',
os.path.basename(apk_prefix))
# Need one or the other to have native symbols.
if not sub_args.elf_file and not sub_args.map_file:
opts.analyze_native = False
container_args = sub_args.__dict__.copy()
container_args.update(opts.__dict__)
logging.info('Container Params: %r', container_args)
return (sub_args, opts, container_name, apk_so_path, resources_pathmap_path,
linker_name, size_info_prefix)
def _IsOnDemand(apk_path):
# Parse the APK's manifest to check whether it is an on-demand feature split.
output = subprocess.check_output([
path_util.GetAapt2Path(), 'dump', 'xmltree', '--file',
'AndroidManifest.xml', apk_path
]).decode('ascii')
def parse_attr(name):
# http://schemas.android.com/apk/res/android:isFeatureSplit(0x0101055b)=true
# http://schemas.android.com/apk/distribution:onDemand=true
m = re.search(name + r'(?:\(.*?\))?=(\w+)', output)
return m and m.group(1) == 'true'
is_feature_split = parse_attr('android:isFeatureSplit')
# Can use <dist:on-demand>, or <module dist:onDemand="true">.
on_demand = parse_attr(
'distribution:onDemand') or 'distribution:on-demand' in output
on_demand = bool(on_demand and is_feature_split)
return on_demand
def _IterSubArgs(top_args, on_config_error):
"""Generates main paths (may be deduced) for each containers given by input.
Yields:
For each container, main paths and other info needed to create size_info.
"""
main_file = _IdentifyInputFile(top_args, on_config_error)
if top_args.no_output_directory:
top_args.output_directory = None
else:
output_directory_finder = path_util.OutputDirectoryFinder(
value=top_args.output_directory,
any_path_within_output_directory=main_file)
top_args.output_directory = output_directory_finder.Finalized()
if not top_args.source_directory:
top_args.source_directory = path_util.GetSrcRootFromOutputDirectory(
top_args.output_directory)
assert top_args.source_directory
if top_args.ssargs_file:
sub_args_list = _ReadMultipleArgsFromFile(top_args.ssargs_file,
on_config_error)
else:
sub_args_list = [top_args]
# Do a quick first pass to ensure inputs have been built.
for sub_args in sub_args_list:
main_file = _IdentifyInputFile(sub_args, on_config_error)
if not os.path.exists(main_file):
raise Exception('Input does not exist: ' + main_file)
# Each element in |sub_args_list| specifies a container.
for sub_args in sub_args_list:
main_file = _IdentifyInputFile(sub_args, on_config_error)
if hasattr(sub_args, 'name'):
container_name = sub_args.name
else:
container_name = os.path.basename(main_file)
if set(container_name) & set('<>?'):
on_config_error('Container name cannot contain characters in "<>?"')
# If needed, extract .apk file to a temp file and process that instead.
if sub_args.minimal_apks_file:
for module_name, _ in _GetModuleInfoList(sub_args.minimal_apks_file):
with zip_util.UnzipToTemp(
sub_args.minimal_apks_file,
'splits/{}-master.apk'.format(module_name)) as temp:
module_sub_args = copy.copy(sub_args)
module_sub_args.apk_file = temp
module_sub_args.split_name = module_name
module_sub_args.name = '{}/{}.apk'.format(container_name, module_name)
# Make on-demand a part of the name so that:
# * It's obvious from the name which DFMs are on-demand.
# * Diffs that change an on-demand status show as adds/removes.
if _IsOnDemand(temp):
module_sub_args.name += '?'
if module_name != 'base':
# TODO(crbug.com/1143690): Fix native analysis for split APKs.
module_sub_args.map_file = None
yield _ProcessContainerArgs(top_args, module_sub_args,
module_sub_args.name, on_config_error)
else:
yield _ProcessContainerArgs(top_args, sub_args, container_name,
on_config_error)
def Run(top_args, on_config_error):
if not top_args.size_file.endswith('.size'):
on_config_error('size_file must end with .size')
if top_args.check_data_quality:
start_time = time.time()
knobs = SectionSizeKnobs()
build_config = {}
seen_container_names = set()
container_list = []
raw_symbols_list = []
# Iterate over each container.
for (sub_args, opts, container_name, apk_so_path, resources_pathmap_path,
linker_name, size_info_prefix) in _IterSubArgs(top_args,
on_config_error):
if container_name in seen_container_names:
raise ValueError('Duplicate container name: {}'.format(container_name))
seen_container_names.add(container_name)
metadata = CreateMetadata(sub_args, linker_name, build_config)
container, raw_symbols = CreateContainerAndSymbols(
knobs=knobs,
opts=opts,
container_name=container_name,
metadata=metadata,
map_path=sub_args.map_file,
tool_prefix=sub_args.tool_prefix,
elf_path=sub_args.elf_file,
apk_path=sub_args.apk_file,
mapping_path=sub_args.mapping_path,
output_directory=sub_args.output_directory,
source_directory=sub_args.source_directory,
resources_pathmap_path=resources_pathmap_path,
apk_so_path=apk_so_path,
pak_files=sub_args.pak_file,
pak_info_file=sub_args.pak_info_file,
linker_name=linker_name,
size_info_prefix=size_info_prefix)
container_list.append(container)
raw_symbols_list.append(raw_symbols)
size_info = CreateSizeInfo(build_config,
container_list,
raw_symbols_list,
normalize_names=False)
if logging.getLogger().isEnabledFor(logging.DEBUG):
for line in data_quality.DescribeSizeInfoCoverage(size_info):
logging.debug(line)
logging.info('Recorded info for %d symbols', len(size_info.raw_symbols))
for container in size_info.containers:
logging.info('Recording metadata: \n %s',
'\n '.join(describe.DescribeDict(container.metadata)))
logging.info('Saving result to %s', top_args.size_file)
file_format.SaveSizeInfo(size_info,
top_args.size_file,
include_padding=top_args.include_padding)
size_in_mib = os.path.getsize(top_args.size_file) / 1024.0 / 1024.0
logging.info('Done. File size is %.2fMiB.', size_in_mib)
if top_args.check_data_quality:
logging.info('Checking data quality')
data_quality.CheckDataQuality(size_info, top_args.track_string_literals)
duration = (time.time() - start_time) / 60
if duration > 10:
raise data_quality.QualityCheckError(
'Command should not take longer than 10 minutes.'
' Took {:.1f} minutes.'.format(duration))