| # Copyright 2018 The Chromium Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Runs apkanalyzer to parse dex files in an apk. |
| |
| Assumes that apk_path.mapping and apk_path.jar.info are available. |
| """ |
| |
| import collections |
| import functools |
| import itertools |
| import logging |
| import os |
| import posixpath |
| import re |
| import subprocess |
| import zipfile |
| |
| import archive_util |
| import dalvik_bytecode |
| import dex_parser |
| import models |
| import parallel |
| import path_util |
| import string_extract |
| |
| _TOTAL_NODE_NAME = '<TOTAL>' |
| |
| # A limit on the number of aliases a DEX string literal can have before its |
| # symbols are compacted into a shared symbol. Increasing this value causes |
| # more data to be stored in .size files, but is also more expensive. |
| # Effect as of Nov 2022 (run on TrichromeGoogle.ssargs with --java-only): |
| # 1: shared syms = 117811 bytes, file size = 3385635 (33630 syms). |
| # 2: shared syms = 39689 bytes, file size = 3408845 (36843 syms). |
| # 3: shared syms = 17831 bytes, file size = 3419021 (38553 syms). |
| # 5: shared syms = 6874 bytes, file size = 3425173 (40097 syms). |
| # 6: shared syms = 5098 bytes, file size = 3427458 (40597 syms). |
| # 8: shared syms = 3370 bytes, file size = 3429819 (41208 syms). |
| # 10: shared syms = 2250 bytes, file size = 3431944 (41720 syms). |
| # 20: shared syms = 587 bytes, file size = 3435466 (42983 syms). |
| # 40: shared syms = 204 bytes, file size = 3439084 (43909 syms). |
| # max: shared syms = 0 bytes, file size = 3446275 (46315 syms). |
| # Going with 6, i.e., string literals with > 6 aliases are combined into a |
| # shared symbol. So 46315 - 40597 = 5718, or ~12% of original syms are removed, |
| # at the cost of leaving ~5100 bytes of binary size unresolved into aliases. |
| _DEX_STRING_MAX_SAME_NAME_ALIAS_COUNT = 6 |
| |
| # Synthetics that map 1:1 with the class they are a suffix on. |
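| # E.g. 'Foo$$ExternalSyntheticLambda0' is attributed to outer class 'Foo'. |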
| _CLASS_SPECIFIC_SYNTHETICS = ( |
| 'ExternalSyntheticLambda', |
| 'ExternalSyntheticApiModelOutline', |
| 'ExternalSyntheticServiceLoad', |
| 'Lambda', |
| ) |
| |
| |
| def _ParseJarInfoFile(file_name): |
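|   """Parses a .jar.info file into a {package_path: file_path} dict. |
| |
|   Each line has the form '<package path>,<file path>' (split on the first |
|   comma). |
|   """ |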
| with open(file_name, 'r') as info: |
| source_map = dict() |
| for line in info: |
| package_path, file_path = line.strip().split(',', 1) |
| source_map[package_path] = file_path |
| return source_map |
| |
| |
| def RunApkAnalyzerAsync(apk_path, mapping_path): |
| """Starts an apkanalyzer job for the given apk. |
| |
| Args: |
| apk_path: Path to the apk to run on. |
| mapping_path: Path to the proguard mapping file. |
| |
| Returns: |
| An object to pass to CreateDexSymbols(). |
| """ |
| args = [path_util.GetApkAnalyzerPath(), 'dex', 'packages', apk_path] |
| if mapping_path and os.path.exists(mapping_path): |
| args.extend(['--proguard-mappings', mapping_path]) |
| env = os.environ.copy() |
| env['JAVA_HOME'] = path_util.GetJavaHome() |
| |
|   # Use a thread rather than directly using a Popen instance so that stdout |
|   # is read from as the process runs (lest the pipe buffer fill and block). |
| return parallel.CallOnThread(subprocess.run, |
| args, |
| env=env, |
| encoding='utf-8', |
| capture_output=True, |
| check=True) |
| |
| |
| def _ParseApkAnalyzerOutput(stdout, stderr): |
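|   """Parses apkanalyzer stdout into a list of (node_type, name, size) tuples. |
| |
|   Raises an Exception if |stderr| contains anything unexpected. |
|   """ |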
| stderr = re.sub(r'Successfully loaded.*?\n', '', stderr) |
| if stderr.strip(): |
| raise Exception('Unexpected stderr:\n' + stderr) |
| data = [] |
| for line in stdout.splitlines(): |
| try: |
| vals = line.split() |
| # We want to name these columns so we know exactly which is which. |
| # pylint: disable=unused-variable |
| node_type, state, defined_methods, referenced_methods, size, name = ( |
| vals[0], vals[1], vals[2], vals[3], vals[4], vals[5:]) |
| data.append((node_type, ' '.join(name), int(size))) |
| except Exception: |
| logging.error('Problem line was: %s', line) |
| raise |
| return data |
| |
| |
| # VisibleForTesting |
| def UndoHierarchicalSizing(data): |
| """Subtracts child node sizes from parent nodes. |
| |
|   Note that inner classes are considered siblings rather than child nodes. |
| |
| Example nodes: |
| [ |
| ('P', '<TOTAL>', 37), |
| ('P', 'org', 32), |
| ('P', 'org.chromium', 32), |
| ('C', 'org.chromium.ClassA', 14), |
| ('M', 'org.chromium.ClassA void methodA()', 10), |
| ('C', 'org.chromium.ClassA$Proxy', 8), |
| ] |
| |
| Processed nodes: |
| [ |
| ('<TOTAL>', 15), |
| ('org.chromium.ClassA', 4), |
| ('org.chromium.ClassA void methodA()', 10), |
| ('org.chromium.ClassA$Proxy', 8), |
| ] |
| """ |
| num_nodes = len(data) |
| nodes = [] |
| |
| def process_node(start_idx): |
| assert start_idx < num_nodes, 'Attempting to parse beyond data array.' |
| node_type, name, size = data[start_idx] |
| total_child_size = 0 |
| next_idx = start_idx + 1 |
| name_len = len(name) |
| while next_idx < num_nodes: |
| next_name = data[next_idx][1] |
| if name == _TOTAL_NODE_NAME or ( |
| len(next_name) > name_len and next_name.startswith(name) |
| and next_name[name_len] in '. '): |
| # Child node |
| child_next_idx, child_node_size = process_node(next_idx) |
| next_idx = child_next_idx |
| total_child_size += child_node_size |
| else: |
| # Sibling or higher nodes |
| break |
| |
| # Apkanalyzer may overcount private method sizes at times. Unfortunately |
| # the fix is not in the version we have in Android SDK Tools. For now we |
| # prefer to undercount child sizes since the parent's size is more |
|     # accurate. This means the sum of child node sizes may exceed the |
|     # immediate parent node's size. |
| total_child_size = min(size, total_child_size) |
| # TODO(wnwen): Add assert back once dexlib2 2.2.5 is released and rolled. |
| #assert total_child_size <= size, ( |
| # 'Child node total size exceeded parent node total size') |
| |
| node_size = size - total_child_size |
| # It is valid to have a package and a class with the same name. |
| # To avoid having two symbols with the same name in these cases, do not |
|     # create symbols for packages (which have no size anyway). |
| if node_type == 'P' and node_size != 0 and name != _TOTAL_NODE_NAME: |
| logging.warning('Unexpected java package that takes up size: %d, %s', |
| node_size, name) |
| if node_type != 'P' or node_size != 0: |
| nodes.append((node_type, name, node_size)) |
| return next_idx, size |
| |
| idx = 0 |
| while idx < num_nodes: |
| idx = process_node(idx)[0] |
| return nodes |
| |
| |
| def _TruncateFrom(value, delimiter, rfind=False): |
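|   """Truncates |value| at the first (or last, when |rfind|) |delimiter|. |
| |
|   E.g. _TruncateFrom('a.b$Inner', '$') returns 'a.b'. Returns |value| |
|   unchanged when |delimiter| is absent. |
|   """ |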
| idx = value.rfind(delimiter) if rfind else value.find(delimiter) |
| if idx != -1: |
| return value[:idx] |
| return value |
| |
| |
| def _NormalizeName(orig_name): |
| """Extracts outer class name and normalizes names with hashes in them. |
| |
| Returns: |
| outer_class: The outer class. Example: package.Class |
| Returns None for classes that are outlines. |
| new_name: Normalized name. |
| """ |
| # May be reassigned by one of the cases below. |
| outer_class = _TruncateFrom(orig_name, '$') |
| |
|   # $$ is the convention for synthetic classes, and all known desugared |
|   # lambdas follow it. |
| synthetic_marker_idx = orig_name.find('$$') |
| if synthetic_marker_idx == -1: |
| return outer_class, orig_name |
| |
| synthetic_part = orig_name[synthetic_marker_idx + 2:] |
| |
| # Example: package.Cls$$InternalSyntheticLambda$0$81073ff6$0 |
| if synthetic_part.startswith('InternalSyntheticLambda$'): |
| next_dollar_idx = orig_name.index('$', |
| synthetic_marker_idx + len('$$InternalSyntheticLambda$')) |
| return outer_class, orig_name[:next_dollar_idx] |
| |
|   # Ensure we notice if a new type of InternalSynthetic pops up, e.g. to see |
|   # if it follows the same naming scheme. |
| assert not synthetic_part.startswith('Internal'), f'Unrecognized: {orig_name}' |
| |
| if synthetic_part.startswith(_CLASS_SPECIFIC_SYNTHETICS): |
| return outer_class, orig_name |
| |
| return None, orig_name |
| |
| |
| def NormalizeLine(orig_name, full_name): |
|   """Normalizes a line from apkanalyzer output. |
| |
|   Args: |
| orig_name: The original name from apkanalyzer output. |
| full_name: The full name of the symbol. |
| Returns: |
| outer_class: The outer class. Example: package.Class |
| Returns None for classes that are outlines. |
| new_full_name: Normalized full name. |
| """ |
| # See tests for a more comprehensive list of what d8 currently generates. |
| outer_class, new_name = _NormalizeName(orig_name) |
| if new_name is not orig_name: |
| full_name = full_name.replace(orig_name, new_name) |
| return outer_class, full_name |
| |
| |
| def _MakeDexObjectPath(package_name, is_outlined): |
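|   """Returns the object path for symbols within |package_name|. |
| |
|   E.g. 'a.b.C' maps to models.APK_PREFIX_PATH + '/a/b/C', or under |
|   models.OUTLINES_PREFIX_PATH when |is_outlined|. |
|   """ |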
| if is_outlined: |
| # Create a special meta-directory for outlined lambdas to easily monitor |
| # their total size and spot regressions. |
| return posixpath.join(models.OUTLINES_PREFIX_PATH, *package_name.split('.')) |
| return posixpath.join(models.APK_PREFIX_PATH, *package_name.split('.')) |
| |
| |
| # Visible for testing. |
| def CreateDexSymbol(name, size, source_map): |
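|   """Creates a models.Symbol from an apkanalyzer node name. |
| |
|   E.g. a method node name 'a.B void m()' yields a SECTION_DEX_METHOD symbol, |
|   while a class node name 'a.B' yields a SECTION_DEX symbol. Returns None for |
|   the <TOTAL> node. |
|   """ |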
| parts = name.split(' ') # (class_name, return_type, method_name) |
| new_package = parts[0] |
| |
| if new_package == _TOTAL_NODE_NAME: |
| return None |
| |
| outer_class, name = NormalizeLine(new_package, name) |
| |
| # Look for class merging. |
| old_package = new_package |
| # len(parts) == 2 for class nodes. |
| if len(parts) > 2: |
| method = parts[2] |
| # last_idx == -1 for fields, which is fine. |
| last_idx = method.find('(') |
| last_idx = method.rfind('.', 0, last_idx) |
| if last_idx != -1: |
| old_package = method[:last_idx] |
| |
| # TODO(b/333617478): Delete this work-around when R8 mapping files no |
| # longer output this pattern. |
| suspect_class_name = old_package |
| if suspect_class_name.startswith('WV.'): |
| suspect_class_name = suspect_class_name[3:] |
| if ('.' not in suspect_class_name |
| and new_package.endswith(f'.{suspect_class_name}')): |
| name = name.replace(f' {old_package}.', ' ') |
| old_package = new_package |
| else: |
| # Non-workaround case: |
| outer_class, name = NormalizeLine(old_package, name) |
| |
|   is_outlined = outer_class is None |
| object_path = _MakeDexObjectPath(old_package, is_outlined) |
| if name.endswith(')'): |
| section_name = models.SECTION_DEX_METHOD |
| else: |
| section_name = models.SECTION_DEX |
| |
| source_path = source_map.get(outer_class, '') |
| return models.Symbol(section_name, |
| size, |
| full_name=name, |
| object_path=object_path, |
| source_path=source_path) |
| |
| |
| def _SymbolsFromNodes(nodes, source_map): |
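|   """Returns (method_symbols, other_symbols) lists, each sorted by full_name.""" |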
| # Use (DEX_METHODS, DEX) buckets to speed up sorting. |
| symbol_buckets = ([], []) |
| for _, name, node_size in nodes: |
| symbol = CreateDexSymbol(name, node_size, source_map) |
| if symbol: |
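|       # Bucket 0 holds SECTION_DEX_METHOD symbols; bucket 1 holds SECTION_DEX. |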
| bucket_index = int(symbol.section_name is models.SECTION_DEX) |
| symbol_buckets[bucket_index].append(symbol) |
| for symbols_bucket in symbol_buckets: |
| symbols_bucket.sort(key=lambda s: s.full_name) |
| return symbol_buckets |
| |
| |
| def _GenDexStringsUsedByClasses(dexfile, class_deobfuscation_map): |
| """Emit strings used in code_items and associate them with classes. |
| |
| Args: |
| dexfile: A DexFile instance. |
| class_deobfuscation_map: Map from obfuscated names to class names. |
| |
| Yields: |
| string_idx: DEX string index. |
| size: Number of bytes taken by string, including pointer. |
| decoded_string: The decoded string. |
| class_names: List of class names |
| """ |
| if not dexfile or not dexfile.code_item_list: |
| return |
| |
| # Helper to deobfuscate class names while converting 'LFoo;' -> 'Foo'. |
| num_bad_name = 0 |
| num_deobfus_names = 0 |
| num_failed_deobfus = 0 |
| |
| @functools.lru_cache(None) |
| def LookupDeobfuscatedClassNames(class_def_idx): |
| nonlocal num_bad_name, num_deobfus_names, num_failed_deobfus |
| class_def_item = dexfile.class_def_item_list[class_def_idx] |
| name = dexfile.GetTypeString(class_def_item.class_idx) |
| if not (name.startswith('L') and name.endswith(';')): |
| num_bad_name += 1 |
| return name |
| # Change "L{X};" to "{X}", and convert path name to class name. |
| name = name[1:-1].replace('/', '.') |
| deobfuscated_name = class_deobfuscation_map.get(name, None) |
| if deobfuscated_name is not None: |
| name = deobfuscated_name |
| num_deobfus_names += 1 |
| else: |
| num_failed_deobfus += 1 |
| return name |
| |
|   # Precompute map from code item offsets to set of string ids used. |
| code_off_to_used_string_ids = { |
| code_item.offset: set(dexfile.IterAllStringIdsUsedByCodeItem(code_item)) |
| for code_item in dexfile.code_item_list |
| } |
| code_off_to_used_string_ids[0] = set() # Offset 0 = No code. |
| |
|   # Walk code for each class and each method, marking string usages. |
| string_idx_to_class_idxs = collections.defaultdict(set) |
| for i, class_item in enumerate(dexfile.class_def_item_list): |
| string_idxs_used_by_class = set() |
| class_data_item = dexfile.GetClassDataItemByOffset( |
| class_item.class_data_off) |
| if class_data_item: |
| for encoded_method in itertools.chain(class_data_item.direct_methods, |
| class_data_item.virtual_methods): |
| code_off = encoded_method.code_off |
| string_idxs_used_by_class |= code_off_to_used_string_ids[code_off] |
| for string_idx in string_idxs_used_by_class: |
| string_idx_to_class_idxs[string_idx].add(i) |
| |
|   # Emit each string used by code, with names of classes that use it. Both are |
|   # sorted to maintain consistency. |
| for string_idx in sorted(string_idx_to_class_idxs): |
| string_item = dexfile.string_data_item_list[string_idx] |
| size = string_item.byte_size + 4 # +4 for pointer. |
| decoded_string = string_item.data |
| class_idxs = string_idx_to_class_idxs[string_idx] |
| class_names = sorted(LookupDeobfuscatedClassNames(i) for i in class_idxs) |
| yield string_idx, size, decoded_string, class_names |
| |
| logging.info('Deobfuscated %d / %d classes (%d failures)', num_deobfus_names, |
| len(dexfile.class_def_item_list), num_failed_deobfus) |
| if num_bad_name > 0: |
|     logging.warning('Found %d class names not formatted as "L.*;".', |
|                     num_bad_name) |
| |
| |
| def _MakeFakeSourcePath(class_name): |
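|   """E.g. 'a.b.C' -> models.APK_PREFIX_PATH + '/a/b/C'.""" |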
| class_path = class_name.replace('.', '/') |
| return f'{models.APK_PREFIX_PATH}/{class_path}' |
| |
| |
| def _StringSymbolsFromDexFile(apk_path, dexfile, source_map, |
| class_deobfuscation_map): |
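|   """Creates symbols for DEX string literals: per-class aliases + aggregates.""" |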
| if not dexfile: |
| return [] |
|   logging.info('Extracting string symbols from %s', apk_path) |
| |
| # Code strings: Strings accessed via class -> method -> code -> string. |
|   # These become separate symbols, aliased among the classes that use them. |
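|   # fresh_string_idx_set tracks strings not yet attributed to any symbol. |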
| fresh_string_idx_set = set(range(len(dexfile.string_data_item_list))) |
| object_path = str(apk_path) |
| dex_string_symbols = [] |
| string_iter = _GenDexStringsUsedByClasses(dexfile, class_deobfuscation_map) |
| for string_idx, size, decoded_string, string_user_class_names in string_iter: |
| fresh_string_idx_set.remove(string_idx) |
| num_aliases = len(string_user_class_names) |
| aliases = [] |
| for class_name in string_user_class_names: |
| outer_class, class_name = _NormalizeName(class_name) |
| full_name = string_extract.GetNameOfStringLiteralBytes( |
| decoded_string.encode('utf-8', errors='surrogatepass')) |
| source_path = (source_map.get(outer_class, '') |
| or _MakeFakeSourcePath(class_name)) |
| sym = models.Symbol(models.SECTION_DEX, |
| size, |
| full_name=full_name, |
| object_path=object_path, |
| source_path=source_path, |
| aliases=aliases if num_aliases > 1 else None) |
| aliases.append(sym) |
| assert num_aliases == len(aliases) |
| dex_string_symbols += aliases |
| |
| logging.info('Counted %d class -> method -> code strings', |
| len(dexfile.string_data_item_list) - len(fresh_string_idx_set)) |
| |
| # Extract aggregate string symbols for {types, methods, fields, prototypes}. |
| # Due to significant overlap (coincidental or induced by R8), {method, field} |
|   # string symbols share a common aggregate. Other overlaps are resolved by |
| # applying the priority: |
| # code > type > {method, field} > prototype, |
| # i.e., bytes from code strings are not counted in aggregates; bytes from type |
| # string aggregate are not counted by {{method, field}, prototype}, etc. |
| |
| def _AddAggregateStringSymbol(name, string_idx_set): |
| nonlocal fresh_string_idx_set |
| old_count = len(string_idx_set) |
| string_idx_set &= fresh_string_idx_set |
| fresh_string_idx_set -= string_idx_set |
| logging.info('Counted %d %s strings among %d found', len(string_idx_set), |
| name, old_count) |
| if string_idx_set: |
| # Each string has +4 for pointer. |
| size = sum(dexfile.string_data_item_list[string_idx].byte_size |
| for string_idx in string_idx_set) + 4 * len(string_idx_set) |
| sym = models.Symbol(models.SECTION_DEX, |
| size, |
| full_name=f'** .dex ({name} strings)') |
| dex_string_symbols.append(sym) |
| |
| # Type strings. |
| type_string_idx_set = {i.descriptor_idx for i in dexfile.type_id_item_list} |
| _AddAggregateStringSymbol('type', type_string_idx_set) |
| |
| # Method and field strings. |
| method_string_idx_set = {i.name_idx for i in dexfile.method_id_item_list} |
| field_string_idx_set = {i.name_idx for i in dexfile.field_id_item_list} |
| _AddAggregateStringSymbol('method and field', |
| method_string_idx_set | field_string_idx_set) |
| |
| # Prototype strings. |
| proto_string_idx_set = {i.shorty_idx for i in dexfile.proto_id_item_list} |
| _AddAggregateStringSymbol('prototype', proto_string_idx_set) |
| |
| return dex_string_symbols |
| |
| |
| def _ParseDexfilesInApk(apk_path): |
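|   """Yields (filename, dex_parser.DexFile) for each classes*.dex in the apk.""" |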
| with zipfile.ZipFile(apk_path) as src_zip: |
| dex_infos = [ |
| info for info in src_zip.infolist() if |
| info.filename.startswith('classes') and info.filename.endswith('.dex') |
| ] |
| # Assume sound and stable ordering of DEX filenames. |
| for dex_info in dex_infos: |
| dex_data = src_zip.read(dex_info) |
| yield dex_info.filename, dex_parser.DexFile(dex_data) |
| |
| |
| def CreateDexSymbols(apk_path, apk_analyzer_async_result, dex_total_size, |
| class_deobfuscation_map, size_info_prefix, |
| track_string_literals): |
| """Creates DEX symbols given apk_analyzer output. |
| |
| Args: |
| apk_path: Path to the APK containing the DEX file. |
| apk_analyzer_async_result: Return value from RunApkAnalyzerAsync(). |
| dex_total_size: Sum of the sizes of all .dex files in the apk. |
| class_deobfuscation_map: Map from obfuscated names to class names. |
| size_info_prefix: Path such as: out/Release/size-info/BaseName. |
| track_string_literals: Create symbols for string literals. |
| |
| Returns: |
| A tuple of (section_ranges, raw_symbols, metrics_by_file), where |
| metrics_by_file is a dict from DEX file name to a dict of |
| {metric_name: value}. |
| """ |
| logging.debug('Waiting for apkanalyzer to finish') |
| apk_analyzer_result = apk_analyzer_async_result.get() |
| logging.debug('Analyzing DEX - processing results') |
| if size_info_prefix: |
| source_map = _ParseJarInfoFile(size_info_prefix + '.jar.info') |
| else: |
| source_map = dict() |
| |
| nodes = _ParseApkAnalyzerOutput(apk_analyzer_result.stdout, |
| apk_analyzer_result.stderr) |
| nodes = UndoHierarchicalSizing(nodes) |
| |
|   total_node_size = sum(x[2] for x in nodes) |
| # TODO(agrieve): Figure out why this log is triggering for |
| # ChromeModernPublic.apk (https://crbug.com/851535). |
| # Reporting: dex_total_size=6546088 total_node_size=6559549 |
| if dex_total_size < total_node_size: |
| logging.error( |
| 'Node size too large, check for node processing errors. ' |
| 'dex_total_size=%d total_node_size=%d', dex_total_size, total_node_size) |
| |
| dex_method_symbols, dex_other_symbols = _SymbolsFromNodes(nodes, source_map) |
| dex_string_symbols = [] |
| metrics_by_file = {} |
| for dex_path, dexfile in _ParseDexfilesInApk(apk_path): |
| logging.debug('Found DEX: %r', dex_path) |
| if track_string_literals: |
| dex_string_symbols += _StringSymbolsFromDexFile(apk_path, dexfile, |
| source_map, |
| class_deobfuscation_map) |
| map_item_sizes = dexfile.ComputeMapItemSizes() |
| metrics = {} |
| for item in map_item_sizes: |
| metrics[f'{models.METRICS_SIZE}/' + item['name']] = item['byte_size'] |
| metrics[f'{models.METRICS_COUNT}/' + item['name']] = item['size'] |
| metrics_by_file[dex_path] = metrics |
| |
| if dex_string_symbols: |
| logging.info('Converting excessive DEX string aliases into shared-path ' |
| 'symbols') |
| archive_util.CompactLargeAliasesIntoSharedSymbols( |
| dex_string_symbols, _DEX_STRING_MAX_SAME_NAME_ALIAS_COUNT) |
| |
| dex_method_size = round(sum(s.pss for s in dex_method_symbols)) |
| dex_other_size = round(sum(s.pss for s in dex_other_symbols)) |
| dex_other_size += round(sum(s.pss for s in dex_string_symbols)) |
| unattributed_dex = dex_total_size - dex_method_size - dex_other_size |
| # Compare against -5 instead of 0 to guard against round-off errors. |
| assert unattributed_dex >= -5, ( |
| 'sum(dex_symbols.size) > sum(filesize(dex file)). {} vs {}'.format( |
| dex_method_size + dex_other_size, dex_total_size)) |
| |
| if unattributed_dex > 0: |
| dex_other_symbols.append( |
| models.Symbol( |
| models.SECTION_DEX, |
| unattributed_dex, |
| full_name='** .dex (unattributed)')) |
| |
| dex_other_symbols.extend(dex_method_symbols) |
| dex_other_symbols.extend(dex_string_symbols) |
| |
| # We can't meaningfully track section size of dex methods vs other, so |
| # just fake the size of dex methods as the sum of symbols, and make |
| # "dex other" responsible for any unattributed bytes. |
| section_ranges = { |
| models.SECTION_DEX_METHOD: (0, dex_method_size), |
| models.SECTION_DEX: (0, dex_total_size - dex_method_size), |
| } |
| |
| return section_ranges, dex_other_symbols, metrics_by_file |