| # Copyright 2018 The Chromium Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Runs apkanalyzer to parse dex files in an apk. |
| |
| Assumes that apk_path.mapping and apk_path.jar.info are available. |
| """ |
| |
| import collections |
| import functools |
| import itertools |
| import logging |
| import os |
| import posixpath |
| import re |
| import subprocess |
| import zipfile |
| |
| import archive_util |
| import dalvik_bytecode |
| import dex_parser |
| import models |
| import parallel |
| import path_util |
| import string_extract |
| |
| _TOTAL_NODE_NAME = '<TOTAL>' |
| |
| # A limit on the number of aliases a DEX string literal can have before its |
| # symbols are compacted into a shared symbol. Increasing this value causes |
| # more data to be stored in .size files, but is also more expensive. |
| # Effect as of Nov 2022 (run on TrichromeGoogle.ssargs with --java-only): |
| # 1: shared syms = 117811 bytes, file size = 3385635 (33630 syms). |
| # 2: shared syms = 39689 bytes, file size = 3408845 (36843 syms). |
| # 3: shared syms = 17831 bytes, file size = 3419021 (38553 syms). |
| # 5: shared syms = 6874 bytes, file size = 3425173 (40097 syms). |
| # 6: shared syms = 5098 bytes, file size = 3427458 (40597 syms). |
| # 8: shared syms = 3370 bytes, file size = 3429819 (41208 syms). |
| # 10: shared syms = 2250 bytes, file size = 3431944 (41720 syms). |
| # 20: shared syms = 587 bytes, file size = 3435466 (42983 syms). |
| # 40: shared syms = 204 bytes, file size = 3439084 (43909 syms). |
| # max: shared syms = 0 bytes, file size = 3446275 (46315 syms). |
| # Going with 6, i.e., string literals with > 6 aliases are combined into a |
| # shared symbol. So 46315 - 40597 = 5718, or ~12% of original syms are removed, |
| # at the cost of leaving ~5100 bytes of binary size unresolved into aliases. |
| _DEX_STRING_MAX_SAME_NAME_ALIAS_COUNT = 6 |
| |
| # Synthetics that map 1:1 with the class they are a suffix on. |
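| # E.g. 'Foo$$ExternalSyntheticLambda0' is attributed to outer class 'Foo'. |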
| _CLASS_SPECIFIC_SYNTHETICS = ( |
| 'ExternalSyntheticLambda', |
| 'ExternalSyntheticApiModelOutline', |
| 'ExternalSyntheticServiceLoad', |
| 'Lambda', |
| ) |
| |
| |
| def _ParseJarInfoFile(file_name): |
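|   """Parses a .jar.info file into a {package_path: file_path} dict. |
| |
|   Each line has the form '<package path>,<file path>' (split on the first |
|   comma). |
|   """ |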
| with open(file_name, 'r') as info: |
| source_map = dict() |
| for line in info: |
| package_path, file_path = line.strip().split(',', 1) |
| source_map[package_path] = file_path |
| return source_map |
| |
| |
| def RunApkAnalyzerAsync(apk_path, mapping_path): |
| """Starts an apkanalyzer job for the given apk. |
| |
| Args: |
| apk_path: Path to the apk to run on. |
| mapping_path: Path to the proguard mapping file. |
| |
| Returns: |
| An object to pass to CreateDexSymbols(). |
| """ |
| args = [path_util.GetApkAnalyzerPath(), 'dex', 'packages', apk_path] |
| if mapping_path and os.path.exists(mapping_path): |
| args.extend(['--proguard-mappings', mapping_path]) |
| env = os.environ.copy() |
| env['JAVA_HOME'] = path_util.GetJavaHome() |
| |
|   # Use a thread rather than directly using a Popen instance so that stdout |
|   # is read from as the process runs (lest the pipe buffer fill and block). |
| return parallel.CallOnThread(subprocess.run, |
| args, |
| env=env, |
| encoding='utf-8', |
| capture_output=True, |
| check=True) |
| |
| |
| def _ParseApkAnalyzerOutput(stdout, stderr): |
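|   """Parses apkanalyzer stdout into a list of (node_type, name, size) tuples. |
| |
|   Raises an Exception if |stderr| contains anything unexpected. |
|   """ |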
| stderr = re.sub(r'Successfully loaded.*?\n', '', stderr) |
| if stderr.strip(): |
| raise Exception('Unexpected stderr:\n' + stderr) |
| data = [] |
| for line in stdout.splitlines(): |
| try: |
| vals = line.split() |
| # We want to name these columns so we know exactly which is which. |
| # pylint: disable=unused-variable |
| node_type, state, defined_methods, referenced_methods, size, name = ( |
| vals[0], vals[1], vals[2], vals[3], vals[4], vals[5:]) |
| data.append((node_type, ' '.join(name), int(size))) |
| except Exception: |
| logging.error('Problem line was: %s', line) |
| raise |
| return data |
| |
| |
| # VisibleForTesting |
| def UndoHierarchicalSizing(data): |
| """Subtracts child node sizes from parent nodes. |
| |
|   Note that inner classes are considered siblings rather than child nodes. |
| |
| Example nodes: |
| [ |
| ('P', '<TOTAL>', 37), |
| ('P', 'org', 32), |
| ('P', 'org.chromium', 32), |
| ('C', 'org.chromium.ClassA', 14), |
| ('M', 'org.chromium.ClassA void methodA()', 10), |
| ('C', 'org.chromium.ClassA$Proxy', 8), |
| ] |
| |
| Processed nodes: |
| [ |
| ('<TOTAL>', 15), |
| ('org.chromium.ClassA', 4), |
| ('org.chromium.ClassA void methodA()', 10), |
| ('org.chromium.ClassA$Proxy', 8), |
| ] |
| """ |
| num_nodes = len(data) |
| nodes = [] |
| |
| def process_node(start_idx): |
| assert start_idx < num_nodes, 'Attempting to parse beyond data array.' |
| node_type, name, size = data[start_idx] |
| total_child_size = 0 |
| next_idx = start_idx + 1 |
| name_len = len(name) |
| while next_idx < num_nodes: |
| next_name = data[next_idx][1] |
| if name == _TOTAL_NODE_NAME or ( |
| len(next_name) > name_len and next_name.startswith(name) |
| and next_name[name_len] in '. '): |
| # Child node |
| child_next_idx, child_node_size = process_node(next_idx) |
| next_idx = child_next_idx |
| total_child_size += child_node_size |
| else: |
| # Sibling or higher nodes |
| break |
| |
| # Apkanalyzer may overcount private method sizes at times. Unfortunately |
| # the fix is not in the version we have in Android SDK Tools. For now we |
| # prefer to undercount child sizes since the parent's size is more |
|     # accurate. This means the sum of child node sizes may exceed the |
|     # immediate parent node's size. |
| total_child_size = min(size, total_child_size) |
| # TODO(wnwen): Add assert back once dexlib2 2.2.5 is released and rolled. |
| #assert total_child_size <= size, ( |
| # 'Child node total size exceeded parent node total size') |
| |
| node_size = size - total_child_size |
| # It is valid to have a package and a class with the same name. |
| # To avoid having two symbols with the same name in these cases, do not |
|     # create symbols for packages (which have no size anyway). |
| if node_type == 'P' and node_size != 0 and name != _TOTAL_NODE_NAME: |
| logging.warning('Unexpected java package that takes up size: %d, %s', |
| node_size, name) |
| if node_type != 'P' or node_size != 0: |
| nodes.append((node_type, name, node_size)) |
| return next_idx, size |
| |
| idx = 0 |
| while idx < num_nodes: |
| idx = process_node(idx)[0] |
| return nodes |
| |
| |
| def _TruncateFrom(value, delimiter, rfind=False): |
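|   """Truncates |value| at the first (or last, when |rfind|) |delimiter|. |
| |
|   E.g. _TruncateFrom('a.b$Inner', '$') returns 'a.b'. Returns |value| |
|   unchanged when |delimiter| is absent. |
|   """ |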
| idx = value.rfind(delimiter) if rfind else value.find(delimiter) |
| if idx != -1: |
| return value[:idx] |
| return value |
| |
| |
| def _NormalizeName(orig_name): |
| """Extracts outer class name and normalizes names with hashes in them. |
| |
| Returns: |
| outer_class: The outer class. Example: package.Class |
| Returns None for classes that are outlines. |
| new_name: Normalized name. |
| """ |
| # May be reassigned by one of the cases below. |
| outer_class = _TruncateFrom(orig_name, '$') |
| |
|   # $$ is the convention for synthetic classes, and all known desugared |
|   # lambdas follow it. |
| synthetic_marker_idx = orig_name.find('$$') |
| if synthetic_marker_idx == -1: |
| return outer_class, orig_name |
| |
| synthetic_part = orig_name[synthetic_marker_idx + 2:] |
| |
| # Example: package.Cls$$InternalSyntheticLambda$0$81073ff6$0 |
| if synthetic_part.startswith('InternalSyntheticLambda$'): |
| next_dollar_idx = orig_name.index('$', |
| synthetic_marker_idx + len('$$InternalSyntheticLambda$')) |
| return outer_class, orig_name[:next_dollar_idx] |
| |
|   # Ensure we notice if a new type of InternalSynthetic pops up, e.g. to see |
|   # if it follows the same naming scheme. |
| assert not synthetic_part.startswith('Internal'), f'Unrecognized: {orig_name}' |
| |
| if synthetic_part.startswith(_CLASS_SPECIFIC_SYNTHETICS): |
| return outer_class, orig_name |
| |
| return None, orig_name |
| |
| |
| def NormalizeLine(orig_name, full_name): |
|   """Normalizes a line from apkanalyzer output. |
| |
|   Args: |
| orig_name: The original name from apkanalyzer output. |
| full_name: The full name of the symbol. |
| Returns: |
| outer_class: The outer class. Example: package.Class |
| Returns None for classes that are outlines. |
| new_full_name: Normalized full name. |
| """ |
| # See tests for a more comprehensive list of what d8 currently generates. |
| outer_class, new_name = _NormalizeName(orig_name) |
| if new_name is not orig_name: |
| full_name = full_name.replace(orig_name, new_name) |
| return outer_class, full_name |
| |
| |
| def _MakeDexObjectPath(package_name, is_outlined): |
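|   """Returns the object path for symbols within |package_name|. |
| |
|   E.g. 'a.b.C' maps to models.APK_PREFIX_PATH + '/a/b/C', or under |
|   models.OUTLINES_PREFIX_PATH when |is_outlined|. |
|   """ |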
| if is_outlined: |
| # Create a special meta-directory for outlined lambdas to easily monitor |
| # their total size and spot regressions. |
| return posixpath.join(models.OUTLINES_PREFIX_PATH, *package_name.split('.')) |
| return posixpath.join(models.APK_PREFIX_PATH, *package_name.split('.')) |
| |
| |
| # Visible for testing. |
| def CreateDexSymbol(name, size, source_map): |
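|   """Creates a models.Symbol from an apkanalyzer node name. |
| |
|   E.g. a method node name 'a.B void m()' yields a SECTION_DEX_METHOD symbol, |
|   while a class node name 'a.B' yields a SECTION_DEX symbol. Returns None for |
|   the <TOTAL> node. |
|   """ |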
| parts = name.split(' ') # (class_name, return_type, method_name) |
| new_package = parts[0] |
| |
| if new_package == _TOTAL_NODE_NAME: |
| return None |
| |
| outer_class, name = NormalizeLine(new_package, name) |
| |
| # Look for class merging. |
| old_package = new_package |
| # len(parts) == 2 for class nodes. |
| if len(parts) > 2: |
| method = parts[2] |
| # last_idx == -1 for fields, which is fine. |
| last_idx = method.find('(') |
| last_idx = method.rfind('.', 0, last_idx) |
| if last_idx != -1: |
| old_package = method[:last_idx] |
| |
| # TODO(b/333617478): Delete this work-around when R8 mapping files no |
| # longer output this pattern. |
| suspect_class_name = old_package |
| if suspect_class_name.startswith('WV.'): |
| suspect_class_name = suspect_class_name[3:] |
| if ('.' not in suspect_class_name |
| and new_package.endswith(f'.{suspect_class_name}')): |
| name = name.replace(f' {old_package}.', ' ') |
| old_package = new_package |
| else: |
| # Non-workaround case: |
| outer_class, name = NormalizeLine(old_package, name) |
| |
|   is_outlined = outer_class is None |
| object_path = _MakeDexObjectPath(old_package, is_outlined) |
| if name.endswith(')'): |
| section_name = models.SECTION_DEX_METHOD |
| else: |
| section_name = models.SECTION_DEX |
| |
| source_path = source_map.get(outer_class, '') |
| return models.Symbol(section_name, |
| size, |
| full_name=name, |
| object_path=object_path, |
| source_path=source_path) |
| |
| |
| def _SymbolsFromNodes(nodes, source_map): |
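|   """Returns (method_symbols, other_symbols) lists, each sorted by full_name.""" |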
| # Use (DEX_METHODS, DEX) buckets to speed up sorting. |
| symbol_buckets = ([], []) |
| for _, name, node_size in nodes: |
| symbol = CreateDexSymbol(name, node_size, source_map) |
| if symbol: |
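|       # Bucket 0 holds SECTION_DEX_METHOD symbols; bucket 1 holds SECTION_DEX. |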
| bucket_index = int(symbol.section_name is models.SECTION_DEX) |
| symbol_buckets[bucket_index].append(symbol) |
| for symbols_bucket in symbol_buckets: |
| symbols_bucket.sort(key=lambda s: s.full_name) |
| return symbol_buckets |
| |
| |
| def _GenDexStringsUsedByClasses(dexfile, class_deobfuscation_map): |
| """Emit strings used in code_items and associate them with classes. |
| |
| Args: |
| dexfile: A DexFile instance. |
| class_deobfuscation_map: Map from obfuscated names to class names. |
| |
| Yields: |
| string_idx: DEX string index. |
| size: Number of bytes taken by string, including pointer. |
| decoded_string: The decoded string. |
| class_names: List of class names |
| """ |
| if not dexfile or not dexfile.code_item_list: |
| return |
| |
| # Helper to deobfuscate class names while converting 'LFoo;' -> 'Foo'. |
| num_bad_name = 0 |
| num_deobfus_names = 0 |
| num_failed_deobfus = 0 |
| |
| @functools.lru_cache(None) |
| def LookupDeobfuscatedClassNames(class_def_idx): |
| nonlocal num_bad_name, num_deobfus_names, num_failed_deobfus |
| class_def_item = dexfile.class_def_item_list[class_def_idx] |
| name = dexfile.GetTypeString(class_def_item.class_idx) |
| if not (name.startswith('L') and name.endswith(';')): |
| num_bad_name += 1 |
| return name |
| # Change "L{X};" to "{X}", and convert path name to class name. |
| name = name[1:-1].replace('/', '.') |
| deobfuscated_name = class_deobfuscation_map.get(name, None) |
| if deobfuscated_name is not None: |
| name = deobfuscated_name |
| num_deobfus_names += 1 |
| else: |
| num_failed_deobfus += 1 |
| return name |
| |
|   # Precompute map from code item offsets to set of string ids used. |
| code_off_to_used_string_ids = { |
| code_item.offset: set(dexfile.IterAllStringIdsUsedByCodeItem(code_item)) |
| for code_item in dexfile.code_item_list |
| } |
| code_off_to_used_string_ids[0] = set() # Offset 0 = No code. |
| |
|   # Walk code for each class and each method, marking string usages. |
| string_idx_to_class_idxs = collections.defaultdict(set) |
| for i, class_item in enumerate(dexfile.class_def_item_list): |
| string_idxs_used_by_class = set() |
| class_data_item = dexfile.GetClassDataItemByOffset( |
| class_item.class_data_off) |
| if class_data_item: |
| for encoded_method in itertools.chain(class_data_item.direct_methods, |
| class_data_item.virtual_methods): |
| code_off = encoded_method.code_off |
| string_idxs_used_by_class |= code_off_to_used_string_ids[code_off] |
| for string_idx in string_idxs_used_by_class: |
| string_idx_to_class_idxs[string_idx].add(i) |
| |
|   # Emit each string used by code, with names of classes that use it. Both are |
|   # sorted to maintain consistency. |
| for string_idx in sorted(string_idx_to_class_idxs): |
| string_item = dexfile.string_data_item_list[string_idx] |
| size = string_item.byte_size + 4 # +4 for pointer. |
| decoded_string = string_item.data |
| class_idxs = string_idx_to_class_idxs[string_idx] |
| class_names = sorted(LookupDeobfuscatedClassNames(i) for i in class_idxs) |
| yield string_idx, size, decoded_string, class_names |
| |
| logging.info('Deobfuscated %d / %d classes (%d failures)', num_deobfus_names, |
| len(dexfile.class_def_item_list), num_failed_deobfus) |
| if num_bad_name > 0: |
|     logging.warning('Found %d class names not formatted as "L.*;".', |
|                     num_bad_name) |
| |
| |
| def _MakeFakeSourcePath(class_name): |
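|   """E.g. 'a.b.C' -> models.APK_PREFIX_PATH + '/a/b/C'.""" |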
| class_path = class_name.replace('.', '/') |
| return f'{models.APK_PREFIX_PATH}/{class_path}' |
| |
| |
| def _StringSymbolsFromDexFile(apk_path, dexfile, source_map, |
| class_deobfuscation_map): |
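|   """Creates symbols for DEX string literals: per-class aliases + aggregates.""" |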
| if not dexfile: |
| return [] |
|   logging.info('Extracting string symbols from %s', apk_path) |
| |
| # Code strings: Strings accessed via class -> method -> code -> string. |
|   # These become separate symbols, aliased among the classes that use them. |
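|   # fresh_string_idx_set tracks strings not yet attributed to any symbol. |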
| fresh_string_idx_set = set(range(len(dexfile.string_data_item_list))) |
| object_path = str(apk_path) |
| dex_string_symbols = [] |
| string_iter = _GenDexStringsUsedByClasses(dexfile, class_deobfuscation_map) |
| for string_idx, size, decoded_string, string_user_class_names in string_iter: |
| fresh_string_idx_set.remove(string_idx) |
| num_aliases = len(string_user_class_names) |
| aliases = [] |
| for class_name in string_user_class_names: |
| outer_class, class_name = _NormalizeName(class_name) |
| full_name = string_extract.GetNameOfStringLiteralBytes( |
| decoded_string.encode('utf-8', errors='surrogatepass')) |
| source_path = (source_map.get(outer_class, '') |
| or _MakeFakeSourcePath(class_name)) |
| sym = models.Symbol(models.SECTION_DEX, |
| size, |
| full_name=full_name, |
| object_path=object_path, |
| source_path=source_path, |
| aliases=aliases if num_aliases > 1 else None) |
| aliases.append(sym) |
| assert num_aliases == len(aliases) |
| dex_string_symbols += aliases |
| |
| logging.info('Counted %d class -> method -> code strings', |
| len(dexfile.string_data_item_list) - len(fresh_string_idx_set)) |
| |
| # Extract aggregate string symbols for {types, methods, fields, prototypes}. |
| # Due to significant overlap (coincidental or induced by R8), {method, field} |
|   # string symbols share a common aggregate. Other overlaps are resolved by |
| # applying the priority: |
| # code > type > {method, field} > prototype, |
| # i.e., bytes from code strings are not counted in aggregates; bytes from type |
| # string aggregate are not counted by {{method, field}, prototype}, etc. |
| |
| def _AddAggregateStringSymbol(name, string_idx_set): |
| nonlocal fresh_string_idx_set |
| old_count = len(string_idx_set) |
| string_idx_set &= fresh_string_idx_set |
| fresh_string_idx_set -= string_idx_set |
| logging.info('Counted %d %s strings among %d found', len(string_idx_set), |
| name, old_count) |
| if string_idx_set: |
| # Each string has +4 for pointer. |
| size = sum(dexfile.string_data_item_list[string_idx].byte_size |
| for string_idx in string_idx_set) + 4 * len(string_idx_set) |
| sym = models.Symbol(models.SECTION_DEX, |
| size, |
| full_name=f'** .dex ({name} strings)') |
| dex_string_symbols.append(sym) |
| |
| # Type strings. |
| type_string_idx_set = {i.descriptor_idx for i in dexfile.type_id_item_list} |
| _AddAggregateStringSymbol('type', type_string_idx_set) |
| |
| # Method and field strings. |
| method_string_idx_set = {i.name_idx for i in dexfile.method_id_item_list} |
| field_string_idx_set = {i.name_idx for i in dexfile.field_id_item_list} |
| _AddAggregateStringSymbol('method and field', |
| method_string_idx_set | field_string_idx_set) |
| |
| # Prototype strings. |
| proto_string_idx_set = {i.shorty_idx for i in dexfile.proto_id_item_list} |
| _AddAggregateStringSymbol('prototype', proto_string_idx_set) |
| |
| return dex_string_symbols |
| |
| |
| def _ParseDexfilesInApk(apk_path): |
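|   """Yields (filename, dex_parser.DexFile) for each classes*.dex in the apk.""" |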
| with zipfile.ZipFile(apk_path) as src_zip: |
| dex_infos = [ |
| info for info in src_zip.infolist() if |
| info.filename.startswith('classes') and info.filename.endswith('.dex') |
| ] |
| # Assume sound and stable ordering of DEX filenames. |
| for dex_info in dex_infos: |
| dex_data = src_zip.read(dex_info) |
| yield dex_info.filename, dex_parser.DexFile(dex_data) |
| |
| |
| def CreateDexSymbols(apk_path, apk_analyzer_async_result, dex_total_size, |
| class_deobfuscation_map, size_info_prefix, |
| track_string_literals): |
| """Creates DEX symbols given apk_analyzer output. |
| |
| Args: |
| apk_path: Path to the APK containing the DEX file. |
| apk_analyzer_async_result: Return value from RunApkAnalyzerAsync(). |
| dex_total_size: Sum of the sizes of all .dex files in the apk. |
| class_deobfuscation_map: Map from obfuscated names to class names. |
| size_info_prefix: Path such as: out/Release/size-info/BaseName. |
| track_string_literals: Create symbols for string literals. |
| |
| Returns: |
| A tuple of (section_ranges, raw_symbols, metrics_by_file), where |
| metrics_by_file is a dict from DEX file name to a dict of |
| {metric_name: value}. |
| """ |
| logging.debug('Waiting for apkanalyzer to finish') |
| apk_analyzer_result = apk_analyzer_async_result.get() |
| logging.debug('Analyzing DEX - processing results') |
| if size_info_prefix: |
| source_map = _ParseJarInfoFile(size_info_prefix + '.jar.info') |
| else: |
| source_map = dict() |
| |
| nodes = _ParseApkAnalyzerOutput(apk_analyzer_result.stdout, |
| apk_analyzer_result.stderr) |
| nodes = UndoHierarchicalSizing(nodes) |
| |
|   total_node_size = sum(x[2] for x in nodes) |
| # TODO(agrieve): Figure out why this log is triggering for |
| # ChromeModernPublic.apk (https://crbug.com/851535). |
| # Reporting: dex_total_size=6546088 total_node_size=6559549 |
| if dex_total_size < total_node_size: |
| logging.error( |
| 'Node size too large, check for node processing errors. ' |
| 'dex_total_size=%d total_node_size=%d', dex_total_size, total_node_size) |
| |
| dex_method_symbols, dex_other_symbols = _SymbolsFromNodes(nodes, source_map) |
| dex_string_symbols = [] |
| metrics_by_file = {} |
| for dex_path, dexfile in _ParseDexfilesInApk(apk_path): |
| logging.debug('Found DEX: %r', dex_path) |
| if track_string_literals: |
| dex_string_symbols += _StringSymbolsFromDexFile(apk_path, dexfile, |
| source_map, |
| class_deobfuscation_map) |
| map_item_sizes = dexfile.ComputeMapItemSizes() |
| metrics = {} |
| for item in map_item_sizes: |
| metrics[f'{models.METRICS_SIZE}/' + item['name']] = item['byte_size'] |
| metrics[f'{models.METRICS_COUNT}/' + item['name']] = item['size'] |
| metrics_by_file[dex_path] = metrics |
| |
| if dex_string_symbols: |
| logging.info('Converting excessive DEX string aliases into shared-path ' |
| 'symbols') |
| archive_util.CompactLargeAliasesIntoSharedSymbols( |
| dex_string_symbols, _DEX_STRING_MAX_SAME_NAME_ALIAS_COUNT) |
| |
| dex_method_size = round(sum(s.pss for s in dex_method_symbols)) |
| dex_other_size = round(sum(s.pss for s in dex_other_symbols)) |
| dex_other_size += round(sum(s.pss for s in dex_string_symbols)) |
| unattributed_dex = dex_total_size - dex_method_size - dex_other_size |
| # Compare against -5 instead of 0 to guard against round-off errors. |
| assert unattributed_dex >= -5, ( |
| 'sum(dex_symbols.size) > sum(filesize(dex file)). {} vs {}'.format( |
| dex_method_size + dex_other_size, dex_total_size)) |
| |
| if unattributed_dex > 0: |
| dex_other_symbols.append( |
| models.Symbol( |
| models.SECTION_DEX, |
| unattributed_dex, |
| full_name='** .dex (unattributed)')) |
| |
| dex_other_symbols.extend(dex_method_symbols) |
| dex_other_symbols.extend(dex_string_symbols) |
| |
| # We can't meaningfully track section size of dex methods vs other, so |
| # just fake the size of dex methods as the sum of symbols, and make |
| # "dex other" responsible for any unattributed bytes. |
| section_ranges = { |
| models.SECTION_DEX_METHOD: (0, dex_method_size), |
| models.SECTION_DEX: (0, dex_total_size - dex_method_size), |
| } |
| |
| return section_ranges, dex_other_symbols, metrics_by_file |