blob: 73e7569b9739602b8c2e09dd732290ea44d9ad3a [file] [log] [blame]
# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Main Python API for analyzing binary size."""
import argparse
import calendar
import collections
import datetime
import gzip
import itertools
import logging
import os
import posixpath
import re
import subprocess
import sys
import tempfile
import zipfile
import zlib
import apkanalyzer
import ar
import concurrent
import demangle
import describe
import file_format
import function_signature
import linker_map_parser
import models
import ninja_parser
import nm
import obj_analyzer
import path_util
sys.path.insert(1, os.path.join(path_util.SRC_ROOT, 'tools', 'grit'))
from grit.format import data_pack
# Name of the per-directory file that is searched for a "# COMPONENT:" line.
_OWNERS_FILENAME = 'OWNERS'
# Matches a "# COMPONENT: <name>" line and captures <name>.
_COMPONENT_REGEX = re.compile(r'\s*#\s*COMPONENT\s*:\s*(\S+)')
# Matches a "file://<path>" reference line and captures <path>.
_FILE_PATH_REGEX = re.compile(r'\s*file://(\S+)')
# Pak content whose zlib-compressed size is below this fraction of its raw size
# is reported as uncompressed by _IsPakContentUncompressed().
_UNCOMPRESSED_COMPRESSION_RATIO_THRESHOLD = 0.9
# Holds computation state that is live only when an output directory exists.
_OutputDirectoryContext = collections.namedtuple('_OutputDirectoryContext', [
    'elf_object_paths',  # Only when elf_path is also provided.
    'known_inputs',  # Only when elf_path is also provided.
    'output_directory',
    'source_mapper',
    'thin_archives',
])
# Tunable "knobs" for CreateSectionSizesAndSymbols().
class SectionSizeKnobs(object):
  """Bag of tunable parameters consumed while building section sizes/symbols.

  Attributes:
    max_same_name_alias_count: Alias-count limit above which the aliases of an
        address are compacted into a single shared symbol.
    pak_compression_ratio: Estimated compression ratio applied to translation
        pak sizes.
    apk_other_files: Dict of apk file name -> source file path.
    apk_expected_other_files: Set of apk file names that are expected to exist.
    src_root: Source root directory (defaults to path_util.SRC_ROOT).
  """

  def __init__(self):
    self.src_root = path_util.SRC_ROOT

    # A limit on the number of symbols an address can have, before these
    # symbols are compacted into shared symbols. Increasing this value causes
    # more data to be stored in .size files, but is also more expensive.
    # Effect of max_same_name_alias_count (as of Oct 2017, with min_pss = max):
    # 1: shared .text syms = 1772874 bytes, file size = 9.43MiB (645476 syms).
    # 2: shared .text syms = 1065654 bytes, file size = 9.58MiB (669952 syms).
    # 6: shared .text syms = 464058 bytes, file size = 10.11MiB (782693 syms).
    # 10: shared .text syms = 365648 bytes, file size = 10.24MiB (813758 syms).
    # 20: shared .text syms = 86202 bytes, file size = 10.38MiB (854548 syms).
    # 40: shared .text syms = 48424 bytes, file size = 10.50MiB (890396 syms).
    # 50: shared .text syms = 41860 bytes, file size = 10.54MiB (902304 syms).
    # max: shared .text syms = 0 bytes, file size = 11.10MiB (1235449 syms).
    self.max_same_name_alias_count = 40  # 50kb is basically negligible.

    # An estimate of pak translation compression ratio to make comparisons
    # between .size files reasonable. Otherwise this can differ every pak
    # change.
    self.pak_compression_ratio = 0.33

    # Files that are expected to be present in the apk (from Monochrome.apk).
    self.apk_expected_other_files = {
        'AndroidManifest.xml',
        'resources.arsc',
        'assets/AndroidManifest.xml',
        'assets/metaresources.arsc',
        'META-INF/CHROMIUM.SF',
        'META-INF/CHROMIUM.RSA',
        'META-INF/MANIFEST.MF',
    }

    # File name: Source file.
    other_files = {}
    other_files['assets/icudtl.dat'] = (
        '../../third_party/icu/android/icudtl.dat')
    other_files['assets/snapshot_blob_32.bin'] = '../../v8/snapshot_blob_32.bin'
    other_files['assets/snapshot_blob_64.bin'] = '../../v8/snapshot_blob_64.bin'
    other_files['assets/natives_blob.bin'] = '../../v8/natives_blob.bin'
    other_files['assets/unwind_cfi_32'] = (
        '../../base/trace_event/cfi_backtrace_android.cc')
    other_files['assets/webapk_dex_version.txt'] = (
        '../../chrome/android/webapk/libs/runtime_library_version.gni')
    other_files['lib/armeabi-v7a/libarcore_sdk_c_minimal.so'] = (
        '../../third_party/arcore-android-sdk')
    self.apk_other_files = other_files
def _OpenMaybeGz(path):
"""Calls `gzip.open()` if |path| ends in ".gz", otherwise calls `open()`."""
if path.endswith('.gz'):
return gzip.open(path, 'rb')
return open(path, 'rb')
def _StripLinkerAddedSymbolPrefixes(raw_symbols):
"""Removes prefixes sometimes added to symbol names during link
Removing prefixes make symbol names match up with those found in .o files.
"""
for symbol in raw_symbols:
full_name = symbol.full_name
if full_name.startswith('startup.'):
symbol.flags |= models.FLAG_STARTUP
symbol.full_name = full_name[8:]
elif full_name.startswith('unlikely.'):
symbol.flags |= models.FLAG_UNLIKELY
symbol.full_name = full_name[9:]
elif full_name.startswith('rel.local.'):
symbol.flags |= models.FLAG_REL_LOCAL
symbol.full_name = full_name[10:]
elif full_name.startswith('rel.'):
symbol.flags |= models.FLAG_REL
symbol.full_name = full_name[4:]
elif full_name.startswith('hot.'):
symbol.flags |= models.FLAG_HOT
symbol.full_name = full_name[4:]
elif full_name.startswith('.L.str'):
symbol.full_name = models.STRING_LITERAL_NAME
def _NormalizeNames(raw_symbols):
  """Ensures that all names are formatted in a useful way.

  This includes:
    - Deriving |name| and |template_name| from |full_name|.
    - Stripping of return types (for functions).
    - Moving "vtable for" and the like to be suffixes rather than prefixes.

  Args:
    raw_symbols: Iterable of symbols. Their |full_name|, |template_name|,
        |name| and |flags| fields are updated in place.
  """
  # Collected only for the debug log statement at the end.
  found_prefixes = set()
  for symbol in raw_symbols:
    full_name = symbol.full_name
    # See comment in _CalculatePadding() about when this can happen. Don't
    # process names for non-native sections.
    if symbol.IsPak():
      # full_name: "about_ui_resources.grdp: IDR_ABOUT_UI_CREDITS_HTML".
      space_idx = full_name.rindex(' ')
      name = full_name[space_idx + 1:]
      symbol.template_name = name
      symbol.name = name
    elif (full_name.startswith('*') or
          symbol.IsOverhead() or
          symbol.IsOther()):
      # These names are used verbatim.
      symbol.template_name = full_name
      symbol.name = full_name
    elif symbol.IsDex():
      symbol.full_name, symbol.template_name, symbol.name = (
          function_signature.ParseJava(full_name))
    elif symbol.IsNative():
      # Remove [clone] suffix, and set flag accordingly.
      # Search from left-to-right, as multiple [clone]s can exist.
      # Example name suffixes:
      #     [clone .part.322]  # GCC
      #     [clone .isra.322]  # GCC
      #     [clone .constprop.1064]  # GCC
      #     [clone .11064]  # clang
      # http://unix.stackexchange.com/questions/223013/function-symbol-gets-part-suffix-after-compilation
      idx = full_name.find(' [clone ')
      if idx != -1:
        full_name = full_name[:idx]
        symbol.flags |= models.FLAG_CLONE
      # Clones for C symbols.
      if symbol.section == 't':
        idx = full_name.rfind('.')
        if idx != -1 and full_name[idx + 1:].isdigit():
          new_name = full_name[:idx]
          # Generated symbols that end with .123 but are not clones.
          # Find these via:
          # size_info.symbols.WhereInSection('t').WhereIsGroup().SortedByCount()
          if new_name not in ('__tcf_0', 'startup'):
            full_name = new_name
            symbol.flags |= models.FLAG_CLONE
            # Remove .part / .isra / .constprop.
            idx = full_name.rfind('.', 0, idx)
            if idx != -1:
              full_name = full_name[:idx]
      # E.g.: vtable for FOO
      # Only the first 30 characters are searched for the " for " marker.
      idx = full_name.find(' for ', 0, 30)
      if idx != -1:
        found_prefixes.add(full_name[:idx + 4])
        full_name = '{} [{}]'.format(full_name[idx + 5:], full_name[:idx])
      # E.g.: virtual thunk to FOO
      idx = full_name.find(' to ', 0, 30)
      if idx != -1:
        found_prefixes.add(full_name[:idx + 3])
        full_name = '{} [{}]'.format(full_name[idx + 4:], full_name[:idx])
      # Strip out return type, and split out name, template_name.
      # Function parsing also applies to non-text symbols.
      # E.g. Function statics.
      symbol.full_name, symbol.template_name, symbol.name = (
          function_signature.Parse(full_name))
      # Remove anonymous namespaces (they just harm clustering).
      symbol.template_name = symbol.template_name.replace(
          '(anonymous namespace)::', '')
      symbol.full_name = symbol.full_name.replace(
          '(anonymous namespace)::', '')
      non_anonymous_name = symbol.name.replace('(anonymous namespace)::', '')
      if symbol.name != non_anonymous_name:
        symbol.flags |= models.FLAG_ANONYMOUS
        symbol.name = non_anonymous_name
    # Allow using "is" to compare names (and should help with RAM). This applies
    # to all symbols.
    function_signature.InternSameNames(symbol)
  logging.debug('Found name prefixes of: %r', found_prefixes)
def _NormalizeObjectPath(path):
"""Normalizes object paths.
Prefixes are removed: obj/, ../../
Archive names made more pathy: foo/bar.a(baz.o) -> foo/bar.a/baz.o
"""
if path.startswith('obj/'):
# Convert obj/third_party/... -> third_party/...
path = path[4:]
elif path.startswith('../../'):
# Convert ../../third_party/... -> third_party/...
path = path[6:]
if path.endswith(')'):
# Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o so that hierarchical
# breakdowns consider the .o part to be a separate node.
start_idx = path.rindex('(')
path = os.path.join(path[:start_idx], path[start_idx + 1:-1])
return path
def _NormalizeSourcePath(path):
"""Returns (is_generated, normalized_path)"""
if path.startswith('gen/'):
# Convert gen/third_party/... -> third_party/...
return True, path[4:]
if path.startswith('../../'):
# Convert ../../third_party/... -> third_party/...
return False, path[6:]
return True, path
def _ExtractSourcePathsAndNormalizeObjectPaths(raw_symbols, source_mapper):
"""Fills in the |source_path| attribute and normalizes |object_path|."""
if source_mapper:
logging.info('Looking up source paths from ninja files')
for symbol in raw_symbols:
object_path = symbol.object_path
if symbol.IsDex() or symbol.IsOther():
if symbol.source_path:
symbol.generated_source, symbol.source_path = _NormalizeSourcePath(
symbol.source_path)
elif object_path:
# We don't have source info for prebuilt .a files.
if not os.path.isabs(object_path) and not object_path.startswith('..'):
source_path = source_mapper.FindSourceForPath(object_path)
if source_path:
symbol.generated_source, symbol.source_path = (
_NormalizeSourcePath(source_path))
symbol.object_path = _NormalizeObjectPath(object_path)
assert source_mapper.unmatched_paths_count == 0, (
'One or more source file paths could not be found. Likely caused by '
'.ninja files being generated at a different time than the .map file.')
else:
logging.info('Normalizing object paths')
for symbol in raw_symbols:
if symbol.object_path:
symbol.object_path = _NormalizeObjectPath(symbol.object_path)
def _ComputeAncestorPath(path_list, symbol_count):
"""Returns the common ancestor of the given paths."""
if not path_list:
return ''
prefix = os.path.commonprefix(path_list)
# Check if all paths were the same.
if prefix == path_list[0]:
return prefix
# Put in buckets to cut down on the number of unique paths.
if symbol_count >= 100:
symbol_count_str = '100+'
elif symbol_count >= 50:
symbol_count_str = '50-99'
elif symbol_count >= 20:
symbol_count_str = '20-49'
elif symbol_count >= 10:
symbol_count_str = '10-19'
else:
symbol_count_str = str(symbol_count)
# Put the path count as a subdirectory so that grouping by path will show
# "{shared}" as a bucket, and the symbol counts as leafs.
if not prefix:
return os.path.join('{shared}', symbol_count_str)
return os.path.join(os.path.dirname(prefix), '{shared}', symbol_count_str)
def _CompactLargeAliasesIntoSharedSymbols(raw_symbols, knobs):
"""Converts symbols with large number of aliases into single symbols.
The merged symbol's path fields are changed to common-ancestor paths in
the form: common/dir/{shared}/$SYMBOL_COUNT
Assumes aliases differ only by path (not by name).
"""
num_raw_symbols = len(raw_symbols)
num_shared_symbols = 0
src_cursor = 0
dst_cursor = 0
while src_cursor < num_raw_symbols:
symbol = raw_symbols[src_cursor]
raw_symbols[dst_cursor] = symbol
dst_cursor += 1
aliases = symbol.aliases
if aliases and len(aliases) > knobs.max_same_name_alias_count:
symbol.source_path = _ComputeAncestorPath(
[s.source_path for s in aliases if s.source_path], len(aliases))
symbol.object_path = _ComputeAncestorPath(
[s.object_path for s in aliases if s.object_path], len(aliases))
symbol.generated_source = all(s.generated_source for s in aliases)
symbol.aliases = None
num_shared_symbols += 1
src_cursor += len(aliases)
else:
src_cursor += 1
raw_symbols[dst_cursor:] = []
num_removed = src_cursor - dst_cursor
logging.debug('Converted %d aliases into %d shared-path symbols',
num_removed, num_shared_symbols)
def _ConnectNmAliases(raw_symbols):
"""Ensures |aliases| is set correctly for all symbols."""
prev_sym = raw_symbols[0]
for sym in raw_symbols[1:]:
# Don't merge bss symbols.
if sym.address > 0 and prev_sym.address == sym.address:
# Don't merge padding-only symbols (** symbol gaps).
if prev_sym.size > 0:
# Don't merge if already merged.
if prev_sym.aliases is None or prev_sym.aliases is not sym.aliases:
if prev_sym.aliases:
prev_sym.aliases.append(sym)
else:
prev_sym.aliases = [prev_sym, sym]
sym.aliases = prev_sym.aliases
prev_sym = sym
def _AssignNmAliasPathsAndCreatePathAliases(raw_symbols, object_paths_by_name):
  """Assigns object paths found via nm and adds one alias per extra path.

  Args:
    raw_symbols: List of symbols. |object_path| and |aliases| are updated in
        place.
    object_paths_by_name: Dict of full_name -> list of object paths. A list is
        appended to and re-sorted in place when the symbol's existing
        |object_path| is missing from it.

  Returns:
    A new list containing every input symbol, each immediately followed by the
    alias symbols created for it.
  """
  num_found_paths = 0
  num_unknown_names = 0
  num_path_mismatches = 0
  num_aliases_created = 0
  ret = []
  for symbol in raw_symbols:
    ret.append(symbol)
    full_name = symbol.full_name
    # Don't skip if symbol.IsBss(). This is needed for LLD-LTO to work, since
    # .bss object_path data are unavailable for linker_map_parser, and need to
    # be extracted here. For regular LLD flow, incorrect aliased symbols can
    # arise. But that's a lesser evil compared to having LLD-LTO .bss missing
    # object_path and source_path.
    # TODO(huangs): Fix aliased symbols for the LLD case.
    if (symbol.IsStringLiteral() or
        not full_name or
        full_name[0] in '*.' or  # e.g. ** merge symbols, .Lswitch.table
        full_name == 'startup'):
      continue
    object_paths = object_paths_by_name.get(full_name)
    if object_paths:
      num_found_paths += 1
    else:
      # Only the first 10 occurrences are logged to limit log spam.
      if num_unknown_names < 10:
        logging.warning('Symbol not found in any .o files: %r', symbol)
      num_unknown_names += 1
      continue
    if symbol.object_path and symbol.object_path not in object_paths:
      if num_path_mismatches < 10:
        logging.warning('Symbol path reported by .map not found by nm.')
        logging.warning('sym=%r', symbol)
        logging.warning('paths=%r', object_paths)
      # Keep the path reported by the .map file as one of the candidates.
      object_paths.append(symbol.object_path)
      object_paths.sort()
      num_path_mismatches += 1
    symbol.object_path = object_paths[0]
    if len(object_paths) > 1:
      # Create one symbol for each object_path.
      aliases = symbol.aliases or [symbol]
      symbol.aliases = aliases
      num_aliases_created += len(object_paths) - 1
      for object_path in object_paths[1:]:
        new_sym = models.Symbol(
            symbol.section_name, symbol.size, address=symbol.address,
            full_name=full_name, object_path=object_path, aliases=aliases)
        aliases.append(new_sym)
        ret.append(new_sym)
  logging.debug('Cross-referenced %d symbols with nm output. '
                'num_unknown_names=%d num_path_mismatches=%d '
                'num_aliases_created=%d',
                num_found_paths, num_unknown_names, num_path_mismatches,
                num_aliases_created)
  return ret
def _DiscoverMissedObjectPaths(raw_symbols, known_inputs):
# Missing object paths are caused by .a files added by -l flags, which are not
# listed as explicit inputs within .ninja rules.
missed_inputs = set()
for symbol in raw_symbols:
path = symbol.object_path
if path.endswith(')'):
# Convert foo/bar.a(baz.o) -> foo/bar.a
path = path[:path.rindex('(')]
if path and path not in known_inputs:
missed_inputs.add(path)
return missed_inputs
def _CreateMergeStringsReplacements(merge_string_syms,
                                    list_of_positions_by_object_path):
  """Creates replacement symbols for |merge_string_syms|.

  Args:
    merge_string_syms: List of merge-string symbols to replace.
    list_of_positions_by_object_path: List with one entry per merge-string
        symbol (same order). Each entry is a dict of
        object_path -> iterable of (offset, size) pairs, where offsets are
        relative to the address of the corresponding merge-string symbol.

  Returns:
    List with one entry per merge-string symbol, each being the sorted and
    deduped list of string literal symbols that replaces it.
  """
  ret = []
  STRING_LITERAL_NAME = models.STRING_LITERAL_NAME
  assert len(merge_string_syms) == len(list_of_positions_by_object_path)
  tups = itertools.izip(merge_string_syms, list_of_positions_by_object_path)
  for merge_sym, positions_by_object_path in tups:
    merge_sym_address = merge_sym.address
    new_symbols = []
    ret.append(new_symbols)
    for object_path, positions in positions_by_object_path.iteritems():
      for offset, size in positions:
        address = merge_sym_address + offset
        symbol = models.Symbol(
            models.SECTION_RODATA, size, address, STRING_LITERAL_NAME,
            object_path=object_path)
        new_symbols.append(symbol)
  logging.debug('Created %d string literal symbols', sum(len(x) for x in ret))
  logging.debug('Sorting string literals')
  for symbols in ret:
    # In order to achieve a total ordering in the presence of aliases, need to
    # include both |address| and |object_path|.
    # In order to achieve consistent deduping, need to include |size|.
    symbols.sort(key=lambda x: (x.address, -x.size, x.object_path))
  logging.debug('Deduping string literals')
  num_removed = 0
  size_removed = 0
  num_aliases = 0
  for i, symbols in enumerate(ret):
    if not symbols:
      continue
    prev_symbol = symbols[0]
    new_symbols = [prev_symbol]
    for symbol in symbols[1:]:
      # Negative padding means |symbol| starts before |prev_symbol| ends.
      padding = symbol.address - prev_symbol.end_address
      if (prev_symbol.address == symbol.address and
          prev_symbol.size == symbol.size):
        # String is an alias.
        num_aliases += 1
        aliases = prev_symbol.aliases
        if aliases:
          aliases.append(symbol)
          symbol.aliases = aliases
        else:
          aliases = [prev_symbol, symbol]
          prev_symbol.aliases = aliases
          symbol.aliases = aliases
      elif padding + symbol.size <= 0:
        # String is a substring of prior one.
        num_removed += 1
        size_removed += symbol.size
        continue
      elif padding < 0:
        # String overlaps previous one. Adjust to not overlap.
        symbol.address -= padding
        symbol.size += padding
      new_symbols.append(symbol)
      prev_symbol = symbol
    ret[i] = new_symbols
    # Aliases come out in random order, so sort to be deterministic.
    ret[i].sort(key=lambda s: (s.address, s.object_path))
  logging.debug(
      'Removed %d overlapping string literals (%d bytes) & created %d aliases',
      num_removed, size_removed, num_aliases)
  return ret
def _CalculatePadding(raw_symbols):
  """Populates the |padding| field based on symbol addresses.

  Symbols must already be sorted by |address|.

  Args:
    raw_symbols: List of symbols sorted by section, then address. |padding|
        and |size| are updated in place (|size| grows by the padding).
  """
  seen_sections = set()
  # |i| is the index of the symbol that precedes |symbol| in |raw_symbols|.
  for i, symbol in enumerate(raw_symbols[1:]):
    prev_symbol = raw_symbols[i]
    if symbol.IsOverhead():
      # Overhead symbols are not actionable so should be padding-only.
      symbol.padding = symbol.size
    if prev_symbol.section_name != symbol.section_name:
      assert symbol.section_name not in seen_sections, (
          'Input symbols must be sorted by section, then address.')
      seen_sections.add(symbol.section_name)
      # No padding is computed across a section boundary.
      continue
    if (symbol.address <= 0 or prev_symbol.address <= 0 or
        not symbol.IsNative() or not prev_symbol.IsNative()):
      continue
    if symbol.address == prev_symbol.address:
      if symbol.aliases and symbol.aliases is prev_symbol.aliases:
        # Aliases share the padding and size of the symbol before them.
        symbol.padding = prev_symbol.padding
        symbol.size = prev_symbol.size
        continue
      # Padding-only symbols happen for ** symbol gaps.
      assert prev_symbol.size_without_padding == 0, (
          'Found duplicate symbols:\n%r\n%r' % (prev_symbol, symbol))
    padding = symbol.address - prev_symbol.end_address
    # These thresholds were found by experimenting with arm32 Chrome.
    # E.g.: Set them to 0 and see what warnings get logged, then take max value.
    # TODO(agrieve): See if these thresholds make sense for architectures
    #     other than arm32.
    if (not symbol.full_name.startswith('*') and
        not symbol.IsStringLiteral() and (
            symbol.section in 'rd' and padding >= 256 or
            symbol.section in 't' and padding >= 64)):
      # Should not happen.
      logging.warning('Large padding of %d between:\n A) %r\n B) %r' % (
          padding, prev_symbol, symbol))
    symbol.padding = padding
    symbol.size += padding
    assert symbol.size >= 0, (
        'Symbol has negative size (likely not sorted propertly): '
        '%r\nprev symbol: %r' % (symbol, prev_symbol))
def _ParseComponentFromOwners(filename):
  """Searches an OWNERS file for lines that start with `# COMPONENT:`.

  If an OWNERS file has no COMPONENT but references exactly one other OWNERS
  file, follow the reference and check that file instead. References that lead
  back to an already-visited file end the search, so that OWNERS files that
  reference each other cannot cause unbounded recursion.

  Args:
    filename: Path to the file to parse.

  Returns:
    The text that follows the `# COMPONENT:` prefix, such as 'component>name'.
    Empty string if no component found or the file didn't exist.
  """
  visited = set()
  while filename not in visited:
    visited.add(filename)
    reference_paths = []
    try:
      with open(filename) as f:
        for line in f:
          component_matches = _COMPONENT_REGEX.match(line)
          if component_matches:
            return component_matches.group(1)
          path_matches = _FILE_PATH_REGEX.match(line)
          if path_matches:
            reference_paths.append(path_matches.group(1))
    except IOError:
      return ''
    # Only an unambiguous (single) reference is followed.
    if len(reference_paths) != 1:
      return ''
    # References are relative to the source root.
    filename = os.path.join(path_util.SRC_ROOT, reference_paths[0])
  # A reference cycle was detected without finding any COMPONENT.
  return ''
def _FindComponentRoot(start_path, cache, knobs):
"""Searches all parent directories for COMPONENT in OWNERS files.
Args:
start_path: Path of directory to start searching from. Must be relative to
SRC_ROOT.
cache: Dict of OWNERS paths. Used instead of filesystem if paths are present
in the dict.
knobs: Instance of SectionSizeKnobs with tunable knobs and options.
Returns:
COMPONENT belonging to |start_path|, or empty string if not found.
"""
prev_dir = None
test_dir = start_path
# This loop will traverse the directory structure upwards until reaching
# SRC_ROOT, where test_dir and prev_dir will both equal an empty string.
while test_dir != prev_dir:
cached_component = cache.get(test_dir)
if cached_component:
return cached_component
elif cached_component is None:
owners_path = os.path.join(knobs.src_root, test_dir, _OWNERS_FILENAME)
component = _ParseComponentFromOwners(owners_path)
cache[test_dir] = component
if component:
return component
prev_dir = test_dir
test_dir = os.path.dirname(test_dir)
return ''
def _PopulateComponents(raw_symbols, knobs):
"""Populates the |component| field based on |source_path|.
Symbols without a |source_path| are skipped.
Args:
raw_symbols: list of Symbol objects.
knobs: Instance of SectionSizeKnobs. Tunable knobs and options.
"""
seen_paths = {}
for symbol in raw_symbols:
if symbol.source_path:
folder_path = os.path.dirname(symbol.source_path)
symbol.component = _FindComponentRoot(folder_path, seen_paths, knobs)
def _UpdateSymbolNamesFromNm(raw_symbols, names_by_address):
"""Updates raw_symbols names with extra information from nm."""
logging.debug('Update symbol names')
# linker_map_parser extracts '** outlined function' without knowing how many
# such symbols exist at each address. nm has this information, and stores the
# value as, e.g., '** outlined function * 5'. Copy the information over.
for s in raw_symbols:
if s.full_name.startswith('** outlined function'):
name_list = names_by_address.get(s.address)
if name_list:
for name in name_list:
if name.startswith('** outlined function'):
s.full_name = name
break
def _AddNmAliases(raw_symbols, names_by_address):
"""Adds symbols that were removed by identical code folding."""
# Step 1: Create list of (index_of_symbol, name_list).
logging.debug('Creating alias list')
replacements = []
num_new_symbols = 0
missing_names = collections.defaultdict(list)
for i, s in enumerate(raw_symbols):
# Don't alias padding-only symbols (e.g. ** symbol gap)
if s.size_without_padding == 0:
continue
name_list = names_by_address.get(s.address)
if name_list:
if s.full_name not in name_list:
missing_names[s.full_name].append(s.address)
logging.warning('Name missing from aliases: %s %s', s.full_name,
name_list)
continue
replacements.append((i, name_list))
num_new_symbols += len(name_list) - 1
if missing_names and logging.getLogger().isEnabledFor(logging.INFO):
for address, names in names_by_address.iteritems():
for name in names:
if name in missing_names:
logging.info('Missing name %s is at address %x instead of [%s]' %
(name, address, ','.join('%x' % a for a in missing_names[name])))
if float(num_new_symbols) / len(raw_symbols) < .05:
logging.warning('Number of aliases is oddly low (%.0f%%). It should '
'usually be around 25%%. Ensure --tool-prefix is correct. ',
float(num_new_symbols) / len(raw_symbols) * 100)
# Step 2: Create new symbols as siblings to each existing one.
logging.debug('Creating %d new symbols from nm output', num_new_symbols)
expected_num_symbols = len(raw_symbols) + num_new_symbols
ret = []
prev_src = 0
for cur_src, name_list in replacements:
ret += raw_symbols[prev_src:cur_src]
prev_src = cur_src + 1
sym = raw_symbols[cur_src]
# Create symbols (|sym| gets recreated and discarded).
new_syms = []
for full_name in name_list:
# Do not set |aliases| in order to avoid being pruned by
# _CompactLargeAliasesIntoSharedSymbols(), which assumes aliases differ
# only by path. The field will be set afterwards by _ConnectNmAliases().
new_syms.append(models.Symbol(
sym.section_name, sym.size, address=sym.address, full_name=full_name))
ret += new_syms
ret += raw_symbols[prev_src:]
assert expected_num_symbols == len(ret)
return ret
def LoadAndPostProcessSizeInfo(path, file_obj=None):
  """Loads a SizeInfo and post-processes its raw symbols.

  Args:
    path: Path of the file to load.
    file_obj: Optional file object, forwarded to file_format.LoadSizeInfo().

  Returns:
    The loaded SizeInfo, with symbol names normalized and padding calculated.
  """
  logging.debug('Loading results from: %s', path)
  size_info = file_format.LoadSizeInfo(path, file_obj=file_obj)
  raw_symbols = size_info.raw_symbols

  logging.info('Normalizing symbol names')
  _NormalizeNames(raw_symbols)

  logging.info('Calculating padding')
  _CalculatePadding(raw_symbols)

  logging.info('Loaded %d symbols', len(raw_symbols))
  return size_info
def CreateMetadata(map_path, elf_path, apk_path, tool_prefix, output_directory,
                   linker_name):
  """Creates metadata dict.

  Args:
    map_path: Path to the linker .map(.gz) file to parse.
    elf_path: Path to the corresponding unstripped ELF file. Used to find symbol
        aliases and inlined functions. Can be None.
    apk_path: Path to the .apk file to measure.
    tool_prefix: Prefix for c++filt & nm.
    output_directory: Build output directory.
    linker_name: A coded linker name (see linker_map_parser.py).

  Returns:
    None if |elf_path| is not supplied. Otherwise returns dict mapping string
    constants to values.
    If |elf_path| is supplied, git revision and elf info are included.
    If |output_directory| is also supplied, then filenames will be included.
  """
  if not elf_path:
    return None

  logging.debug('Constructing metadata')
  revision = _DetectGitRevision(os.path.dirname(elf_path))
  arch = _ArchFromElf(elf_path, tool_prefix)
  elf_build_id = BuildIdFromElf(elf_path, tool_prefix)
  # Round-trip the mtime through a UTC datetime to get whole seconds.
  elf_mtime = datetime.datetime.utcfromtimestamp(os.path.getmtime(elf_path))
  elf_mtime_secs = calendar.timegm(elf_mtime.timetuple())
  tool_prefix_from_src = path_util.ToSrcRootRelative(tool_prefix)

  metadata = {
      models.METADATA_GIT_REVISION: revision,
      models.METADATA_ELF_ARCHITECTURE: arch,
      models.METADATA_ELF_MTIME: elf_mtime_secs,
      models.METADATA_ELF_BUILD_ID: elf_build_id,
      models.METADATA_LINKER_NAME: linker_name,
      models.METADATA_TOOL_PREFIX: tool_prefix_from_src,
  }
  if not output_directory:
    return metadata

  def relative_to_out(path):
    return os.path.relpath(path, output_directory)

  parsed_gn_args = _ParseGnArgs(os.path.join(output_directory, 'args.gn'))
  metadata[models.METADATA_MAP_FILENAME] = relative_to_out(map_path)
  metadata[models.METADATA_ELF_FILENAME] = relative_to_out(elf_path)
  metadata[models.METADATA_GN_ARGS] = parsed_gn_args
  if apk_path:
    metadata[models.METADATA_APK_FILENAME] = relative_to_out(apk_path)
    metadata[models.METADATA_APK_SIZE] = os.path.getsize(apk_path)
  return metadata
def _ResolveThinArchivePaths(raw_symbols, thin_archives):
"""Converts object_paths for thin archives to external .o paths."""
for symbol in raw_symbols:
object_path = symbol.object_path
if object_path.endswith(')'):
start_idx = object_path.rindex('(')
archive_path = object_path[:start_idx]
if archive_path in thin_archives:
subpath = object_path[start_idx + 1:-1]
symbol.object_path = ar.CreateThinObjectPath(archive_path, subpath)
def _ParseElfInfo(map_path, elf_path, tool_prefix, track_string_literals,
                  outdir_context=None, linker_name=None):
  """Adds ELF section sizes and symbols.

  Args:
    map_path: Path to the linker .map(.gz) file to parse.
    elf_path: Path to the ELF file. When falsy, the nm-based steps (aliases,
        section size validation, object path lookup) are skipped.
    tool_prefix: Tool prefix that is passed on to the nm / demangle helpers.
    track_string_literals: Whether to replace merge-string symbols by
        individual string literal symbols.
        NOTE(review): this appears to require both |elf_path| and
        |outdir_context|, since |merge_string_syms| is only assigned in that
        branch - confirm with callers.
    outdir_context: Optional _OutputDirectoryContext.
    linker_name: Linker name that is passed to the map file parser.

  Returns:
    Tuple of (section_sizes, raw_symbols, object_paths_by_name).
    |object_paths_by_name| is an empty dict unless both |elf_path| and
    |outdir_context| are provided.
  """
  if elf_path:
    # Run nm on the elf file to retrieve the list of symbol names per-address.
    # This list is required because the .map file contains only a single name
    # for each address, yet multiple symbols are often coalesced when they are
    # identical. This coalescing happens mainly for small symbols and for C++
    # templates. Such symbols make up ~500kb of libchrome.so on Android.
    elf_nm_result = nm.CollectAliasesByAddressAsync(elf_path, tool_prefix)
    # Run nm on all .o/.a files to retrieve the symbol names within them.
    # The list is used to detect when multiple .o files contain the same symbol
    # (e.g. inline functions), and to update the object_path / source_path
    # fields accordingly.
    # Looking in object files is required because the .map file chooses a
    # single path for these symbols.
    # Rather than record all paths for each symbol, set the paths to be the
    # common ancestor of all paths.
    if outdir_context:
      bulk_analyzer = obj_analyzer.BulkObjectFileAnalyzer(
          tool_prefix, outdir_context.output_directory,
          track_string_literals=track_string_literals)
      bulk_analyzer.AnalyzePaths(outdir_context.elf_object_paths)
  logging.info('Parsing Linker Map')
  with _OpenMaybeGz(map_path) as map_file:
    section_sizes, raw_symbols = (
        linker_map_parser.MapFileParser().Parse(linker_name, map_file))
  if outdir_context and outdir_context.thin_archives:
    _ResolveThinArchivePaths(raw_symbols, outdir_context.thin_archives)
  if elf_path:
    logging.debug('Validating section sizes')
    elf_section_sizes = _SectionSizesFromElf(elf_path, tool_prefix)
    for k, v in elf_section_sizes.iteritems():
      if v != section_sizes.get(k):
        # A mismatch is treated as fatal: the process exits with status 1.
        logging.error('ELF file and .map file do not agree on section sizes.')
        logging.error('.map file: %r', section_sizes)
        logging.error('readelf: %r', elf_section_sizes)
        sys.exit(1)
  if elf_path and outdir_context:
    missed_object_paths = _DiscoverMissedObjectPaths(
        raw_symbols, outdir_context.known_inputs)
    missed_object_paths = ar.ExpandThinArchives(
        missed_object_paths, outdir_context.output_directory)[0]
    bulk_analyzer.AnalyzePaths(missed_object_paths)
    bulk_analyzer.SortPaths()
    if track_string_literals:
      merge_string_syms = [s for s in raw_symbols if
                           s.full_name == '** merge strings' or
                           s.full_name == '** lld merge strings']
      # More likely for there to be a bug in supersize than an ELF to not have a
      # single string literal.
      assert merge_string_syms
      string_ranges = [(s.address, s.size) for s in merge_string_syms]
      bulk_analyzer.AnalyzeStringLiterals(elf_path, string_ranges)
  logging.info('Stripping linker prefixes from symbol names')
  _StripLinkerAddedSymbolPrefixes(raw_symbols)
  # Map file for some reason doesn't demangle all names.
  # Demangle prints its own log statement.
  demangle.DemangleRemainingSymbols(raw_symbols, tool_prefix)
  object_paths_by_name = {}
  if elf_path:
    logging.info(
        'Adding symbols removed by identical code folding (as reported by nm)')
    # This normally does not block (it's finished by this time).
    names_by_address = elf_nm_result.get()
    _UpdateSymbolNamesFromNm(raw_symbols, names_by_address)
    raw_symbols = _AddNmAliases(raw_symbols, names_by_address)
    if outdir_context:
      object_paths_by_name = bulk_analyzer.GetSymbolNames()
      logging.debug(
          'Fetched path information for %d symbols from %d files',
          len(object_paths_by_name),
          len(outdir_context.elf_object_paths) + len(missed_object_paths))
      # For aliases, this provides path information where there wasn't any.
      logging.info('Creating aliases for symbols shared by multiple paths')
      raw_symbols = _AssignNmAliasPathsAndCreatePathAliases(
          raw_symbols, object_paths_by_name)
      if track_string_literals:
        logging.info('Waiting for string literal extraction to complete.')
        list_of_positions_by_object_path = bulk_analyzer.GetStringPositions()
      bulk_analyzer.Close()
  if track_string_literals:
    logging.info('Deconstructing ** merge strings into literals')
    replacements = _CreateMergeStringsReplacements(merge_string_syms,
        list_of_positions_by_object_path)
    for merge_sym, literal_syms in itertools.izip(
        merge_string_syms, replacements):
      # Don't replace if no literals were found.
      if literal_syms:
        # Re-find the symbols since aliases cause their indices to change.
        idx = raw_symbols.index(merge_sym)
        # This assignment is a bit slow (causes array to be shifted), but
        # is fast enough since len(merge_string_syms) < 10.
        raw_symbols[idx:idx + 1] = literal_syms
  return section_sizes, raw_symbols, object_paths_by_name
def _ComputePakFileSymbols(
    file_name, contents, res_info, symbols_by_id, compression_ratio=1):
  """Adds the sizes of the resources of one pak file to |symbols_by_id|.

  Args:
    file_name: Name of the pak file. Its basename length decides which pak
        section the symbols belong to.
    contents: Object whose |resources| attribute is a dict of
        resource_id -> resource data.
    res_info: Dict of resource_id -> (name, source_path).
    symbols_by_id: Dict of resource_id -> Symbol, updated in place. An
        overhead symbol for the file is stored under hash(file_name).
    compression_ratio: Factor that all sizes are multiplied by.
  """
  # Maps the identity of each resource data object to the lowest resource id
  # that uses it (iteration is in descending id order, so lower ids win).
  id_map = {id(v): k
            for k, v in sorted(contents.resources.items(), reverse=True)}
  # Maps every other resource id that shares a data object to that lowest id.
  alias_map = {k: id_map[id(v)] for k, v in contents.resources.iteritems()
               if id_map[id(v)] != k}
  # Longest locale pak is es-419.pak
  if len(os.path.basename(file_name)) <= 9:
    section_name = models.SECTION_PAK_TRANSLATIONS
  else:
    section_name = models.SECTION_PAK_NONTRANSLATED
  overhead = (12 + 6) * compression_ratio  # Header size plus extra offset
  symbols_by_id[hash(file_name)] = models.Symbol(
      section_name, overhead, full_name='Overhead: {}'.format(file_name))
  for resource_id in sorted(contents.resources):
    if resource_id in alias_map:
      # 4 extra bytes of metadata (2 16-bit ints)
      size = 4
      resource_id = alias_map[resource_id]
    else:
      resource_data = contents.resources[resource_id]
      # 6 extra bytes of metadata (1 32-bit int, 1 16-bit int)
      size = len(resource_data) + 6
    name, source_path = res_info[resource_id]
    if resource_id not in symbols_by_id:
      full_name = '{}: {}'.format(source_path, name)
      # Created with size 0; sizes are accumulated below.
      new_symbol = models.Symbol(
          section_name, 0, address=resource_id, full_name=full_name)
      if (section_name == models.SECTION_PAK_NONTRANSLATED and
          _IsPakContentUncompressed(resource_data)):
        new_symbol.flags |= models.FLAG_UNCOMPRESSED
      symbols_by_id[resource_id] = new_symbol
    size *= compression_ratio
    symbols_by_id[resource_id].size += size
def _IsPakContentUncompressed(content):
  """Returns whether |content| appears to be stored without compression.

  The content is deflated with zlib level 1. It is reported as uncompressed
  when the deflated size divided by the raw size is below
  _UNCOMPRESSED_COMPRESSION_RATIO_THRESHOLD.
  """
  raw_size = len(content)
  # Assume anything less than 100 bytes cannot be compressed.
  if raw_size >= 100:
    deflated_size = len(zlib.compress(content, 1))
    return (deflated_size / float(raw_size) <
            _UNCOMPRESSED_COMPRESSION_RATIO_THRESHOLD)
  return False
class _ResourceSourceMapper(object):
  """Maps paths of files inside an .apk to the source files they came from.

  The mapping is loaded from <output_directory>/size-info/<apk name>.res.info
  and then extended with |knobs.apk_other_files|.
  """

  def __init__(self, apk_path, output_directory, knobs):
    self._knobs = knobs
    self._res_info = self._LoadResInfo(apk_path, output_directory)
    self._pattern_dollar_underscore = re.compile(r'\$(.*?)__\d+')
    self._pattern_version_suffix = re.compile(r'-v\d+/')

  @staticmethod
  def _ParseResInfoFile(res_info_path):
    """Parses a .res.info file into a dict of dest path -> source path.

    Every non-blank line has the form "dest,source". A line whose dest starts
    with "Rename:" maps the renamed path to another dest entry, whose source
    it inherits (following at most two rename hops). Renames whose target has
    no known source are dropped.

    Args:
      res_info_path: Path to the .res.info file.

    Returns:
      A dict of dest path -> source path.

    Raises:
      ValueError: If a non-blank line does not contain exactly one comma.
    """
    res_info = {}
    renames = {}
    with open(res_info_path, 'r') as info_file:
      for line in info_file:
        line = line.strip()
        # Blank lines (e.g. a trailing empty line) carry no entry; skip them
        # instead of failing to unpack.
        if not line:
          continue
        dest, source = line.split(',')
        # Allow indirection due to renames.
        if dest.startswith('Rename:'):
          dest = dest.split(':', 1)[1]
          renames[dest] = source
        else:
          res_info[dest] = source
    # .items() works on both Python 2 and Python 3; this dict is small.
    for dest, renamed_dest in renames.items():
      # Allow one more level of indirection due to renaming renamed files
      renamed_dest = renames.get(renamed_dest, renamed_dest)
      actual_source = res_info.get(renamed_dest)
      if actual_source:
        res_info[dest] = actual_source
    return res_info

  def _LoadResInfo(self, apk_path, output_directory):
    """Returns the dict of in-apk path -> source path for |apk_path|."""
    apk_name = os.path.basename(apk_path)
    apk_res_info_name = apk_name + '.res.info'
    apk_res_info_path = os.path.join(
        output_directory, 'size-info', apk_res_info_name)
    res_info_without_root = self._ParseResInfoFile(apk_res_info_path)
    # We package resources in the res/ folder only in the apk.
    res_info = {
        os.path.join('res', dest): source
        for dest, source in res_info_without_root.items()
    }
    res_info.update(self._knobs.apk_other_files)
    return res_info

  def FindSourceForPath(self, path):
    """Returns the source path recorded for |path|, or None if unknown.

    Lookup is attempted on progressively normalized forms of |path|. When
    nothing matches and |path| is not listed in
    |knobs.apk_expected_other_files|, a warning is logged.
    """
    original_path = path
    # Sometimes android adds $ in front and __# before extension.
    path = self._pattern_dollar_underscore.sub(r'\1', path)
    ret = self._res_info.get(path)
    if ret:
      return ret
    # Android build tools may append extra -v flags for the root dir.
    path = self._pattern_version_suffix.sub('/', path)
    ret = self._res_info.get(path)
    if ret:
      return ret
    if original_path not in self._knobs.apk_expected_other_files:
      logging.warning('Unexpected file in apk: %s', original_path)
    return None
def _ParsePakInfoFile(pak_info_path):
  """Parses a .pak.info file into a dict of resource id -> (name, path).

  Every line must have the form "name,resource_id,path". The path has
  surrounding whitespace (including the newline) removed.
  """
  res_info = {}
  with open(pak_info_path, 'r') as info_file:
    for entry in info_file:
      name, res_id, path = entry.split(',')
      res_info[int(res_id)] = (name, path.strip())
  return res_info
def _ParsePakSymbols(
    section_sizes, symbols_by_id, object_paths_by_pak_id):
  """Turns pak symbols into a sorted list and records their section sizes.

  Each symbol is given an object path taken from |object_paths_by_pak_id|
  (paths are popped from those collections). Every additional path yields an
  alias symbol. Sizes are truncated to ints, and one extra symbol is appended
  to hold the total that truncation removed.

  Args:
    section_sizes: Dict of section name -> size, updated in place.
    symbols_by_id: Dict of id -> symbol, as built by _ComputePakFileSymbols().
    object_paths_by_pak_id: Dict of pak resource id -> object paths.

  Returns:
    The list of symbols, including aliases and the extra overhead symbol.
  """
  raw_symbols = []
  for resource_id, symbol in symbols_by_id.iteritems():
    raw_symbols.append(symbol)
    paths = object_paths_by_pak_id.get(resource_id)
    if paths:
      symbol.object_path = paths.pop()
      if paths:
        # Every remaining path gets its own symbol sharing one alias list.
        aliases = symbol.aliases or [symbol]
        symbol.aliases = aliases
        for path in paths:
          new_sym = models.Symbol(
              symbol.section_name, symbol.size, address=symbol.address,
              full_name=symbol.full_name, object_path=path, aliases=aliases)
          aliases.append(new_sym)
          raw_symbols.append(new_sym)

  def sort_key(sym):
    return (sym.section_name, sym.address, sym.object_path)

  raw_symbols.sort(key=sort_key)

  raw_total = 0.0
  int_total = 0
  for symbol in raw_symbols:
    raw_total += symbol.size
    # We truncate rather than round to ensure that we do not over attribute. It
    # is easier to add another symbol to make up the difference.
    symbol.size = int(symbol.size)
    int_total += symbol.size

  # Attribute excess to translations since only those are compressed.
  truncation_loss = int(round(raw_total - int_total))
  raw_symbols.append(models.Symbol(
      models.SECTION_PAK_TRANSLATIONS, truncation_loss,
      full_name='Overhead: Pak compression artifacts'))

  for symbol in raw_symbols:
    section_sizes[symbol.section_name] = (
        section_sizes.get(symbol.section_name, 0) + symbol.size)
  return raw_symbols
def _ParseApkElfSectionSize(section_sizes, metadata, apk_elf_result):
  """Returns (section_sizes, elf_overhead_size) for the .so inside the apk.

  Without |metadata|, |section_sizes| is returned unchanged together with an
  overhead of 0. Otherwise the values come from |apk_elf_result|, and for
  arm / arm64 the size of the packed relocation section found in
  |section_sizes| is added under a "<section> (unpacked)" key.
  """
  if not metadata:
    return section_sizes, 0

  logging.debug('Extracting section sizes from .so within .apk')
  apk_build_id, apk_section_sizes, elf_overhead_size = apk_elf_result.get()
  assert apk_build_id == metadata[models.METADATA_ELF_BUILD_ID], (
      'BuildID from apk_elf_result did not match')

  # Packing is enabled only for arm32 & arm64.
  architecture = metadata[models.METADATA_ELF_ARCHITECTURE]
  if architecture == 'arm':
    packed_section_name = '.rel.dyn'
  elif architecture == 'arm64':
    packed_section_name = '.rela.dyn'
  else:
    packed_section_name = None

  if packed_section_name:
    logging.debug('Recording size of unpacked relocations')
    if packed_section_name in section_sizes:
      apk_section_sizes['%s (unpacked)' % packed_section_name] = (
          section_sizes.get(packed_section_name))
    else:
      logging.warning('Packed section not present: %s', packed_section_name)
  return apk_section_sizes, elf_overhead_size
def _ParseDexSymbols(section_sizes, apk_path, output_directory):
  """Returns the dex symbols of |apk_path| and adds their size to the dex section.

  |section_sizes| is updated in place under models.SECTION_DEX.
  """
  dex_symbols = apkanalyzer.CreateDexSymbols(apk_path, output_directory)
  previous_size = section_sizes.get(models.SECTION_DEX, 0)
  dex_total = sum(symbol.size for symbol in dex_symbols)
  section_sizes[models.SECTION_DEX] = previous_size + dex_total
  return dex_symbols
def _ParseApkOtherSymbols(section_sizes, apk_path, apk_so_path,
                          output_directory, knobs):
  """Creates symbols for apk entries not covered by ELF, pak or dex symbols.

  One symbol is created per remaining zip entry, sized by its compressed
  size, plus one symbol for the bytes of the .apk file that no entry's
  compressed data accounts for. |section_sizes| is updated in place under
  models.SECTION_OTHER.

  Returns:
    The list of created symbols.
  """
  res_source_mapper = _ResourceSourceMapper(apk_path, output_directory, knobs)
  apk_symbols = []
  zip_info_total = 0
  with zipfile.ZipFile(apk_path) as apk_zip:
    for zip_info in apk_zip.infolist():
      # Every entry counts towards the total, including the skipped ones.
      zip_info_total += zip_info.compress_size
      entry_name = zip_info.filename
      # Skip main shared library, pak, and dex files as they are accounted for.
      if entry_name == apk_so_path or entry_name.endswith(('.dex', '.pak')):
        continue
      source_path = res_source_mapper.FindSourceForPath(entry_name)
      if source_path is None:
        source_path = os.path.join(models.APK_PREFIX_PATH, entry_name)
      # Full name must disambiguate
      apk_symbols.append(models.Symbol(
          models.SECTION_OTHER, zip_info.compress_size,
          source_path=source_path,
          full_name=entry_name))

  overhead_size = os.path.getsize(apk_path) - zip_info_total
  assert overhead_size >= 0, 'Apk overhead must be non-negative'
  apk_symbols.append(models.Symbol(
      models.SECTION_OTHER, overhead_size, full_name='Overhead: APK file'))

  other_total = sum(symbol.size for symbol in apk_symbols)
  section_sizes[models.SECTION_OTHER] = (
      section_sizes.get(models.SECTION_OTHER, 0) + other_total)
  return apk_symbols
def _CreatePakObjectMap(object_paths_by_name):
  """Returns a dict of pak resource id -> object paths.

  IDS_ macro usages result in templated function calls that contain the
  resource ID in them. These names are collected along with all other symbols
  by running "nm" on them. We just need to extract the values from them.

  Args:
    object_paths_by_name: Dict of symbol name -> object paths.
  """
  prefix = 'void ui::WhitelistedResource<'
  id_begin = len(prefix)
  id_end = -len('>()')
  return {
      int(name[id_begin:id_end]): object_paths_by_name[name]
      for name in object_paths_by_name
      if name.startswith(prefix)
  }
def _FindPakSymbolsFromApk(apk_path, output_directory, knobs):
  """Returns a dict of id -> symbol for all .pak files inside |apk_path|.

  Resource names come from <output_directory>/size-info/<apk name>.pak.info.
  Pak files stored compressed in the apk have their sizes scaled by
  |knobs.pak_compression_ratio|; the ratio actually observed is logged for
  comparison.
  """
  symbols_by_id = {}
  total_compressed_size = 0
  total_uncompressed_size = 0
  with zipfile.ZipFile(apk_path) as apk_zip:
    pak_info_path = os.path.join(
        output_directory, 'size-info',
        os.path.basename(apk_path) + '.pak.info')
    res_info = _ParsePakInfoFile(pak_info_path)
    for zip_info in apk_zip.infolist():
      if not zip_info.filename.endswith('.pak'):
        continue
      contents = data_pack.ReadDataPackFromString(apk_zip.read(zip_info))
      if zip_info.compress_size < zip_info.file_size:
        # Only entries that were actually compressed feed the observed ratio.
        total_compressed_size += zip_info.compress_size
        total_uncompressed_size += zip_info.file_size
        compression_ratio = knobs.pak_compression_ratio
      else:
        compression_ratio = 1.0
      _ComputePakFileSymbols(
          zip_info.filename, contents,
          res_info, symbols_by_id, compression_ratio=compression_ratio)

  if total_uncompressed_size > 0:
    actual_ratio = float(total_compressed_size) / total_uncompressed_size
    ratio_diff = knobs.pak_compression_ratio - actual_ratio
    logging.info('Pak Compression Ratio: %f Actual: %f Diff: %.0f',
                 knobs.pak_compression_ratio, actual_ratio,
                 ratio_diff * total_uncompressed_size)
  return symbols_by_id
def _FindPakSymbolsFromFiles(pak_files, pak_info_path, output_directory):
  """Uses files from args to find and add pak symbols.

  Args:
    pak_files: Iterable of paths to .pak files.
    pak_info_path: Path to the .pak.info file describing the resource ids.
    output_directory: Directory that the recorded pak file names are made
        relative to.

  Returns:
    A dict of id -> symbol, as filled in by _ComputePakFileSymbols().
  """
  res_info = _ParsePakInfoFile(pak_info_path)
  symbols_by_id = {}
  for pak_file_path in pak_files:
    # .pak files are binary: read them in binary mode so that no newline
    # translation or text decoding is applied to their bytes.
    with open(pak_file_path, 'rb') as f:
      contents = data_pack.ReadDataPackFromString(f.read())
    _ComputePakFileSymbols(
        os.path.relpath(pak_file_path, output_directory), contents, res_info,
        symbols_by_id)
  return symbols_by_id
def _CalculateElfOverhead(section_sizes, elf_path):
  """Returns the bytes of |elf_path| not accounted for by |section_sizes|.

  The size of the models.SECTION_BSS entry is excluded from the section
  total. Returns 0 when |elf_path| is not given.
  """
  if not elf_path:
    return 0
  sized_sections_total = sum(
      size for name, size in section_sizes.iteritems()
      if name != models.SECTION_BSS)
  elf_overhead_size = os.path.getsize(elf_path) - sized_sections_total
  assert elf_overhead_size >= 0, (
      'Negative ELF overhead {}'.format(elf_overhead_size))
  return elf_overhead_size
def CreateSectionSizesAndSymbols(
    map_path=None, tool_prefix=None, output_directory=None, elf_path=None,
    apk_path=None, track_string_literals=True, metadata=None,
    apk_so_path=None, pak_files=None, pak_info_file=None, linker_name=None,
    knobs=SectionSizeKnobs()):
  """Creates sections sizes and symbols for a SizeInfo.

  Symbols are gathered from the ELF / linker map, and then, depending on the
  inputs, from the .apk (pak, dex and other files) or from standalone .pak
  files. Source paths and components are then filled in and aliases are
  compacted and connected.

  Args:
    map_path: Path to the linker .map(.gz) file to parse.
    tool_prefix: Prefix for c++filt & nm (required).
    output_directory: Build output directory. If None, source_paths and symbol
        alias information will not be recorded.
    elf_path: Path to the corresponding unstripped ELF file. Used to find symbol
        aliases and inlined functions. Can be None.
    apk_path: Path to the .apk file to measure.
    track_string_literals: Whether to break down "** merge string" sections into
        smaller symbols (requires output_directory).
    metadata: Metadata dict from CreateMetadata().
    apk_so_path: Path to an .so file within an APK file.
    pak_files: List of paths to .pak files. Only used when apk_path is not
        given and pak_info_file is also provided.
    pak_info_file: Path to a .pak.info file.
    linker_name: A coded linker name (see linker_map_parser.py).
    knobs: Instance of SectionSizeKnobs with tunable knobs and options. The
        default instance is created once, when this function is defined, and
        is shared by every call that omits the argument.

  Returns:
    A tuple of (section_sizes, raw_symbols).
    section_sizes is a dict mapping section names to their size
    raw_symbols is a list of Symbol objects
  """
  # apk_elf_result is bound only when both paths are given; its single use
  # further down is guarded by the same two conditions.
  if apk_path and elf_path:
    # Extraction takes around 1 second, so do it in parallel.
    apk_elf_result = concurrent.ForkAndCall(
        _ElfInfoFromApk, (apk_path, apk_so_path, tool_prefix))
  outdir_context = None
  source_mapper = None
  if output_directory:
    # Start by finding the elf_object_paths, so that nm can run on them while
    # the linker .map is being parsed.
    logging.info('Parsing ninja files.')
    source_mapper, ninja_elf_object_paths = (
        ninja_parser.Parse(output_directory, elf_path))
    logging.debug('Parsed %d .ninja files.', source_mapper.parsed_file_count)
    # When an ELF path is given, its link command must have been found.
    assert not elf_path or ninja_elf_object_paths, (
        'Failed to find link command in ninja files for ' +
        os.path.relpath(elf_path, output_directory))
    if ninja_elf_object_paths:
      elf_object_paths, thin_archives = ar.ExpandThinArchives(
          ninja_elf_object_paths, output_directory)
      # Known inputs are both the expanded paths and the paths as listed by
      # ninja.
      known_inputs = set(elf_object_paths)
      known_inputs.update(ninja_elf_object_paths)
    else:
      elf_object_paths = None
      known_inputs = None
      # When we don't know which elf file is used, just search all paths.
      thin_archives = set(
          p for p in source_mapper.IterAllPaths()
          if p.endswith('.a') and ar.IsThinArchive(
              os.path.join(output_directory, p)))
    outdir_context = _OutputDirectoryContext(
        elf_object_paths=elf_object_paths,
        known_inputs=known_inputs,
        output_directory=output_directory,
        source_mapper=source_mapper,
        thin_archives=thin_archives)
  section_sizes, raw_symbols, object_paths_by_name = _ParseElfInfo(
      map_path, elf_path, tool_prefix, track_string_literals,
      outdir_context=outdir_context, linker_name=linker_name)
  elf_overhead_size = _CalculateElfOverhead(section_sizes, elf_path)
  pak_symbols_by_id = None
  if apk_path:
    pak_symbols_by_id = _FindPakSymbolsFromApk(
        apk_path, output_directory, knobs)
    if elf_path:
      # Replaces both the section sizes and the overhead computed above with
      # the values returned for the .so inside the apk.
      section_sizes, elf_overhead_size = _ParseApkElfSectionSize(
          section_sizes, metadata, apk_elf_result)
    raw_symbols.extend(
        _ParseDexSymbols(section_sizes, apk_path, output_directory))
    raw_symbols.extend(
        _ParseApkOtherSymbols(section_sizes, apk_path, apk_so_path,
                              output_directory, knobs))
  elif pak_files and pak_info_file:
    # Standalone .pak files are only considered when no .apk is given.
    pak_symbols_by_id = _FindPakSymbolsFromFiles(
        pak_files, pak_info_file, output_directory)
  if elf_path:
    # Record the ELF overhead as a symbol and add it to the "other" section.
    elf_overhead_symbol = models.Symbol(
        models.SECTION_OTHER, elf_overhead_size, full_name='Overhead: ELF file')
    prev = section_sizes.setdefault(models.SECTION_OTHER, 0)
    section_sizes[models.SECTION_OTHER] = prev + elf_overhead_size
    raw_symbols.append(elf_overhead_symbol)
  if pak_symbols_by_id:
    logging.debug('Extracting pak IDs from symbol names, and creating symbols')
    object_paths_by_pak_id = _CreatePakObjectMap(object_paths_by_name)
    pak_raw_symbols = _ParsePakSymbols(
        section_sizes, pak_symbols_by_id, object_paths_by_pak_id)
    raw_symbols.extend(pak_raw_symbols)
  # source_mapper is None here when no output_directory was given.
  _ExtractSourcePathsAndNormalizeObjectPaths(raw_symbols, source_mapper)
  _PopulateComponents(raw_symbols, knobs)
  logging.info('Converting excessive aliases into shared-path symbols')
  _CompactLargeAliasesIntoSharedSymbols(raw_symbols, knobs)
  logging.debug('Connecting nm aliases')
  _ConnectNmAliases(raw_symbols)
  return section_sizes, raw_symbols
def CreateSizeInfo(
    section_sizes, raw_symbols, metadata=None, normalize_names=True):
  """Performs operations on all symbols and creates a SizeInfo object.

  |raw_symbols| is sorted in place and has padding calculated. Names are
  normalized only when |normalize_names| is set.
  """
  def sort_key(symbol):
    return (symbol.IsPak(), symbol.IsBss(), symbol.section_name,
            symbol.address)

  logging.debug('Sorting %d symbols', len(raw_symbols))
  # TODO(agrieve): Either change this sort so that it's only sorting by section
  #     (and not using .sort()), or have it specify a total ordering (which must
  #     also include putting padding-only symbols before others of the same
  #     address). Note: The sort as-is takes ~1.5 seconds.
  raw_symbols.sort(key=sort_key)
  logging.info('Processed %d symbols', len(raw_symbols))

  # Padding not really required, but it is useful to check for large padding and
  # log a warning.
  logging.info('Calculating padding')
  _CalculatePadding(raw_symbols)

  # Do not call _NormalizeNames() during archive since that method tends to need
  # tweaks over time. Calling it only when loading .size files allows for more
  # flexibility.
  if normalize_names:
    _NormalizeNames(raw_symbols)

  return models.SizeInfo(section_sizes, raw_symbols, metadata=metadata)
def _DetectGitRevision(directory):
  """Runs git rev-parse to get the SHA1 hash of the current revision.

  Args:
    directory: Path to directory where rev-parse command will be run.

  Returns:
    A string with the SHA1 hash, or None if an error occurred.
  """
  command = ['git', '-C', directory, 'rev-parse', 'HEAD']
  try:
    return subprocess.check_output(command).rstrip()
  except Exception:
    # Best effort: a missing revision only means less file metadata.
    logging.warning('Failed to detect git revision for file metadata.')
  return None
def BuildIdFromElf(elf_path, tool_prefix):
  """Returns the Build ID found in the output of readelf -n for |elf_path|."""
  readelf_cmd = [path_util.GetReadElfPath(tool_prefix), '-n', elf_path]
  notes_output = subprocess.check_output(readelf_cmd)
  build_id_match = re.search(r'Build ID: (\w+)', notes_output)
  assert build_id_match, (
      'Build ID not found from running: ' + ' '.join(readelf_cmd))
  return build_id_match.group(1)
def _SectionSizesFromElf(elf_path, tool_prefix):
  """Returns a dict of section name -> size parsed from readelf -S --wide."""
  readelf_cmd = [
      path_util.GetReadElfPath(tool_prefix), '-S', '--wide', elf_path]
  sections_output = subprocess.check_output(readelf_cmd)
  # Matches  [ 2] .hash HASH 00000000006681f0 0001f0 003154 04 A 3 0 8
  section_rows = (
      match.group(1).split()
      for match in re.finditer(
          r'\[[\s\d]+\] (\..*)$', sections_output, re.MULTILINE))
  # The first field is the section name and the fifth is its size in hex.
  return {row[0]: int(row[4], 16) for row in section_rows}
def _ArchFromElf(elf_path, tool_prefix):
  """Returns the architecture for the machine that readelf -h reports.

  Known machine strings are mapped to 'x86', 'x64', 'arm' or 'arm64'. Any
  other machine string is returned unchanged.

  Raises:
    AttributeError: If the readelf output has no "Machine:" line.
  """
  args = [path_util.GetReadElfPath(tool_prefix), '-h', elf_path]
  stdout = subprocess.check_output(args)
  # Raw string: '\s' is not a valid escape sequence in a plain string literal.
  machine = re.search(r'Machine:\s*(.+)', stdout).group(1)
  arch_by_machine = {
      'Intel 80386': 'x86',
      'Advanced Micro Devices X86-64': 'x64',
      'ARM': 'arm',
      'AArch64': 'arm64',
  }
  return arch_by_machine.get(machine, machine)
def _ParseGnArgs(args_path):
  """Returns a list of normalized "key=value" strings.

  Lines that do not contain exactly one '=' once '#' comments are removed are
  ignored. When a key appears more than once, the last value wins. The result
  is sorted by key.

  Args:
    args_path: Path to the GN args file.
  """
  args = {}
  with open(args_path) as f:
    for line in f:
      # Strips #s even if within string literal. Not a problem in practice.
      parts = line.split('#')[0].split('=')
      if len(parts) != 2:
        continue
      args[parts[0].strip()] = parts[1].strip()
  # .items() works on both Python 2 and Python 3 (unlike .iteritems()).
  return ["%s=%s" % x for x in sorted(args.items())]
def _DetectLinkerName(map_path):
  """Returns the linker name detected from the contents of |map_path|."""
  with _OpenMaybeGz(map_path) as map_file:
    linker_name = linker_map_parser.DetectLinkerNameFromMapFile(map_file)
  return linker_name
def _ElfInfoFromApk(apk_path, apk_so_path, tool_prefix):
  """Returns a tuple of (build_id, section_sizes, elf_overhead_size).

  The .so at |apk_so_path| is copied out of the apk into a temporary file so
  that it can be inspected by path.
  """
  with zipfile.ZipFile(apk_path) as apk:
    with tempfile.NamedTemporaryFile() as so_file:
      so_file.write(apk.read(apk_so_path))
      # Make sure the bytes are on disk before other tools open the file.
      so_file.flush()
      so_path = so_file.name
      build_id = BuildIdFromElf(so_path, tool_prefix)
      section_sizes = _SectionSizesFromElf(so_path, tool_prefix)
      elf_overhead_size = _CalculateElfOverhead(section_sizes, so_path)
      return build_id, section_sizes, elf_overhead_size
def _AutoIdentifyInputFile(args):
  """Sets the apk_file / elf_file / map_file attribute matching args.f.

  The type is taken from the description printed by the "file" command.

  Returns:
    True if the type was recognized (and the attribute set), False otherwise.
  """
  file_output = subprocess.check_output(['file', args.f])
  format_text = file_output[file_output.find(': ') + 2:]
  # (description prefixes, attribute to set, message to log)
  known_formats = (
      (('Java archive data', 'Zip archive data'), 'apk_file',
       'Auto-identified --apk-file.'),
      (('ELF ',), 'elf_file', 'Auto-identified --elf-file.'),
      (('ASCII text',), 'map_file', 'Auto-identified --map-file.'),
  )
  # File-not-found -> 'cannot ...' and directory -> 'directory', which don't
  # match anything here, so they are handled by the final 'return False'.
  for prefixes, attr_name, message in known_formats:
    if format_text.startswith(prefixes):
      logging.info(message)
      setattr(args, attr_name, args.f)
      return True
  return False
def AddMainPathsArguments(parser):
  """Add arguments for DeduceMainPaths()."""
  # (flag, keyword arguments) in the order the arguments are registered.
  argument_specs = (
      ('-f', dict(metavar='FILE',
                  help='Auto-identify input file type.')),
      ('--apk-file', dict(
          help='.apk file to measure. When set, --elf-file will be '
               'derived (if unset). Providing the .apk allows '
               'for the size of packed relocations to be recorded')),
      ('--elf-file', dict(
          help='Path to input ELF file. Currently used for '
               'capturing metadata.')),
      ('--map-file', dict(
          help='Path to input .map(.gz) file. Defaults to '
               '{{elf_file}}.map(.gz)?. If given without '
               '--elf-file, no size metadata will be recorded.')),
      ('--no-source-paths', dict(
          action='store_true',
          help='Do not use .ninja files to map '
               'object_path -> source_path')),
      ('--output-directory', dict(
          help='Path to the root build directory.')),
      ('--tool-prefix', dict(
          help='Path prefix for c++filt, nm, readelf.')),
  )
  for flag, kwargs in argument_specs:
    parser.add_argument(flag, **kwargs)
def AddArguments(parser):
  """Adds the arguments of the archive command to |parser|.

  This covers the output .size file, the pak and string-literal options, and
  everything registered by AddMainPathsArguments().
  """
  add = parser.add_argument
  add('size_file', help='Path to output .size file.')
  add('--pak-file', action='append', help='Paths to pak files.')
  add('--pak-info-file',
      help='This file should contain all ids found in the pak '
           'files that have been passed in.')
  add('--no-string-literals',
      dest='track_string_literals',
      default=True,
      action='store_false',
      help='Disable breaking down "** merge strings" into more '
           'granular symbols.')
  add('--source-directory',
      help='Custom path to the root source directory.')
  AddMainPathsArguments(parser)
def DeduceMainPaths(args, parser):
  """Computes main paths based on input, and deduces them if needed.

  Args:
    args: Parsed arguments, as registered by AddMainPathsArguments().
    parser: Parser whose error() method is used to report unusable input.

  Returns:
    A tuple of (output_directory, tool_prefix, apk_path, apk_so_path,
    elf_path, map_path, linker_name). output_directory is None when
    --no-source-paths was passed; apk_so_path is None when no .apk was given.
  """
  if args.f is not None:
    if not _AutoIdentifyInputFile(args):
      parser.error('Cannot find or identify file %s' % args.f)

  apk_path = args.apk_file
  elf_path = args.elf_file
  map_path = args.map_file
  any_input = apk_path or elf_path or map_path
  if not any_input:
    parser.error('Must pass at least one of --apk-file, --elf-file, --map-file')
  output_directory_finder = path_util.OutputDirectoryFinder(
      value=args.output_directory,
      any_path_within_output_directory=any_input)

  apk_so_path = None
  if apk_path:
    with zipfile.ZipFile(apk_path) as z:
      lib_infos = [f for f in z.infolist()
                   if f.filename.endswith('.so') and f.file_size > 0]
    assert lib_infos, 'APK has no .so files.'
    # TODO(agrieve): Add support for multiple .so files, and take into account
    #     secondary architectures.
    apk_so_path = max(lib_infos, key=lambda x: x.file_size).filename
    logging.debug('Sub-apk path=%s', apk_so_path)
    if not elf_path and output_directory_finder.Tentative():
      # Look for the unstripped library of the same name in the tentative
      # output directory.
      elf_path = os.path.join(
          output_directory_finder.Tentative(), 'lib.unstripped',
          os.path.basename(apk_so_path.replace('crazy.', '')))
      logging.debug('Detected --elf-file=%s', elf_path)

  if map_path:
    if not map_path.endswith('.map') and not map_path.endswith('.map.gz'):
      parser.error('Expected --map-file to end with .map or .map.gz')
  else:
    # Without an ELF path (e.g. an .apk was given but no output directory
    # could be found) there is nothing to derive the map path from. Report
    # that instead of failing on None + '.map'.
    if not elf_path:
      parser.error('Could not deduce --elf-file. Pass --elf-file or '
                   '--map-file explicitly.')
    map_path = elf_path + '.map'
    if not os.path.exists(map_path):
      map_path += '.gz'
    if not os.path.exists(map_path):
      parser.error('Could not find .map(.gz)? file. Ensure you have built with '
                   'is_official_build=true and generate_linker_map=true, or '
                   'use --map-file to point me a linker map file.')

  linker_name = _DetectLinkerName(map_path)
  logging.info('Linker name: %s', linker_name)
  tool_prefix_finder = path_util.ToolPrefixFinder(
      value=args.tool_prefix,
      output_directory_finder=output_directory_finder,
      linker_name=linker_name)
  tool_prefix = tool_prefix_finder.Finalized()

  output_directory = None
  if not args.no_source_paths:
    output_directory = output_directory_finder.Finalized()
  return (output_directory, tool_prefix, apk_path, apk_so_path, elf_path,
          map_path, linker_name)
def Run(args, parser):
  """Creates a .size file from the inputs described by |args|.

  Args:
    args: Parsed arguments, as registered by AddArguments().
    parser: Parser whose error() method is used to report unusable input.
  """
  if not args.size_file.endswith('.size'):
    parser.error('size_file must end with .size')

  main_paths = DeduceMainPaths(args, parser)
  (output_directory, tool_prefix, apk_path, apk_so_path, elf_path, map_path,
   linker_name) = main_paths

  metadata = CreateMetadata(map_path, elf_path, apk_path, tool_prefix,
                            output_directory, linker_name)

  knobs = SectionSizeKnobs()
  if args.source_directory:
    knobs.src_root = args.source_directory

  section_sizes, raw_symbols = CreateSectionSizesAndSymbols(
      map_path=map_path,
      tool_prefix=tool_prefix,
      elf_path=elf_path,
      apk_path=apk_path,
      output_directory=output_directory,
      track_string_literals=args.track_string_literals,
      metadata=metadata,
      apk_so_path=apk_so_path,
      pak_files=args.pak_file,
      pak_info_file=args.pak_info_file,
      linker_name=linker_name,
      knobs=knobs)
  # Names are left as-is here (normalize_names=False).
  size_info = CreateSizeInfo(
      section_sizes, raw_symbols, metadata=metadata, normalize_names=False)

  # Only build the coverage description when it will actually be logged.
  if logging.getLogger().isEnabledFor(logging.INFO):
    for line in describe.DescribeSizeInfoCoverage(size_info):
      logging.info(line)
  logging.info('Recorded info for %d symbols', len(size_info.raw_symbols))
  metadata_lines = describe.DescribeMetadata(size_info.metadata)
  logging.info('Recording metadata: \n %s', '\n '.join(metadata_lines))

  logging.info('Saving result to %s', args.size_file)
  file_format.SaveSizeInfo(size_info, args.size_file)
  size_in_mb = os.path.getsize(args.size_file) / 1024.0 / 1024.0
  logging.info('Done. File size is %.2fMiB.', size_in_mb)