blob: f73b0fe56f42d1431ac4da1ddf6208565284345a [file] [log] [blame]
# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Classes that comprise the data model for binary size analysis."""
import collections
import copy
import os
import re
SECTION_TO_SECTION_NAME = {
'b': '.bss',
'd': '.data',
'r': '.rodata',
't': '.text',
}
class SizeInfo(object):
"""Represents all size information for a single binary.
Fields:
section_sizes: A dict of section_name -> size.
symbols: A SymbolGroup (or SymbolDiff) with all symbols in it.
"""
__slots__ = (
'section_sizes',
'symbols',
'tag',
'timestamp',
)
"""Root size information."""
def __init__(self, section_sizes, symbols, timestamp=None, tag=''):
self.section_sizes = section_sizes # E.g. {'.text': 0}
self.symbols = symbols # List of symbols sorted by address per-section.
self.timestamp = timestamp # UTC datetime object.
self.tag = tag # E.g. git revision.
assert not tag or '\n' not in tag # Simplifies file format.
class BaseSymbol(object):
"""Base class for Symbol and SymbolGroup."""
__slots__ = ()
@property
def section(self):
"""Returns the one-letter section.
E.g. If section_name == '.rodata', then section == 'r'.
"""
return self.section_name[1]
@property
def size_without_padding(self):
return self.size - self.padding
@property
def end_address(self):
return self.address + self.size_without_padding
def IsBss(self):
return self.section_name == '.bss'
def IsGroup(self):
return False
def IsGenerated(self):
# TODO(agrieve): Also match generated functions such as:
# startup._GLOBAL__sub_I_page_allocator.cc
return self.name.endswith(']') and not self.name.endswith('[]')
def _Key(self):
"""Returns a tuple that can be used to see if two Symbol are the same.
Keys are not guaranteed to be unique within a SymbolGroup. For example, it
is common to have multiple "** merge strings" symbols, which will have a
common key."""
return (self.section_name, self.full_name or self.name)
class Symbol(BaseSymbol):
"""Represents a single symbol within a binary."""
__slots__ = (
'address',
'full_name',
'is_anonymous',
'object_path',
'name',
'flags',
'padding',
'section_name',
'source_path',
'size',
)
def __init__(self, section_name, size_without_padding, address=None,
name=None, source_path=None, object_path=None,
full_name=None, is_anonymous=False):
self.section_name = section_name
self.address = address or 0
self.name = name or ''
self.full_name = full_name or ''
self.source_path = source_path or ''
self.object_path = object_path or ''
self.size = size_without_padding
# Change this to be a bitfield of flags if ever there is a need to add
# another similar thing.
self.is_anonymous = is_anonymous
self.padding = 0
def __repr__(self):
return '%s@%x(size=%d,padding=%d,name=%s,path=%s,anon=%d)' % (
self.section_name, self.address, self.size_without_padding,
self.padding, self.name, self.source_path or self.object_path,
int(self.is_anonymous))
class SymbolGroup(BaseSymbol):
"""Represents a group of symbols using the same interface as Symbol.
SymbolGroups are immutable. All filtering / sorting will return new
SymbolGroups objects.
"""
__slots__ = (
'symbols',
'filtered_symbols',
'name',
'section_name',
)
def __init__(self, symbols, filtered_symbols=None, name=None,
section_name=None):
self.symbols = symbols
self.filtered_symbols = filtered_symbols or []
self.name = name or ''
self.section_name = section_name or '.*'
def __repr__(self):
return 'Group(name=%s,count=%d,size=%d)' % (
self.name, len(self), self.size)
def __iter__(self):
return iter(self.symbols)
def __len__(self):
return len(self.symbols)
def __getitem__(self, index):
return self.symbols[index]
def __sub__(self, other):
other_ids = set(id(s) for s in other)
new_symbols = [s for s in self if id(s) not in other_ids]
return self._CreateTransformed(new_symbols, section_name=self.section_name)
def __add__(self, other):
self_ids = set(id(s) for s in self)
new_symbols = self.symbols + [s for s in other if id(s) not in self_ids]
return self._CreateTransformed(new_symbols, section_name=self.section_name)
@property
def address(self):
return 0
@property
def full_name(self):
return None
@property
def is_anonymous(self):
return False
@property
def source_path(self):
return None
@property
def size(self):
if self.IsBss():
return sum(s.size for s in self)
return sum(s.size for s in self if not s.IsBss())
@property
def padding(self):
return sum(s.padding for s in self)
def IsGroup(self):
return True
def _CreateTransformed(self, symbols, filtered_symbols=None, name=None,
section_name=None):
return SymbolGroup(symbols, filtered_symbols=filtered_symbols, name=name,
section_name=section_name)
def Sorted(self, cmp_func=None, key=None, reverse=False):
# Default to sorting by abs(size) then name.
if cmp_func is None and key is None:
cmp_func = lambda a, b: cmp((a.IsBss(), abs(b.size), a.name),
(b.IsBss(), abs(a.size), b.name))
new_symbols = sorted(self.symbols, cmp_func, key, reverse)
return self._CreateTransformed(new_symbols,
filtered_symbols=self.filtered_symbols,
section_name=self.section_name)
def SortedByName(self, reverse=False):
return self.Sorted(key=(lambda s:s.name), reverse=reverse)
def SortedByAddress(self, reverse=False):
return self.Sorted(key=(lambda s:s.address), reverse=reverse)
def SortedByCount(self, reverse=False):
return self.Sorted(key=(lambda s:len(s) if s.IsGroup() else 1),
reverse=not reverse)
def Filter(self, func):
filtered_and_kept = ([], [])
for symbol in self:
filtered_and_kept[int(bool(func(symbol)))].append(symbol)
return self._CreateTransformed(filtered_and_kept[1],
filtered_symbols=filtered_and_kept[0],
section_name=self.section_name)
def WhereBiggerThan(self, min_size):
return self.Filter(lambda s: s.size >= min_size)
def WhereInSection(self, section):
if len(section) == 1:
ret = self.Filter(lambda s: s.section == section)
ret.section_name = SECTION_TO_SECTION_NAME[section]
else:
ret = self.Filter(lambda s: s.section_name == section)
ret.section_name = section
return ret
def WhereIsGenerated(self):
return self.Filter(lambda s: s.IsGenerated())
def WhereNameMatches(self, pattern):
regex = re.compile(pattern)
return self.Filter(lambda s: regex.search(s.name))
def WhereObjectPathMatches(self, pattern):
regex = re.compile(pattern)
return self.Filter(lambda s: regex.search(s.object_path))
def WhereSourcePathMatches(self, pattern):
regex = re.compile(pattern)
return self.Filter(lambda s: regex.search(s.source_path))
def WherePathMatches(self, pattern):
regex = re.compile(pattern)
return self.Filter(lambda s: regex.search(s.source_path or s.object_path))
def WhereAddressInRange(self, start, end):
return self.Filter(lambda s: s.address >= start and s.address <= end)
def WhereHasAnyAttribution(self):
return self.Filter(lambda s: s.name or s.source_path or s.object_path)
def Inverted(self):
return self._CreateTransformed(self.filtered_symbols,
filtered_symbols=self.symbols)
def GroupBy(self, func, min_count=0):
"""Returns a SymbolGroup of SymbolGroups, indexed by |func|.
Args:
func: Grouping function. Passed a symbol and returns a string for the
name of the subgroup to put the symbol in. If None is returned, the
symbol is omitted.
min_count: Miniumum number of symbols for a group. If fewer than this many
symbols end up in a group, they will not be put within a group.
Use a negative value to omit symbols entirely rather than
include them outside of a group.
"""
new_syms = []
filtered_symbols = []
symbols_by_token = collections.defaultdict(list)
# Index symbols by |func|.
for symbol in self:
token = func(symbol)
if token is None:
filtered_symbols.append(symbol)
symbols_by_token[token].append(symbol)
# Create the subgroups.
include_singles = min_count >= 0
min_count = abs(min_count)
for token, symbols in symbols_by_token.iteritems():
if len(symbols) >= min_count:
new_syms.append(self._CreateTransformed(symbols, name=token,
section_name=self.section_name))
elif include_singles:
new_syms.extend(symbols)
else:
filtered_symbols.extend(symbols)
return self._CreateTransformed(new_syms, filtered_symbols=filtered_symbols,
section_name=self.section_name)
def GroupBySectionName(self):
return self.GroupBy(lambda s: s.section_name)
def GroupByNamespace(self, depth=0, fallback='{global}', min_count=0):
"""Groups by symbol namespace (as denoted by ::s).
Does not differentiate between C++ namespaces and C++ classes.
Args:
depth: When 0 (default), groups by entire namespace. When 1, groups by
top-level name, when 2, groups by top 2 names, etc.
fallback: Use this value when no namespace exists.
min_count: Miniumum number of symbols for a group. If fewer than this many
symbols end up in a group, they will not be put within a group.
Use a negative value to omit symbols entirely rather than
include them outside of a group.
"""
def extract_namespace(symbol):
# Remove template params.
name = symbol.name
template_idx = name.find('<')
if template_idx:
name = name[:template_idx]
# Remove after the final :: (not part of the namespace).
colon_idx = name.rfind('::')
if colon_idx == -1:
return fallback
name = name[:colon_idx]
return _ExtractPrefixBeforeSeparator(name, '::', depth)
return self.GroupBy(extract_namespace, min_count=min_count)
def GroupBySourcePath(self, depth=0, fallback='{no path}',
fallback_to_object_path=True, min_count=0):
"""Groups by source_path.
Args:
depth: When 0 (default), groups by entire path. When 1, groups by
top-level directory, when 2, groups by top 2 directories, etc.
fallback: Use this value when no namespace exists.
fallback_to_object_path: When True (default), uses object_path when
source_path is missing.
min_count: Miniumum number of symbols for a group. If fewer than this many
symbols end up in a group, they will not be put within a group.
Use a negative value to omit symbols entirely rather than
include them outside of a group.
"""
def extract_path(symbol):
path = symbol.source_path
if fallback_to_object_path and not path:
path = symbol.object_path
path = path or fallback
return _ExtractPrefixBeforeSeparator(path, os.path.sep, depth)
return self.GroupBy(extract_path, min_count=min_count)
def GroupByObjectPath(self, depth=0, fallback='{no path}', min_count=0):
"""Groups by object_path.
Args:
depth: When 0 (default), groups by entire path. When 1, groups by
top-level directory, when 2, groups by top 2 directories, etc.
fallback: Use this value when no namespace exists.
min_count: Miniumum number of symbols for a group. If fewer than this many
symbols end up in a group, they will not be put within a group.
Use a negative value to omit symbols entirely rather than
include them outside of a group.
"""
def extract_path(symbol):
path = symbol.object_path or fallback
return _ExtractPrefixBeforeSeparator(path, os.path.sep, depth)
return self.GroupBy(extract_path, min_count=min_count)
class SymbolDiff(SymbolGroup):
"""A SymbolGroup subclass representing a diff of two other SymbolGroups.
All Symbols contained within have a |size| which is actually the size delta.
Additionally, metadata is kept about which symbols were added / removed /
changed.
"""
__slots__ = (
'_added_ids',
'_removed_ids',
)
def __init__(self, added, removed, similar):
self._added_ids = set(id(s) for s in added)
self._removed_ids = set(id(s) for s in removed)
symbols = []
symbols.extend(added)
symbols.extend(removed)
symbols.extend(similar)
super(SymbolDiff, self).__init__(symbols)
def __repr__(self):
return '%s(%d added, %d removed, %d changed, %d unchanged, size=%d)' % (
'SymbolGroup', self.added_count, self.removed_count, self.changed_count,
self.unchanged_count, self.size)
def _CreateTransformed(self, symbols, filtered_symbols=None, name=None,
section_name=None):
ret = SymbolDiff.__new__(SymbolDiff)
# Printing sorts, so fast-path the same symbols case.
if len(symbols) == len(self.symbols):
ret._added_ids = self._added_ids
ret._removed_ids = self._removed_ids
else:
ret._added_ids = set(id(s) for s in symbols if self.IsAdded(s))
ret._removed_ids = set(id(s) for s in symbols if self.IsRemoved(s))
super(SymbolDiff, ret).__init__(symbols, filtered_symbols=filtered_symbols,
name=name, section_name=section_name)
return ret
@property
def added_count(self):
return len(self._added_ids)
@property
def removed_count(self):
return len(self._removed_ids)
@property
def changed_count(self):
not_changed = self.unchanged_count + self.added_count + self.removed_count
return len(self) - not_changed
@property
def unchanged_count(self):
return sum(1 for s in self if self.IsSimilar(s) and s.size == 0)
def IsAdded(self, sym):
return id(sym) in self._added_ids
def IsSimilar(self, sym):
key = id(sym)
return key not in self._added_ids and key not in self._removed_ids
def IsRemoved(self, sym):
return id(sym) in self._removed_ids
def WhereNotUnchanged(self):
return self.Filter(lambda s: not self.IsSimilar(s) or s.size)
def Diff(new, old):
"""Diffs two SizeInfo or SymbolGroup objects.
When diffing SizeInfos, ret.section_sizes are the result of |new| - |old|, and
ret.symbols will be a SymbolDiff.
When diffing SymbolGroups, a SymbolDiff is returned.
Returns:
Returns a SizeInfo when args are of type SizeInfo.
Returns a SymbolDiff when args are of type SymbolGroup.
"""
if isinstance(new, SizeInfo):
assert isinstance(old, SizeInfo)
section_sizes = {
k:new.section_sizes[k] - v for k, v in old.section_sizes.iteritems()}
symbol_diff = Diff(new.symbols, old.symbols)
return SizeInfo(section_sizes, symbol_diff)
assert isinstance(new, SymbolGroup) and isinstance(old, SymbolGroup)
symbols_by_key = collections.defaultdict(list)
for s in old:
symbols_by_key[s._Key()].append(s)
added = []
removed = []
similar = []
# For similar symbols, padding is zeroed out. In order to not lose the
# information entirely, store it in aggregate.
padding_by_section_name = collections.defaultdict(int)
for new_sym in new:
matching_syms = symbols_by_key.get(new_sym._Key())
if matching_syms:
old_sym = matching_syms.pop(0)
# More stable/useful to compare size without padding.
size_diff = (new_sym.size_without_padding -
old_sym.size_without_padding)
merged_sym = Symbol(new_sym.section_name, size_diff,
address=new_sym.address, name=new_sym.name,
source_path=new_sym.source_path,
object_path=new_sym.object_path,
full_name=new_sym.full_name,
is_anonymous=new_sym.is_anonymous)
similar.append(merged_sym)
padding_by_section_name[new_sym.section_name] += (
new_sym.padding - old_sym.padding)
else:
added.append(new_sym)
for remaining_syms in symbols_by_key.itervalues():
for old_sym in remaining_syms:
duped = copy.copy(old_sym)
duped.size = -duped.size
duped.padding = -duped.padding
removed.append(duped)
for section_name, padding in padding_by_section_name.iteritems():
similar.append(Symbol(section_name, padding,
name='** aggregate padding of delta symbols'))
return SymbolDiff(added, removed, similar)
def _ExtractPrefixBeforeSeparator(string, separator, count=1):
idx = -len(separator)
prev_idx = None
for _ in xrange(count):
idx = string.find(separator, idx + len(separator))
if idx < 0:
break
prev_idx = idx
return string[:prev_idx]