tools/binary_size/libsupersize/native.py - chromium/src.git - Git at Google

 # Copyright 2022 The Chromium Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Functions for creating native code symbols from ELF files."""

 import calendar
 import collections
 import dataclasses
 import datetime
 import itertools
 import logging
 import os
 import posixpath
 import re
 import subprocess
 import sys
 import tempfile

 import ar
 import archive_util
 import demangle
 import dwarfdump
 import linker_map_parser
 import models
 import ninja_parser
 import nm
 import obj_analyzer
 import parallel
 import path_util
 import readelf
 import string_extract
 import zip_util

 # When ensuring matching section sizes between .elf and .map files, these
 # sections should be ignored. When lld creates a combined library with
 # partitions, some sections (like .text) exist in each partition, but the ones
 # below are common. At library splitting time, llvm-objcopy pulls what's needed
 # from these sections into the new libraries. Hence, the ELF sections will end
 # up smaller than the combined .map file sections.
 # Consulted by the section size validation in _ParseElfInfo().
 _SECTION_SIZE_BLOCKLIST = ['.symtab', '.shstrtab', '.strtab']

 # A limit on the number of symbols an address can have before these symbols
 # are compacted into shared symbols. Increasing this value causes more data
 # to be stored in .size files, and is also more expensive.
 # Effect as of Oct 2017, with min_pss = max:
 # 1: shared .text syms = 1772874 bytes, file size = 9.43MiB (645476 syms).
 # 2: shared .text syms = 1065654 bytes, file size = 9.58MiB (669952 syms).
 # 6: shared .text syms = 464058 bytes, file size = 10.11MiB (782693 syms).
 # 10: shared .text syms = 365648 bytes, file size = 10.24MiB (813758 syms).
 # 20: shared .text syms = 86202 bytes, file size = 10.38MiB (854548 syms).
 # 40: shared .text syms = 48424 bytes, file size = 10.50MiB (890396 syms).
 # 50: shared .text syms = 41860 bytes, file size = 10.54MiB (902304 syms).
 # max: shared .text syms = 0 bytes, file size = 11.10MiB (1235449 syms).
 _MAX_SAME_NAME_ALIAS_COUNT = 40  # 50kb is basically negligible.


 # Holds computation state that is live only when an output directory exists.
 # Built by CreateSymbols() and consumed by _ParseElfInfo().
 @dataclasses.dataclass
 class _OutputDirectoryContext:
   # Object file paths handed to BulkObjectFileAnalyzer.AnalyzePaths().
   # NOTE(review): CreateSymbols() passes [] (not None) when there are no
   # linker inputs from ninja, so the trailing note below may be stale.
   elf_object_paths: list  # Non-None only when elf_path is.
   # Linker inputs already accounted for; only used for membership tests when
   # looking for missed object paths. NOTE(review): CreateSymbols() passes a
   # set (or None) here despite the |list| annotation.
   known_inputs: list  # Non-None only when elf_path is.
   # Build output directory.
   output_directory: str
   # Archives whose members should be resolved to external .o paths.
   # NOTE(review): CreateSymbols() may pass a set or None here despite the
   # |list| annotation.
   thin_archives: list


 @dataclasses.dataclass
 class ElfInfo:
   """Facts about one ELF file: architecture, build ID, sections, file size."""
   architecture: str  # Results of ArchFromElf().
   build_id: str  # Result of BuildIdFromElf().
   section_ranges: dict  # Results of SectionInfoFromElf().
   size: int  # Result of os.path.getsize().

   def OverheadSize(self):
     """Returns the file bytes that are not attributed to any section.

     Sums the size (second item of each |section_ranges| value) of every
     section that is not listed in models.BSS_SECTIONS, and subtracts that
     total from |size|.

     Raises:
       AssertionError: If the sections add up to more than the file size.
     """
     accounted_size = 0
     for section_name, (_, section_size) in self.section_ranges.items():
       if section_name in models.BSS_SECTIONS:
         continue
       accounted_size += section_size
     overhead = self.size - accounted_size
     assert overhead >= 0, 'Negative ELF overhead {}'.format(overhead)
     return overhead


 def _CreateElfInfo(elf_path):
   """Returns an ElfInfo populated from the ELF file at |elf_path|."""
   arch = readelf.ArchFromElf(elf_path)
   build_id = readelf.BuildIdFromElf(elf_path)
   ranges = readelf.SectionInfoFromElf(elf_path)
   file_size = os.path.getsize(elf_path)
   return ElfInfo(architecture=arch,
                  build_id=build_id,
                  section_ranges=ranges,
                  size=file_size)


 def _AddSourcePathsUsingObjectPaths(ninja_source_mapper, raw_symbols):
   """Fills in |source_path| for symbols using their |object_path|.

   Args:
     ninja_source_mapper: Object providing FindSourceForPath() and
         |unmatched_paths_count|.
     raw_symbols: Symbols to update in place.

   Raises:
     AssertionError: If the mapper reports any unmatched paths afterwards.
   """
   logging.info('Looking up source paths from ninja files')
   for sym in raw_symbols:
     # Native symbols and pak symbols use object paths.
     path = sym.object_path
     if not path:
       continue
     # We don't have source info for prebuilt .a files (absolute paths, or
     # paths that start with '..').
     if os.path.isabs(path) or path.startswith('..'):
       continue
     sym.source_path = ninja_source_mapper.FindSourceForPath(path)

   assert ninja_source_mapper.unmatched_paths_count == 0, (
       'One or more source file paths could not be found. Likely caused by '
       '.ninja files being generated at a different time than the .map file.')


 def _AddSourcePathsUsingAddress(dwarf_source_mapper, raw_symbols):
   """Fills in |source_path| for .text symbols by querying dwarfdump data.

   Args:
     dwarf_source_mapper: Object providing FindSourceForTextAddress().
     raw_symbols: Symbols to update in place. Only those whose section is
         models.SECTION_TEXT are queried.

   Raises:
     AssertionError: If 20% or more of the queries found no source path.
   """
   logging.debug('Looking up source paths from dwarfdump')
   num_queries = 0
   num_matches = 0
   for sym in raw_symbols:
     if sym.section_name == models.SECTION_TEXT:
       num_queries += 1
       found_path = dwarf_source_mapper.FindSourceForTextAddress(sym.address)
       if found_path:
         num_matches += 1
         sym.source_path = found_path
   logging.info('dwarfdump found paths for %d of %d .text symbols.', num_matches,
                num_queries)
   if num_queries <= 0:
     return
   # Majority of unmatched queries are for assembly source files (ex libav1d)
   # and v8 builtins.
   unmatched_ratio = (num_queries - num_matches) / num_queries
   assert unmatched_ratio < 0.2, (
       'Percentage of failing |dwarf_source_mapper| queries ' +
       '({}%) >= 20% '.format(unmatched_ratio * 100) +
       'FindSourceForTextAddress() likely has a bug.')


 def _ConnectNmAliases(raw_symbols):
   """Ensures |aliases| is set correctly for all symbols.

   Walks |raw_symbols| pairwise and, when a symbol shares its (non-zero)
   address with the symbol just before it, puts both into one shared |aliases|
   list. Only adjacent symbols are compared, so this presumably relies on
   |raw_symbols| being ordered by address.

   Args:
     raw_symbols: Iterable of symbols, updated in place. May be empty.
   """
   symbols = iter(raw_symbols)
   prev_sym = next(symbols, None)
   if prev_sym is None:
     # No symbols, so there is nothing to connect.
     return
   for sym in symbols:
     should_merge = (
         # Don't merge bss symbols.
         sym.address > 0 and prev_sym.address == sym.address
         # Don't merge padding-only symbols (** symbol gaps).
         and prev_sym.size > 0
         # Don't merge if already merged.
         and (prev_sym.aliases is None or prev_sym.aliases is not sym.aliases))
     if should_merge:
       if prev_sym.aliases:
         prev_sym.aliases.append(sym)
       else:
         prev_sym.aliases = [prev_sym, sym]
       sym.aliases = prev_sym.aliases
     prev_sym = sym


 def _AssignNmAliasPathsAndCreatePathAliases(raw_symbols, object_paths_by_name):
   """Assigns nm-derived object paths and adds one alias per extra path.

   For each symbol whose |full_name| is a key of |object_paths_by_name|, sets
   |object_path| to the first listed path and creates an additional alias
   symbol for every remaining path.

   Args:
     raw_symbols: List of symbols. Existing symbols are updated in place.
     object_paths_by_name: Dict of full_name -> list of object paths. The
         lists may be appended to and re-sorted in place when a symbol's
         existing |object_path| is not among them.

   Returns:
     A new list containing every input symbol, each followed by the alias
     symbols created for it.
   """
   num_found_paths = 0
   num_unknown_names = 0
   num_path_mismatches = 0
   num_aliases_created = 0
   ret = []
   for symbol in raw_symbols:
     # Every input symbol is kept, whether or not a path is found for it.
     ret.append(symbol)
     full_name = symbol.full_name
     # '__typeid_' symbols appear in linker .map only, and not nm output.
     if full_name.startswith('__typeid_'):
       if object_paths_by_name.get(full_name):
         logging.warning('Found unexpected __typeid_ symbol in nm output: %s',
                         full_name)
       continue

     # Don't skip if symbol.IsBss(). This is needed for LLD-LTO to work, since
     # .bss object_path data are unavailable for linker_map_parser, and need to
     # be extracted here. For regular LLD flow, incorrect aliased symbols can
     # arise. But that's a lesser evil compared to having LLD-LTO .bss missing
     # object_path and source_path.
     # TODO(huangs): Fix aliased symbols for the LLD case.
     if (symbol.IsStringLiteral() or not full_name or full_name[0] in '*.'
         or  # e.g. ** merge symbols, .Lswitch.table
         full_name == 'startup'):
       continue

     object_paths = object_paths_by_name.get(full_name)
     if object_paths:
       num_found_paths += 1
     else:
       # Happens a lot with code that has LTO enabled (linker creates symbols).
       num_unknown_names += 1
       continue

     if symbol.object_path and symbol.object_path not in object_paths:
       # Only the first few mismatches are logged, to limit log spam.
       if num_path_mismatches < 10:
         logging.warning('Symbol path reported by .map not found by nm.')
         logging.warning('sym=%r', symbol)
         logging.warning('paths=%r', object_paths)
       # This mutates the list stored in |object_paths_by_name|, so the added
       # path is also seen by later symbols with the same name.
       object_paths.append(symbol.object_path)
       object_paths.sort()
       num_path_mismatches += 1

     symbol.object_path = object_paths[0]

     if len(object_paths) > 1:
       # Create one symbol for each object_path.
       # All of them share one |aliases| list, which starts with |symbol|'s
       # existing aliases when it has any.
       aliases = symbol.aliases or [symbol]
       symbol.aliases = aliases
       num_aliases_created += len(object_paths) - 1
       for object_path in object_paths[1:]:
         new_sym = models.Symbol(symbol.section_name,
                                 symbol.size,
                                 address=symbol.address,
                                 full_name=full_name,
                                 object_path=object_path,
                                 aliases=aliases)
         aliases.append(new_sym)
         ret.append(new_sym)

   logging.debug(
       'Cross-referenced %d symbols with nm output. '
       'num_unknown_names=%d num_path_mismatches=%d '
       'num_aliases_created=%d', num_found_paths, num_unknown_names,
       num_path_mismatches, num_aliases_created)
   # Currently: num_unknown_names=1246 out of 591206 (0.2%).
   # NOTE(review): with min(), the threshold can never exceed 20, so the sample
   # figures above would trigger this warning. Confirm whether max() was
   # intended.
   if num_unknown_names > min(20, len(raw_symbols) * 0.01):
     logging.warning(
         'Abnormal number of symbols not found in .o files (%d of %d)',
         num_unknown_names, len(raw_symbols))
   return ret


 def _DiscoverMissedObjectPaths(raw_symbols, known_inputs):
   """Returns the set of symbol input paths that are not in |known_inputs|.

   Missing object paths are caused by .a files added by -l flags, which are not
   listed as explicit inputs within .ninja rules.

   Args:
     raw_symbols: Symbols whose |object_path| values are inspected.
     known_inputs: Container of paths already known; used for "in" checks.
   """

   def _InputPathOf(object_path):
     # Convert foo/bar.a(baz.o) -> foo/bar.a
     if object_path.endswith(')'):
       return object_path[:object_path.rindex('(')]
     return object_path

   input_paths = (_InputPathOf(s.object_path) for s in raw_symbols)
   return {p for p in input_paths if p and p not in known_inputs}


 def _CreateMergeStringsReplacements(merge_string_syms,
                                     list_of_positions_by_object_path):
   """Creates replacement symbols for |merge_string_syms|.

   Args:
     merge_string_syms: Symbols to be replaced by string literal symbols.
     list_of_positions_by_object_path: One dict per entry of
         |merge_string_syms| (same length, same order), mapping object_path to
         an iterable of (offset, size) pairs. Offsets are added to the
         corresponding merge symbol's address.

   Returns:
     A list parallel to |merge_string_syms|. Each entry is the list of string
     literal symbols for that merge symbol, ordered by address, with identical
     ranges linked as aliases, fully covered strings removed, and partial
     overlaps trimmed.
   """
   ret = []
   STRING_LITERAL_NAME = models.STRING_LITERAL_NAME
   assert len(merge_string_syms) == len(list_of_positions_by_object_path)
   tups = zip(merge_string_syms, list_of_positions_by_object_path)
   for merge_sym, positions_by_object_path in tups:
     merge_sym_address = merge_sym.address
     new_symbols = []
     ret.append(new_symbols)
     for object_path, positions in positions_by_object_path.items():
       for offset, size in positions:
         address = merge_sym_address + offset
         symbol = models.Symbol(models.SECTION_RODATA,
                                size,
                                address=address,
                                full_name=STRING_LITERAL_NAME,
                                object_path=object_path)
         new_symbols.append(symbol)

   logging.debug('Created %d string literal symbols', sum(len(x) for x in ret))
   logging.debug('Sorting string literals')
   for symbols in ret:
     # For de-duping & alias creation, order by address & size.
     # For alias symbol ordering, sort by object_path.
     # Negating the size puts the largest string first at a given address.
     symbols.sort(key=lambda x: (x.address, -x.size, x.object_path))

   logging.debug('Deduping string literals')
   num_removed = 0
   size_removed = 0
   num_aliases = 0
   for i, symbols in enumerate(ret):
     if not symbols:
       continue
     prev_symbol = symbols[0]
     new_symbols = [prev_symbol]
     for symbol in symbols[1:]:
       # Negative padding means |symbol| starts before |prev_symbol| ends.
       padding = symbol.address - prev_symbol.end_address
       if (prev_symbol.address == symbol.address
           and prev_symbol.size == symbol.size):
         # String is an alias.
         num_aliases += 1
         aliases = prev_symbol.aliases
         if aliases:
           aliases.append(symbol)
           symbol.aliases = aliases
         else:
           aliases = [prev_symbol, symbol]
           prev_symbol.aliases = aliases
           symbol.aliases = aliases
       elif padding + symbol.size <= 0:
         # String is a substring of prior one.
         # It is dropped, and |prev_symbol| is left unchanged.
         num_removed += 1
         size_removed += symbol.size
         continue
       elif padding < 0:
         # String overlaps previous one. Adjust to not overlap.
         # |padding| is negative here: the start moves forward and the size
         # shrinks by the same amount.
         symbol.address -= padding
         symbol.size += padding
       new_symbols.append(symbol)
       prev_symbol = symbol
     ret[i] = new_symbols

   logging.debug(
       'Removed %d overlapping string literals (%d bytes) & created %d aliases',
       num_removed, size_removed, num_aliases)
   return ret


 def _AddOutlinedSymbolCountsFromNm(raw_symbols, names_by_address):
   """Copies nm's counted outlined-function names onto matching symbols.

   linker_map_parser extracts '** outlined function' without knowing how many
   such symbols exist at each address. nm has this information, and stores the
   value as, e.g., '** outlined function * 5'. Copy the information over.

   Args:
     raw_symbols: Symbols to rename in place.
     names_by_address: Dict of address -> list of names reported by nm.
   """
   logging.debug('Update symbol names')
   prefix = '** outlined function'
   for sym in raw_symbols:
     if not sym.full_name.startswith(prefix):
       continue
     nm_names = names_by_address.get(sym.address) or ()
     # Take the first nm name at this address that is an outlined function.
     counted_name = next((n for n in nm_names if n.startswith(prefix)), None)
     if counted_name is not None:
       sym.full_name = counted_name


 def _AddNmAliases(raw_symbols, names_by_address):
   """Adds symbols that were removed by identical code folding.

   Args:
     raw_symbols: List of symbols; not modified.
     names_by_address: Dict of address -> list of names reported by nm.

   Returns:
     A new list in which every symbol whose name appears in the nm name list
     for its address is replaced by one new symbol per name in that list.
   """
   # Step 1: Create list of (index_of_symbol, name_list).
   logging.debug('Creating alias list')
   replacements = []
   num_new_symbols = 0
   num_missing = 0
   missing_names = collections.defaultdict(list)
   for idx, sym in enumerate(raw_symbols):
     # Don't alias padding-only symbols (e.g. ** symbol gap)
     if sym.size_without_padding == 0:
       continue
     # Also skip artificial symbols that won't appear in nm output.
     if sym.full_name.startswith('** CFI jump table'):
       continue
     nm_names = names_by_address.get(sym.address)
     if not nm_names:
       continue
     if sym.full_name in nm_names:
       replacements.append((idx, nm_names))
       num_new_symbols += len(nm_names) - 1
       continue
     num_missing += 1
     missing_names[sym.full_name].append(sym.address)
     # Sometimes happens for symbols from assembly files.
     if num_missing < 10:
       logging.debug('Name missing from aliases: %s %s (addr=%x)',
                     sym.full_name, nm_names, sym.address)

   # Report where nm actually places each name that was missing above.
   if missing_names and logging.getLogger().isEnabledFor(logging.INFO):
     for address, names in names_by_address.items():
       for name in names:
         if name not in missing_names:
           continue
         expected = ','.join('%x' % a for a in missing_names[name])
         logging.info('Missing name %s is at address %x instead of [%s]' %
                      (name, address, expected))

   num_symbols = len(raw_symbols)
   # The ratio check is skipped for small files (fewer than 1000 symbols).
   if num_symbols >= 1000 and num_new_symbols / num_symbols < .05:
     logging.warning(
         'Number of aliases is oddly low (%.0f%%). It should '
         'usually be around 25%%.', num_new_symbols / num_symbols * 100)

   # Step 2: Create new symbols as siblings to each existing one.
   logging.debug('Creating %d new symbols from nm output', num_new_symbols)
   expected_num_symbols = num_symbols + num_new_symbols
   ret = []
   next_unconsumed = 0
   for idx, nm_names in replacements:
     # Carry over the untouched symbols that precede this replacement.
     ret.extend(raw_symbols[next_unconsumed:idx])
     next_unconsumed = idx + 1
     replaced = raw_symbols[idx]
     # Create symbols (|replaced| gets recreated and discarded).
     # Do not set |aliases| in order to avoid being pruned by
     # CompactLargeAliasesIntoSharedSymbols(), which assumes aliases differ
     # only by path. The field will be set afterwards by _ConnectNmAliases().
     ret.extend(
         models.Symbol(replaced.section_name,
                       replaced.size,
                       address=replaced.address,
                       full_name=name) for name in nm_names)
   ret.extend(raw_symbols[next_unconsumed:])
   assert expected_num_symbols == len(ret)
   return ret


 def _ResolveThinArchivePaths(raw_symbols, thin_archives):
   """Converts object_paths for thin archives to external .o paths.

   Args:
     raw_symbols: Symbols to update in place. Only object paths of the form
         "archive(member)" whose archive is in |thin_archives| are rewritten.
     thin_archives: Container of archive paths; used for "in" checks.
   """
   for sym in raw_symbols:
     path = sym.object_path
     if not path.endswith(')'):
       continue
     paren_idx = path.rindex('(')
     archive_path = path[:paren_idx]
     if archive_path not in thin_archives:
       continue
     member_path = path[paren_idx + 1:-1]
     sym.object_path = ar.CreateThinObjectPath(archive_path, member_path)


 def _DeduceObjectPathForSwitchTables(raw_symbols, object_paths_by_name):
   """Assigns object paths to 'Switch table for <name>' symbols.

   The path is looked up in |object_paths_by_name| under <name>, after removing
   any trailing ' (.N)' suffix.

   Args:
     raw_symbols: Symbols to update in place.
     object_paths_by_name: Dict of symbol name -> list of object paths.

   Raises:
     AssertionError: If a switch table already has an object_path that is not
         listed for its name.
   """
   prefix = 'Switch table for '
   num_suffix_regexp = re.compile(r'\s+\(\.\d+\)$')
   num_switch_tables = 0
   num_unassigned = 0
   num_deduced = 0
   num_arbitrations = 0
   for sym in raw_symbols:
     if not sym.full_name.startswith(prefix):
       continue
     num_switch_tables += 1
     # Strip the prefix, then any suffix such as ' (.123)'.
     target_name = num_suffix_regexp.sub('', sym.full_name[len(prefix):])
     candidates = object_paths_by_name.get(target_name, None)
     if sym.object_path:
       # A path that is already set must agree with the candidates.
       assert candidates and sym.object_path in candidates
       continue
     if candidates is None:
       num_unassigned += 1
       continue
     num_deduced += 1
     # If ambiguity arises, arbitrate by taking the first.
     sym.object_path = candidates[0]
     if len(candidates) > 1:
       num_arbitrations += 1

   if num_switch_tables > 0:
     logging.info(
         'Found %d switch tables: Deduced %d object paths with ' +
         '%d arbitrations. %d remain unassigned.', num_switch_tables,
         num_deduced, num_arbitrations, num_unassigned)


 def _ParseElfInfo(native_spec, outdir_context=None):
   """Adds ELF section ranges and symbols.

   Symbols come from the linker map when |native_spec.map_path| is set, and
   from nm on the ELF otherwise. When an ELF is available, nm output is also
   used to add symbols that share an address with an existing one.

   Args:
     native_spec: Provides |elf_path|, |map_path| and |track_string_literals|.
         At least one of |elf_path| / |map_path| must be set.
     outdir_context: Optional _OutputDirectoryContext. When given together with
         both paths, object files are analyzed to assign object paths and to
         create per-path aliases.

   Returns:
     A tuple of (section_ranges, raw_symbols, object_paths_by_name), where
     section_ranges comes from the ELF when |elf_path| is set and from the
     linker map otherwise, and object_paths_by_name is empty unless object
     files were analyzed.

   Raises:
     AssertionError: If neither path is given, or string literal tracking is
         requested without a map file.
     SystemExit: If the ELF and the linker map disagree on section sizes.
   """
   assert native_spec.map_path or native_spec.elf_path, (
       'Need a linker map or an ELF file.')
   assert native_spec.map_path or not native_spec.track_string_literals, (
       'track_string_literals not yet implemented without map file')
   if native_spec.elf_path:
     elf_section_ranges = readelf.SectionInfoFromElf(native_spec.elf_path)

     # Run nm on the elf file to retrieve the list of symbol names per-address.
     # This list is required because the .map file contains only a single name
     # for each address, yet multiple symbols are often coalesced when they are
     # identical. This coalescing happens mainly for small symbols and for C++
     # templates. Such symbols make up ~500kb of libchrome.so on Android.
     elf_nm_result = nm.CollectAliasesByAddressAsync(native_spec.elf_path)

     # Run nm on all .o/.a files to retrieve the symbol names within them.
     # The list is used to detect when multiple .o files contain the same symbol
     # (e.g. inline functions), and to update the object_path / source_path
     # fields accordingly.
     # Looking in object files is required because the .map file chooses a
     # single path for these symbols.
     # Rather than record all paths for each symbol, set the paths to be the
     # common ancestor of all paths.
     if outdir_context and native_spec.map_path:
       bulk_analyzer = obj_analyzer.BulkObjectFileAnalyzer(
           outdir_context.output_directory,
           track_string_literals=native_spec.track_string_literals)
       bulk_analyzer.AnalyzePaths(outdir_context.elf_object_paths)

   if native_spec.map_path:
     logging.info('Parsing Linker Map')
     map_section_ranges, raw_symbols, linker_map_extras = (
         linker_map_parser.ParseFile(native_spec.map_path))

     if outdir_context and outdir_context.thin_archives:
       _ResolveThinArchivePaths(raw_symbols, outdir_context.thin_archives)
   else:
     logging.info('Collecting symbols from nm')
     raw_symbols = nm.CreateUniqueSymbols(native_spec.elf_path,
                                          elf_section_ranges)

   if native_spec.elf_path and native_spec.map_path:
     logging.debug('Validating section sizes')
     differing_elf_section_sizes = {}
     differing_map_section_sizes = {}
     for k, (_, elf_size) in elf_section_ranges.items():
       if k in _SECTION_SIZE_BLOCKLIST:
         continue
       # A section that is absent from the .map is reported as a mismatch
       # (map size None) instead of failing to unpack None.
       _, map_size = map_section_ranges.get(k, (None, None))
       if map_size != elf_size:
         differing_map_section_sizes[k] = map_size
         differing_elf_section_sizes[k] = elf_size
     if differing_map_section_sizes:
       logging.error('ELF file and .map file do not agree on section sizes.')
       logging.error('readelf: %r', differing_elf_section_sizes)
       logging.error('.map file: %r', differing_map_section_sizes)
       sys.exit(1)

   if native_spec.elf_path and native_spec.map_path and outdir_context:
     # |bulk_analyzer| was created above under the same three conditions.
     missed_object_paths = _DiscoverMissedObjectPaths(
         raw_symbols, outdir_context.known_inputs)
     missed_object_paths = ar.ExpandThinArchives(
         missed_object_paths, outdir_context.output_directory)[0]
     bulk_analyzer.AnalyzePaths(missed_object_paths)
     bulk_analyzer.SortPaths()
     if native_spec.track_string_literals:
       merge_string_syms = [
           s for s in raw_symbols if s.full_name == '** merge strings'
           or s.full_name == '** lld merge strings'
       ]
       # More likely for there to be a bug in supersize than an ELF to not have a
       # single string literal.
       assert merge_string_syms
       string_ranges = [(s.address, s.size) for s in merge_string_syms]
       bulk_analyzer.AnalyzeStringLiterals(native_spec.elf_path, string_ranges)

   # Map file for some reason doesn't demangle all names.
   # Demangle prints its own log statement.
   demangle.DemangleRemainingSymbols(raw_symbols)

   object_paths_by_name = {}
   if native_spec.elf_path:
     logging.info(
         'Adding symbols removed by identical code folding (as reported by nm)')
     # This normally does not block (it's finished by this time).
     names_by_address = elf_nm_result.get()
     if native_spec.map_path:
       # This rewrites outlined symbols from |map_path|, and can be skipped if
       # symbols already came from nm (e.g., for dwarf mode).
       _AddOutlinedSymbolCountsFromNm(raw_symbols, names_by_address)

     raw_symbols = _AddNmAliases(raw_symbols, names_by_address)

     if native_spec.map_path and outdir_context:
       object_paths_by_name = bulk_analyzer.GetSymbolNames()
       logging.debug(
           'Fetched path information for %d symbols from %d files',
           len(object_paths_by_name),
           len(outdir_context.elf_object_paths) + len(missed_object_paths))
       _DeduceObjectPathForSwitchTables(raw_symbols, object_paths_by_name)
       # For aliases, this provides path information where there wasn't any.
       logging.info('Creating aliases for symbols shared by multiple paths')
       raw_symbols = _AssignNmAliasPathsAndCreatePathAliases(
           raw_symbols, object_paths_by_name)

       if native_spec.track_string_literals:
         logging.info('Waiting for string literal extraction to complete.')
         list_of_positions_by_object_path = bulk_analyzer.GetStringPositions()
       bulk_analyzer.Close()

       if native_spec.track_string_literals:
         logging.info('Deconstructing ** merge strings into literals')
         replacements = _CreateMergeStringsReplacements(
             merge_string_syms, list_of_positions_by_object_path)
         for merge_sym, literal_syms in zip(merge_string_syms, replacements):
           # Don't replace if no literals were found.
           if literal_syms:
             # Re-find the symbols since aliases cause their indices to change.
             idx = raw_symbols.index(merge_sym)
             # This assignment is a bit slow (causes array to be shifted), but
             # is fast enough since len(merge_string_syms) < 10.
             raw_symbols[idx:idx + 1] = literal_syms

   if native_spec.map_path:
     linker_map_parser.DeduceObjectPathsFromThinMap(raw_symbols,
                                                    linker_map_extras)

   if native_spec.elf_path and native_spec.track_string_literals:
     sym_and_string_literals = string_extract.ReadStringLiterals(
         raw_symbols, native_spec.elf_path)
     for sym, data in sym_and_string_literals:
       sym.full_name = string_extract.GetNameOfStringLiteralBytes(data)

   # If we have an ELF file, use its ranges as the source of truth, since some
   # sections can differ from the .map.
   return (elf_section_ranges if native_spec.elf_path else map_section_ranges,
           raw_symbols, object_paths_by_name)


 def _AddUnattributedSectionSymbols(raw_symbols, section_ranges, source_path):
   """Creates symbols for section bytes that no existing symbol covers.

   Two kinds of symbols are created: a "** <section> (unattributed)" symbol for
   the gap between a section's last symbol and the section's end, and a
   "** ELF Section: <section>" symbol for each section with no symbols at all.

   Args:
     raw_symbols: Existing symbols. They are grouped with itertools.groupby(),
         so this presumably expects symbols of one section to be contiguous.
     section_ranges: Dict of section name -> (address, size). Passed to
         archive_util.ExtendSectionRange() for models.SECTION_OTHER.
     source_path: source_path to set on every created symbol.

   Returns:
     A tuple (symbols, other_symbols). |symbols| holds the whole-section
     symbols first, followed by |raw_symbols| with each section's gap symbol
     appended after that section's group. |other_symbols| holds the
     models.SECTION_OTHER symbols.
   """
   # Create symbols for ELF sections not covered by existing symbols.
   logging.info('Searching for symbol gaps...')
   new_syms_by_section = collections.defaultdict(list)
   seen_sections = set()

   for section_name, group in itertools.groupby(
       raw_symbols, lambda s: s.section_name):
     seen_sections.add(section_name)
     # Get the last symbol in the group.
     sym = None  # Needed for pylint.
     for sym in group:
       pass
     end_address = sym.end_address  # pylint: disable=undefined-loop-variable
     # Bytes between the section start and the end of its last symbol.
     size_from_syms = end_address - section_ranges[section_name][0]
     overhead = section_ranges[section_name][1] - size_from_syms
     assert overhead >= 0, (
         'Last symbol (%s) ends %d bytes after section boundary (%x)' %
         (sym, -overhead, sum(section_ranges[section_name])))
     if overhead > 0 and section_name not in models.BSS_SECTIONS:
       new_syms_by_section[section_name].append(
           models.Symbol(section_name,
                         overhead,
                         address=end_address,
                         full_name='** {} (unattributed)'.format(section_name),
                         source_path=source_path))
       logging.info('Last symbol in %s does not reach end of section, gap=%d',
                    section_name, overhead)

   # Sections that should not bundle into ".other".
   unsummed_sections, summed_sections = models.ClassifySections(
       section_ranges.keys())
   ret = []
   other_symbols = []
   # Iterate over a snapshot of the items, since |section_ranges| is handed to
   # ExtendSectionRange() inside the loop (which presumably modifies it).
   # NOTE(review): a previous comment here said the keys are sorted, but no
   # sorting takes place; iteration follows the dict's own order.
   for section_name, (_, section_size) in list(section_ranges.items()):
     if section_name in seen_sections:
       continue
     # Handle sections that don't appear in |raw_symbols|.
     if (section_name not in unsummed_sections
         and section_name not in summed_sections):
       other_symbols.append(
           models.Symbol(models.SECTION_OTHER,
                         section_size,
                         full_name='** ELF Section: {}'.format(section_name),
                         source_path=source_path))
       archive_util.ExtendSectionRange(section_ranges, models.SECTION_OTHER,
                                       section_size)
     else:
       ret.append(
           models.Symbol(section_name,
                         section_size,
                         full_name='** ELF Section: {}'.format(section_name),
                         source_path=source_path))
   other_symbols.sort(key=lambda s: (s.address, s.full_name))

   # TODO(agrieve): It would probably simplify things to use a dict of
   #     section_name->raw_symbols while creating symbols.
   # Merge |new_syms_by_section| into |raw_symbols| while maintaining ordering.
   for section_name, group in itertools.groupby(
       raw_symbols, lambda s: s.section_name):
     ret.extend(group)
     ret.extend(new_syms_by_section[section_name])
   return ret, other_symbols


 def _ParseNinjaFiles(output_directory, elf_path=None):
   """Parses .ninja files to find source mappings and linker inputs.

   Args:
     output_directory: Build output directory containing the .ninja files.
     elf_path: Optional path of the ELF whose link command should be located.
         If a "__combined.so" sibling exists, it is looked up instead.

   Returns:
     A tuple of (source_mapper, ninja_elf_object_paths) as returned by
     ninja_parser.Parse().

   Raises:
     AssertionError: If |elf_path| is given but no linker inputs were found.
   """
   linker_elf_path = elf_path
   if elf_path:
     # For partitioned libraries, the actual link command outputs __combined.so.
     # Only the last '.so' is rewritten, so that a '.so' occurring earlier in
     # the path (e.g. within a directory name) is left untouched.
     stem, extension, remainder = elf_path.rpartition('.so')
     if extension:
       partitioned_elf_path = stem + '__combined.so' + remainder
       if os.path.exists(partitioned_elf_path):
         linker_elf_path = partitioned_elf_path

   logging.info('Parsing ninja files, looking for %s.',
                (linker_elf_path or 'source mapping only (elf_path=None)'))

   source_mapper, ninja_elf_object_paths = ninja_parser.Parse(
       output_directory, linker_elf_path)

   logging.debug('Parsed %d .ninja files. Linker inputs=%d',
                 source_mapper.parsed_file_count,
                 len(ninja_elf_object_paths or []))
   if elf_path:
     assert ninja_elf_object_paths, (
         'Failed to find link command in ninja files for ' +
         os.path.relpath(linker_elf_path, output_directory))

   return source_mapper, ninja_elf_object_paths


 def _ElfInfoFromApk(apk_path, apk_so_path):
   """Returns an ElfInfo for the |apk_so_path| entry inside |apk_path|.

   The entry is extracted with zip_util.UnzipToTemp() and inspected while the
   extracted copy is available.
   """
   extraction = zip_util.UnzipToTemp(apk_path, apk_so_path)
   with extraction as extracted_path:
     return _CreateElfInfo(extracted_path)


 def _CountRelocationsFromElf(elf_path):
   """Returns the total number of relocation entries in |elf_path|.

   Runs readelf with -r and adds up the entry counts from every
   "Relocation section ... contains N entries" line of its output.

   Raises:
     subprocess.CalledProcessError: If readelf exits with a non-zero status.
   """
   args = [path_util.GetReadElfPath(), '-r', elf_path]
   stdout = subprocess.check_output(args).decode('ascii')
   # A raw string is needed here: '\d' is an invalid escape sequence in a
   # regular string literal.
   relocations = re.findall(
       r'Relocation section .* at offset .* contains (\d+) entries', stdout)
   return sum(int(count) for count in relocations)


 def _FindToolchainSubdirs(output_directory):
   """Returns names of entries in |output_directory| with a toolchain.ninja.

   Names are relative to |output_directory| and follow os.listdir() order.
   """
   subdirs = []
   for entry in os.listdir(output_directory):
     ninja_path = os.path.join(output_directory, entry, 'toolchain.ninja')
     if os.path.exists(ninja_path):
       subdirs.append(entry)
   return subdirs


 def CreateMetadata(*, native_spec, elf_info, shorten_path):
   """Returns metadata for the given native_spec / elf_info.

   Args:
     native_spec: Provides |algorithm|, |apk_so_path|, |elf_path| and
         |map_path|.
     elf_info: ElfInfo supplying architecture and build ID, or None.
     shorten_path: Callable applied to the ELF and map paths before they are
         recorded.

   Returns:
     A dict keyed by models.METADATA_* constants. Entries are present only for
     the inputs that are set.
   """
   logging.debug('Constructing native metadata')
   native_metadata = {}
   native_metadata[models.METADATA_ELF_ALGORITHM] = native_spec.algorithm

   if elf_info:
     native_metadata[models.METADATA_ELF_ARCHITECTURE] = elf_info.architecture
     native_metadata[models.METADATA_ELF_BUILD_ID] = elf_info.build_id

   if native_spec.apk_so_path:
     native_metadata[models.METADATA_ELF_APK_PATH] = native_spec.apk_so_path

   if native_spec.elf_path:
     native_metadata[models.METADATA_ELF_FILENAME] = shorten_path(
         native_spec.elf_path)
     # datetime.utcfromtimestamp() is deprecated as of Python 3.12. A
     # timezone-aware UTC datetime yields the same whole-second timestamp.
     timestamp_obj = datetime.datetime.fromtimestamp(
         os.path.getmtime(native_spec.elf_path), tz=datetime.timezone.utc)
     timestamp = calendar.timegm(timestamp_obj.utctimetuple())
     native_metadata[models.METADATA_ELF_MTIME] = timestamp

   if native_spec.map_path:
     native_metadata[models.METADATA_MAP_FILENAME] = shorten_path(
         native_spec.map_path)
   return native_metadata


 def CreateSymbols(*,
                   apk_spec,
                   native_spec,
                   output_directory=None,
                   pak_id_map=None):
   """Creates native symbols for the given native_spec.

   Args:
     apk_spec: Instance of ApkSpec, or None.
     native_spec: Instance of NativeSpec.
     output_directory: Build output directory. If None, source_paths and symbol
         alias information will not be recorded.
     pak_id_map: Instance of PakIdMap.

   Returns:
     A tuple of (section_ranges, raw_symbols, elf_info, metrics_by_file), where
     metrics_by_file is a dict from file name to a dict of {metric_name: value}.
   """
   apk_elf_info_result = None
   if apk_spec and native_spec.apk_so_path:
     # Extraction takes around 1 second, so do it in parallel.
     apk_elf_info_result = parallel.ForkAndCall(
         _ElfInfoFromApk, (apk_spec.apk_path, native_spec.apk_so_path))

   raw_symbols = []
   ninja_source_mapper = None
   dwarf_source_mapper = None
   section_ranges = {}
   ninja_elf_object_paths = None
   metrics_by_file = {}
   if output_directory and native_spec.map_path:
     # Finds all objects passed to the linker and creates a map of .o -> .cc.
     ninja_source_mapper, ninja_elf_object_paths = _ParseNinjaFiles(
         output_directory, native_spec.elf_path)
   elif native_spec.elf_path:
     logging.info('Parsing source path info via dwarfdump')
     dwarf_source_mapper = dwarfdump.CreateAddressSourceMapper(
         native_spec.elf_path)
     logging.info('Found %d source paths across %s ranges',
                  dwarf_source_mapper.NumberOfPaths(),
                  dwarf_source_mapper.num_ranges)

   # Start by finding elf_object_paths so that nm can run on them while the
   # linker .map is being parsed.
   if ninja_elf_object_paths:
     elf_object_paths, thin_archives = ar.ExpandThinArchives(
         ninja_elf_object_paths, output_directory)
     known_inputs = set(elf_object_paths)
     known_inputs.update(ninja_elf_object_paths)
   else:
     elf_object_paths = []
     known_inputs = None
     # When we don't know which elf file is used, just search all paths.
     # TODO(agrieve): Seems to be used only for tests. Remove?
     if ninja_source_mapper:
       thin_archives = set(
           p for p in ninja_source_mapper.IterAllPaths() if p.endswith('.a')
           and ar.IsThinArchive(os.path.join(output_directory, p)))
     else:
       thin_archives = None

   if output_directory:
     toolchain_subdirs = _FindToolchainSubdirs(output_directory)
     outdir_context = _OutputDirectoryContext(elf_object_paths=elf_object_paths,
                                              known_inputs=known_inputs,
                                              output_directory=output_directory,
                                              thin_archives=thin_archives)
   else:
     toolchain_subdirs = None
     outdir_context = None

   object_paths_by_name = None
   if native_spec.elf_path or native_spec.map_path:
     section_ranges, raw_symbols, object_paths_by_name = _ParseElfInfo(
         native_spec, outdir_context=outdir_context)
     if pak_id_map and native_spec.map_path:
       # For trichrome, pak files are in different apks than native library,
       # so need to pass along pak_id_map separately and ensure
       # TrichromeLibrary appears first in .ssargs file.
       logging.debug('Extracting pak IDs from symbol names')
       pak_id_map.Update(object_paths_by_name, ninja_source_mapper)

   elf_info = None
   if apk_elf_info_result:
     logging.debug('Extracting section sizes from .so within .apk')
     elf_info = apk_elf_info_result.get()
     if native_spec.elf_path:
       expected_build_id = readelf.BuildIdFromElf(native_spec.elf_path)
       assert elf_info.build_id == expected_build_id, (
           'BuildID of {} != $APK/{}: {} != {}'.format(native_spec.elf_path,
                                                       native_spec.apk_so_path,
                                                       expected_build_id,
                                                       elf_info.build_id))
   elif native_spec.elf_path:
     # Strip ELF before capturing section information to avoid recording
     # debug sections.
     with tempfile.NamedTemporaryFile(
         suffix=os.path.basename(native_spec.elf_path)) as f:
       strip_path = path_util.GetStripPath()
       subprocess.run([strip_path, '-o', f.name, native_spec.elf_path],
                      check=True)
       elf_info = _CreateElfInfo(f.name)

   if elf_info:
     section_ranges = elf_info.section_ranges.copy()
     if native_spec.elf_path:
       key = posixpath.basename(native_spec.elf_path)
       metrics_by_file[key] = {
           f'{models.METRICS_SIZE}/{k}': size
           for (k, (offset, size)) in section_ranges.items()
       }
       relocations_count = _CountRelocationsFromElf(native_spec.elf_path)
       metrics_by_file[key][
           f'{models.METRICS_COUNT}/{models.METRICS_COUNT_RELOCATIONS}'] = (
               relocations_count)

   source_path = ''
   if native_spec.apk_so_path:
     # Put section symbols under $NATIVE/libfoo.so (abi)/...
     source_path = '{}/{} ({})'.format(
         models.NATIVE_PREFIX_PATH, posixpath.basename(native_spec.apk_so_path),
         elf_info.architecture)

   raw_symbols, other_symbols = _AddUnattributedSectionSymbols(
       raw_symbols, section_ranges, source_path)

   if elf_info:
     elf_overhead_size = elf_info.OverheadSize()
     elf_overhead_symbol = models.Symbol(models.SECTION_OTHER,
                                         elf_overhead_size,
                                         full_name='Overhead: ELF file',
                                         source_path=source_path)
     archive_util.ExtendSectionRange(section_ranges, models.SECTION_OTHER,
                                     elf_overhead_size)
     other_symbols.append(elf_overhead_symbol)

   # Always have .other come last.
   other_symbols.sort(key=lambda s: (s.IsOverhead(), s.full_name.startswith(
       '**'), s.address, s.full_name))

   if ninja_source_mapper:
     _AddSourcePathsUsingObjectPaths(ninja_source_mapper, raw_symbols)
   elif dwarf_source_mapper:
     _AddSourcePathsUsingAddress(dwarf_source_mapper, raw_symbols)

   raw_symbols.extend(other_symbols)

   # Path normalization must come before compacting aliases so that
   # ancestor paths do not mix generated and non-generated paths.
   archive_util.NormalizePaths(raw_symbols,
                               gen_dir_regex=native_spec.gen_dir_regex,
                               toolchain_subdirs=toolchain_subdirs)

   if native_spec.elf_path or native_spec.map_path:
     logging.info('Converting excessive aliases into shared-path symbols')
     archive_util.CompactLargeAliasesIntoSharedSymbols(
         raw_symbols, _MAX_SAME_NAME_ALIAS_COUNT)

     logging.debug('Connecting nm aliases')
     _ConnectNmAliases(raw_symbols)

   return section_ranges, raw_symbols, elf_info, metrics_by_file