| # Copyright 2018 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Utilities to extract string literals from object files. |
| |
| LookupElfRodataInfo(): |
| Runs readelf to extract and return .rodata section spec of an ELF file. |
| |
| ReadFileChunks(): |
| Reads raw data from a file, given a list of ranges in the file. |
| |
| ResolveStringPiecesIndirect(): |
| BulkForkAndCall() target: Given {path: [string addresses]} and |
| [raw_string_data for each string_section]: |
| - Reads {path: [src_strings]}. |
| - For each path, searches for src_strings in at most 1 raw_string_data over |
| each string_section. If found, translates to string_range and annotates it |
| to the string_section. |
| - Returns [{path: [string_ranges]} for each string_section]. |
| |
| ResolveStringPieces(): |
| BulkForkAndCall() target: Given {path: [strings]} and |
| [raw_string_data for each string_section]: |
  - For each path, searches for strings in at most 1 raw_string_data over
| each string_section. If found, translates to string_range and annotates it |
| to the string_section. |
| - Returns [{path: [string_ranges]} for each string_section]. |
| """ |
| |
| import ast |
| import collections |
| import itertools |
| import logging |
| import os |
| import subprocess |
| |
| import ar |
| import concurrent |
| import models |
| import path_util |
| |
| |
def LookupElfRodataInfo(elf_path, tool_prefix):
  """Returns (address, offset, size) for the .rodata section of |elf_path|.

  Args:
    elf_path: Path of the ELF file to inspect.
    tool_prefix: Prefix used to locate the readelf binary.

  Raises:
    AssertionError: If readelf reports no .rodata section.
  """
  cmd = [path_util.GetReadElfPath(tool_prefix), '-S', '--wide', elf_path]
  for line in subprocess.check_output(cmd).splitlines():
    # Example readelf -S output:
    # [Nr] Name Type Addr Off Size ES Flg Lk Inf Al
    # [07] .rodata PROGBITS 025e7000 237c000 5ec4f6 00 A 0 0 256
    if '.rodata ' not in line:
      continue
    # Parse the hex Addr / Off / Size columns that follow the section name.
    fields = line[line.index(models.SECTION_RODATA):].split()
    addr, off, size = (int(f, 16) for f in fields[2:5])
    return addr, off, size
  raise AssertionError('No .rodata for command: ' + repr(cmd))
| |
| |
def ReadFileChunks(path, section_ranges):
  """Returns a list of raw data from |path|, specified by |section_ranges|.

  Args:
    path: File to read from.
    section_ranges: List of (offset, size).

  Returns:
    A list with one raw chunk per entry of |section_ranges| (in order).
  """
  if not section_ranges:
    return []
  chunks = []
  with open(path, 'rb') as f:
    for offset, size in section_ranges:
      f.seek(offset)
      chunks.append(f.read(size))
  return chunks
| |
| |
| def _ExtractArchivePath(path): |
| # E.g. foo/bar.a(baz.o) |
| if path.endswith(')'): |
| start_idx = path.index('(') |
| return path[:start_idx] |
| return None |
| |
| |
def _LookupStringSectionPositions(target, tool_prefix, output_directory):
  """Returns a dict of object_path -> [(offset, size)...] of .rodata sections.

  Runs readelf -S over the archive (or all object files) and parses out the
  file offset and size of every .rodata.str* section.

  Args:
    target: An archive path string (e.g., "foo.a") or a list of object paths.
    tool_prefix: Prefix used to locate the readelf binary.
    output_directory: Directory the paths in |target| are relative to; readelf
        runs with this as its cwd.

  Returns:
    {object_path: [(file_offset, size), ...]}, with the 1-byte-aligned
    .rodata.str1 section (if present) placed first in each list.
  """
  # Python 2 basestring: a single string means |target| is an archive path.
  is_archive = isinstance(target, basestring)
  args = [path_util.GetReadElfPath(tool_prefix), '-S', '--wide']
  if is_archive:
    args.append(target)
  else:
    # Assign path for when len(target) == 1, (no File: line exists).
    path = target[0]
    args.extend(target)

  output = subprocess.check_output(args, cwd=output_directory)
  lines = output.splitlines()
  section_positions_by_path = {}
  cur_offsets = []
  # State machine over readelf's output: "File:" lines delimit per-object
  # sections; accumulated |cur_offsets| are flushed when a new file starts.
  # NOTE(review): relies on readelf emitting a "File:" line per input when
  # given an archive or multiple objects — confirm for the toolchain in use.
  for line in lines:
    # File: base/third_party/libevent/libevent.a(buffer.o)
    # [Nr] Name Type Addr Off Size ES Flg Lk Inf Al
    # [11] .rodata.str1.1 PROGBITS 00000000 0000b4 000004 01 AMS 0 0 1
    # [11] .rodata.str4.4 PROGBITS 00000000 0000b4 000004 01 AMS 0 0 4
    # [11] .rodata.str8.8 PROGBITS 00000000 0000b4 000004 01 AMS 0 0 8
    # [80] .rodata..L.str PROGBITS 00000000 000530 000002 00 A 0 0 1
    # The various string sections differ by alignment.
    # The presence of a wchar_t literal (L"asdf") seems to make a str4 section.
    # When multiple sections exist, nm gives us no indication as to which
    # section each string corresponds to.
    if line.startswith('File: '):
      if cur_offsets:
        section_positions_by_path[path] = cur_offsets
      cur_offsets = []
      path = line[6:]
    elif '.rodata.' in line:
      progbits_idx = line.find('PROGBITS ')
      if progbits_idx != -1:
        # Columns after PROGBITS: Addr, Off, Size (hex). Keep (Off, Size).
        fields = line[progbits_idx:].split()
        position = (int(fields[2], 16), int(fields[3], 16))
        # The heuristics in _IterStringLiterals rely on str1 coming first.
        # Last column is the alignment; '1' identifies the str1 section.
        if fields[-1] == '1':
          cur_offsets.insert(0, position)
        else:
          cur_offsets.append(position)
  # Flush the trailing file's offsets (no "File:" line follows the last one).
  if cur_offsets:
    section_positions_by_path[path] = cur_offsets
  return section_positions_by_path
| |
| |
def _ReadStringSections(target, output_directory, positions_by_path):
  """Returns a dict of object_path -> [string...] of .rodata chunks.

  Args:
    target: An archive path string (e.g., "foo.a") or a list of object paths.
    output_directory: Directory the paths in |target| are relative to.
    positions_by_path: A dict of object_path -> [(offset, size)...]
  """
  string_sections_by_path = {}
  if isinstance(target, basestring):
    # Archive: slice each member's raw bytes at the recorded positions.
    archive_path = os.path.join(output_directory, target)
    for subpath, chunk in ar.IterArchiveChunks(archive_path):
      path = '{}({})'.format(target, subpath)
      positions = positions_by_path.get(path)
      # No positions if file has no string literals.
      if not positions:
        continue
      string_sections_by_path[path] = [
          chunk[offset:offset + size] for offset, size in positions]
  else:
    for path in target:
      positions = positions_by_path.get(path)
      # We already log a warning about this in _IterStringLiterals().
      if not positions:
        continue
      string_sections_by_path[path] = ReadFileChunks(
          os.path.join(output_directory, path), positions)
  return string_sections_by_path
| |
| |
| def _IterStringLiterals(path, addresses, obj_sections): |
| """Yields all string literals (including \0) for the given object path. |
| |
| Args: |
| path: Object file path. |
| addresses: List of string offsets encoded as hex strings. |
| obj_sections: List of contents of .rodata.str sections read from the given |
| object file. |
| """ |
| |
| next_offsets = sorted(int(a, 16) for a in addresses) |
| if not obj_sections: |
| # Happens when there is an address for a symbol which is not actually a |
| # string literal, or when string_sections_by_path is missing an entry. |
| logging.warning('Object has %d strings but no string sections: %s', |
| len(addresses), path) |
| return |
| for section_data in obj_sections: |
| cur_offsets = next_offsets |
| # Always assume first element is 0. I'm not entirely sure why this is |
| # necessary, but strings get missed without it. |
| next_offsets = [0] |
| prev_offset = 0 |
| # TODO(agrieve): Switch to using nm --print-size in order to capture the |
| # address+size of each string rather than just the address. |
| for offset in cur_offsets[1:]: |
| if offset >= len(section_data): |
| # Remaining offsets are for next section. |
| next_offsets.append(offset) |
| continue |
| # Figure out which offsets apply to this section via heuristic of them |
| # all ending with a null character. |
| if offset == prev_offset or section_data[offset - 1] != '\0': |
| next_offsets.append(offset) |
| continue |
| yield section_data[prev_offset:offset] |
| prev_offset = offset |
| |
| if prev_offset < len(section_data): |
| yield section_data[prev_offset:] |
| |
| |
| def _AnnotateStringData(string_data, path_value_gen): |
| """Annotates each |string_data| section data with paths and ranges. |
| |
| Args: |
| string_data: [raw_string_data for each string_section] from an ELF file. |
| path_value_gen: A generator of (path, value) pairs, where |path| |
| is the path to an object file and |value| is a string to annotate. |
| |
| Returns: |
| [{path: [string_ranges]} for each string_section]. |
| """ |
| ret = [collections.defaultdict(list) for _ in string_data] |
| |
| # Brute-force search ** merge strings sections in |string_data| for string |
| # values from |path_value_gen|. This is by far the slowest part of |
| # AnalyzeStringLiterals(). |
| # TODO(agrieve): Pre-process |string_data| into a dict of literal->address (at |
| # least for ASCII strings). |
| for path, value in path_value_gen: |
| first_match = -1 |
| first_match_dict = None |
| for target_dict, data in itertools.izip(ret, string_data): |
| # Set offset so that it will be 0 when len(value) is added to it below. |
| offset = -len(value) |
| while True: |
| offset = data.find(value, offset + len(value)) |
| if offset == -1: |
| break |
| # Preferring exact matches (those following \0) over substring matches |
| # significantly increases accuracy (although shows that linker isn't |
| # being optimal). |
| if offset == 0 or data[offset - 1] == '\0': |
| break |
| if first_match == -1: |
| first_match = offset |
| first_match_dict = target_dict |
| if offset != -1: |
| break |
| if offset == -1: |
| # Exact match not found, so take suffix match if it exists. |
| offset = first_match |
| target_dict = first_match_dict |
| # Missing strings happen when optimization make them unused. |
| if offset != -1: |
| # Encode tuple as a string for easier mashalling. |
| target_dict[path].append(str(offset) + ':' + str(len(value))) |
| |
| return ret |
| |
| |
# This is a target for BulkForkAndCall().
def ResolveStringPiecesIndirect(encoded_string_addresses_by_path, string_data,
                                tool_prefix, output_directory):
  """Maps object-file string addresses onto ranges within |string_data|.

  Returns [encoded {path: [string_ranges]} for each string_section].
  """
  string_addresses_by_path = concurrent.DecodeDictOfLists(
      encoded_string_addresses_by_path)
  # |target| is the containing .a file when paths are archive members,
  # otherwise the full list of object paths.
  any_path = next(string_addresses_by_path.iterkeys())
  target = (_ExtractArchivePath(any_path) or
            string_addresses_by_path.keys())

  # Run readelf to find location of .rodata within the .o files.
  section_positions_by_path = _LookupStringSectionPositions(
      target, tool_prefix, output_directory)
  # Load the .rodata sections (from object files) as strings.
  string_sections_by_path = _ReadStringSections(
      target, output_directory, section_positions_by_path)

  path_value_gen = (
      (path, value)
      for path, object_addresses in string_addresses_by_path.iteritems()
      for value in _IterStringLiterals(
          path, object_addresses, string_sections_by_path.get(path)))

  annotated = _AnnotateStringData(string_data, path_value_gen)
  return [concurrent.EncodeDictOfLists(d) for d in annotated]
| |
| |
# This is a target for BulkForkAndCall().
def ResolveStringPieces(encoded_strings_by_path, string_data):
  """Maps literal string values onto ranges within |string_data|.

  Returns [encoded {path: [string_ranges]} for each string_section].
  """
  # ast.literal_eval() undoes repr() applied to strings.
  strings_by_path = concurrent.DecodeDictOfLists(
      encoded_strings_by_path, value_transform=ast.literal_eval)

  path_value_gen = (
      (path, string_value)
      for path, strings in strings_by_path.iteritems()
      for string_value in strings)

  annotated = _AnnotateStringData(string_data, path_value_gen)
  return [concurrent.EncodeDictOfLists(d) for d in annotated]