| #!/usr/bin/env python |
| # Copyright 2018 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Runs bcanalyzer to extract data from LLVM Bitcode (BC) files. |
| |
| IsBitcodeFile(): |
| Reads the magic header of a file to quickly decide whether it is a BC file. |
| |
| ParseTag(): |
| Heuristically parses a single-line tag from bcanalyzer dump (exporeted for |
| testing). |
| |
| RunBcAnalyzerOnIntermediates(): |
| BulkForkAndCall() target: Given BC file [paths], runs (llvm-)bcanalyzer on |
| each path, parses the output, extracts strings, and returns {path: [strings]}. |
| |
| This file can also be run stand-alone in order to test out the logic on smaller |
| sample sizes. |
| """ |
| |
| from __future__ import print_function |
| |
| import argparse |
| import os |
| import re |
| import subprocess |
| |
| import concurrent |
| import path_util |
| |
| |
| # Upper bound on number of bytes per character in strings. 4-byte / 32-bit |
| # strings are rare and are likely confused with 32-bit int arrays. So by |
| # default, only accept up to 2-byte / 16-bit strings. |
| _CHAR_WIDTH_LIMIT = 2 |
| |
| _RE_SPLIT = re.compile(r'=(\d+)') |
| # <TYPE_BLOCK_ID> children tags that should not be counted as types. |
| # - <NUMENTRY> is meta data. |
| # - <STRUCT_NAME> with the following <STRUCT_NAMED> (or other tag) are counted |
| # as a single type entry. |
| _NON_TYPE_TAGS = set(['NUMENTRY', 'STRUCT_NAME']) |
| |
| # Use bit-fields for tag types: 1 => Opening tag, 2 => Closed tag. |
| OPENING_TAG = 1 |
| CLOSING_TAG = 2 |
| SELF_CLOSING_TAG = OPENING_TAG | CLOSING_TAG |
| |
| |
| def _IsOpeningTag(tag_type): |
| return tag_type & 1 |
| |
| |
| def _IsClosingTag(tag_type): |
| return tag_type & 2 |
| |
| |
| def IsBitcodeFile(path): |
| try: |
| with open(path, 'rb') as f: |
| return f.read(4) == 'BC\xc0\xde' |
| except IOError: |
| return False |
| |
| |
| def ParseTag(line): |
| """Heuristically parses a single-line tag from bcanalyzer dump. |
| |
| Since input data are machine-generated, so we only need "good enough" parsing |
| logic that favors simplicity. For example, '</FOO/>' is accepted. |
| |
| Args: |
| line: Stripped line that may have a single-line tag with trailing text. |
| |
| Returns: |
| (tag_type, tag, attrib_pos) if successful, else (None) * 3. Details: |
| tag_type: One of {OPENING_TAG, CLOSING_TAG, SELF_CLOSING_TAG}. |
| tag: The tag name. |
| attrib_pos: Position in |line| to start parsing attributes. |
| """ |
| # <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4> |
| # ==> (OPENING_TAG, 'TYPE_BLOCK_ID', 14). |
| # <ARRAY abbrevid=9 op0=1 op1=7/> Trailing text! |
| # ==> (SELF_CLOSING_TAG, 'ARRAY', 6). |
| # </TYPE_BLOCK_ID> |
| # ==> (CLOSING_TAG, 'TYPE_BLOCK_ID', 15). |
| |
| # Assumes |line| is stripped, i.e., so no indent and no trailing new line. |
| if len(line) < 2 or line[0] != '<': |
| return (None, None, None) |
| tag_type, pos = (CLOSING_TAG, 2) if line[1] == '/' else (OPENING_TAG, 1) |
| for i in xrange(pos, len(line)): |
| if not line[i].isalnum() and line[i] != '_': |
| if i == pos or not line[i] in ' >/': |
| break |
| end = line.find('>', i) |
| if end < 0: |
| break |
| if line[end - 1] == '/': |
| return (SELF_CLOSING_TAG, line[pos:i], i) |
| return (tag_type, line[pos:i], i) |
| return (None, None, None) |
| |
| |
| def _ParseOpItems(line, pos): |
| """Heuristically extracts op0=# op1=# ... values from a single-line tag.""" |
| # <SETTYPE abbrevid=4 op0=42/> |
| # ^ pos = 8 |
| # ==> iter([42]). |
| # <CSTRING abbrevid=11 op0=84 op1=101 op2=115 op3=116 op4=56 op5=97/> |
| # ^ pos = 8 |
| # ==> iter([84, 101, 115, 116, 56, 97]). |
| # <STRING abbrevid=9 op0=1 op1=0 op2=0 op3=1 op4=1 op5=0/> |
| # ^ pos = 7 |
| # ==> iter([1, 0, 0, 1, 1, 0]). |
| # <DATA op0=8412 op1=101 op2=1150 op3=116 op4=5200 op5=98 op6=0/> |
| # ^ pos = 5 |
| # ==> iter([8412, 101, 1150, 116, 5200, 98, 0]). |
| |
| # In particular, skip 'abbrevid=#'. |
| start = line.index(' op', pos) |
| end = line.index('>', start) |
| for t in _RE_SPLIT.finditer(line[start:end]): |
| yield int(t.group(1)) |
| |
| |
| # Emits uint16 values as a stream of 2 bytes (little-endian). |
| def _UnpackUint16ListToBytes(items): |
| for item in items: |
| yield item & 0xFF |
| yield (item >> 8) & 0xFF |
| |
| |
| # Emits uint32 values as a stream of 4 bytes (little-endian). |
| def _UnpackUint32ListToBytes(items): |
| for item in items: |
| yield item & 0xFF |
| yield (item >> 8) & 0xFF |
| yield (item >> 16) & 0xFF |
| yield (item >> 24) & 0xFF |
| |
| |
| class _BcIntArrayType: |
| """The specs of an integer array type.""" |
| |
| # Lookup table to map from width to an unpacker that splits ints into bytes. |
| _UNPACKER_MAP = { |
| 1: iter, |
| 2: _UnpackUint16ListToBytes, |
| 4: _UnpackUint32ListToBytes |
| } |
| |
| def __init__(self, length, width): |
| # Number of elements in the array. |
| self.length = length |
| # Number of bytes per element. |
| self.width = width |
| |
| def ParseOpItemsAsString(self, line, attrib_pos, add_null_at_end): |
| """Reads op0=# op=# ... values and returns them as a list of bytes. |
| |
| Interprets each op0=# op1=# ... value as a |self.width|-byte integer, splits |
| them into component bytes (little-endian), and returns the result as string. |
| |
| Args: |
| line: Stripped line of single-line tag with op0=# op1=# ... data. |
| attrib_pos: Position in |line| where attribute list starts. |
| add_null_add_end: Whether to append |'\x00' * self.width|. |
| """ |
| items = _ParseOpItems(line, attrib_pos) |
| unpacker = _BcIntArrayType._UNPACKER_MAP[self.width] |
| s = ''.join(chr(t) for t in unpacker(items)) |
| if add_null_at_end: |
| s += '\x00' * self.width |
| # Rather stringent check to ensure exact size match. |
| assert len(s) == self.length * self.width |
| return s |
| |
| |
| class _BcTypeInfo: |
| """Stateful parser of <TYPE_BLOCK_ID>, specialized for integer arrays.""" |
| |
| # <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4> |
| # <NUMENTRY op0=9/> # Type ids should be in [0, 8]. |
| # <INTEGER op0=8/> # Type id = 0: int8. |
| # <POINTER abbrevid=4 op0=0 op1=0/> # Type id = 1: Pointer to type id 0 |
| # # ==> int8*. |
| # <ARRAY abbrevid=9 op0=4 op1=0/> # Type id = 2: Array with 4 elements |
| # # of type id 0 ==> int8[4] |
| # <STRUCT_NAME op0=115 op1=116 op2=114/> # Joins next Tag. |
| # <STRUCT_NAMED abbrevid=8 op0=0 op1=1/> # Type id = 3: Struct (unused). |
| # <FUNCTION abbrevid=5 op0=0 op1=12/> # Type id = 4: Function (unused). |
| # <INTEGER op0=16/> # Type id = 5: int16. |
| # <POINTER abbrevid=4 op0=5 op1=0/> # Type id = 6: Pointer to type id 5 |
| # # ==> int16*. |
| # <INTEGER op0=32/> # Type id = 7: int32. |
| # <ARRAY abbrevid=9 op0=5 op1=4/> # Type id = 8: Array with 4 elements |
| # # of type id 5 ==> int16[4] |
| # <TYPE_BLOCK_ID> |
| |
| def __init__(self): |
| # Auto-incrementing current type id. |
| self.cur_type_id = 0 |
| # Maps from type id (of an integer) to number of bits. |
| self.int_types = {} |
| # Maps from type id (of an integer array) to _BcIntArrayType. |
| self.int_array_types = {} |
| |
| def Feed(self, line, tag, attrib_pos): |
| """Parses a single-line tag and store integer and integer array types. |
| |
| Args: |
| line: Stripped line of single-line tag with op0=# op1=# ... data. |
| tag: The tag type in |line| (child tag of <TYPE_BLOCK_ID>). |
| attrib_pos: Position in |line| where attribute list starts. |
| """ |
| if tag in _NON_TYPE_TAGS: |
| return |
| if tag == 'INTEGER': |
| num_bits = next(_ParseOpItems(line, attrib_pos)) # op0. |
| self.int_types[self.cur_type_id] = num_bits |
| elif tag == 'ARRAY': |
| [size, item_type_id] = list(_ParseOpItems(line, attrib_pos)) # op0, op1. |
| bits = self.int_types.get(item_type_id) |
| if bits is not None: # |bits| can be None for non-int arrays. |
| self.int_array_types[self.cur_type_id] = _BcIntArrayType(size, bits / 8) |
| self.cur_type_id += 1 |
| |
| def GetArrayType(self, idx): |
| return self.int_array_types.get(idx) |
| |
| |
| def _ParseBcAnalyzer(lines): |
| """A generator to extract strings from bcanalyzer dump of a BC file.""" |
| |
| # ... |
| # <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4> |
| # ... (See above; parsed by _BcTypeInfo) |
| # <TYPE_BLOCK_ID> |
| # ... |
| # <CONSTANTS_BLOCK NumWords=93 BlockCodeSize=4> |
| # <SETTYPE abbrevid=4 op0=1/> # Current type id := 1 ==> int8*. |
| # <CE_INBOUNDS_GEP op0=3 op1=4 op2=1 op3=12 op4=57 op5=12 op6=57/> |
| # <SETTYPE abbrevid=4 op0=2/> # Current type id := 2 ==> int8[4]. |
| # <CSTRING abbrevid=11 op0=70 op1=111 op2=111/> record string = 'Foo' |
| # <STRING abbrevid=11 op0=70 op1=111 op2=111 op3=1/> # {'F','o','o',1}. |
| # <SETTYPE abbrevid=6 op0=7/> # Current type id := 7 ==> int32. |
| # <INTEGER abbrevid=5 op0=2000/> # Stores 1000. |
| # <INTEGER abbrevid=5 op0=2001/> # Stores -1000. |
| # <SETTYPE abbrevid=4 op0=8/> # Current type id := 8 ==> int16[4]. |
| # <NULL/> |
| # <DATA abbrevid=11 op0=8400 op1=10100 op2=11500 op3=0/> |
| # </CONSTANTS_BLOCK> |
| # ... |
| |
| # Notes: |
| # - Only parse first <TYPE_BLOCK_ID> and first <CONSTANTS_BLOCK>. |
| # - <CONSTANTS_BLOCK> is stateful: A "current type id" exists, and that's set |
| # by <SETTYPE>, with op0= referring to type id. |
| # - For array lengths one needs to refer to the corresponding <ARRAY>. |
| # - Strings / arrays are in <CSTRING>, <STRING>, and <DATA>. |
| # - abbrevid=# is redundant (repeats tag type) and unused |
| # - Character data are stored in op0=# op1=# ..., one per character. These |
| # values should fit in the proper range, and can be fairly large. |
| # - <CSTRING> has implicit 0 at end. |
| # - Data lengths agree with the length in the matching <ARRAY> entry. |
| # - "record string" text is not very useful: It only appears if all |
| # characters are printable. |
| # - Signed vs. unsigned types are undistinguished. |
| # - In <INTEGER>, the op0= value is stored as 2 * abs(x) + (signed ? 0 : 1). |
| # - In <ARRAY> of int, values are coerced to unsigned type. |
| # - Strings and int arrays are undistinguished. |
| # - <CSTRING>: If an uint8 array happens to end with 0, then this gets used! |
| # - Arrays (or integers) of all-0 appear as <NULL/>. Presumably this gets |
| # placed into .bss section. |
| |
| STATE_VOID = 0 |
| STATE_TYPE_BLOCK = 1 |
| STATE_CONST_BLOCK = 2 |
| state = STATE_VOID |
| |
| type_info = None |
| consts_cur_type = None |
| |
| # State machine to parse the *first* <TYPE_BLOCK_ID> to initialize |
| # |type_info|, then the *first* <CONSTANTS_BLOCK> to yield strings. |
| for line in lines: |
| line = line.lstrip() |
| (tag_type, tag, attrib_pos) = ParseTag(line) |
| if tag_type is None: |
| continue |
| if state == STATE_VOID: |
| if _IsOpeningTag(tag_type): |
| if tag == 'TYPE_BLOCK_ID': |
| if type_info is None: |
| state = STATE_TYPE_BLOCK |
| type_info = _BcTypeInfo() |
| elif tag == 'CONSTANTS_BLOCK': |
| if type_info is not None: |
| state = STATE_CONST_BLOCK |
| |
| elif state == STATE_TYPE_BLOCK: |
| if _IsClosingTag(tag_type) and tag == 'TYPE_BLOCK_ID': |
| state = STATE_VOID |
| else: |
| type_info.Feed(line, tag, attrib_pos) |
| |
| elif state == STATE_CONST_BLOCK: |
| if _IsClosingTag(tag_type) and tag == 'CONSTANTS_BLOCK': |
| # Skip remaining data, including subsequent <CONSTANTS_BLOCK>s. |
| break |
| elif tag == 'SETTYPE': |
| consts_cur_type_id = next(_ParseOpItems(line, attrib_pos)) # op0. |
| consts_cur_type = type_info.GetArrayType(consts_cur_type_id) |
| elif consts_cur_type and consts_cur_type.width <= _CHAR_WIDTH_LIMIT: |
| if tag in ['CSTRING', 'STRING', 'DATA']: |
| # Exclude 32-bit / 4-byte strings since they're rarely used, and are |
| # likely confused with 32-bit int arrays. |
| s = consts_cur_type.ParseOpItemsAsString( |
| line, attrib_pos, tag == 'CSTRING') |
| yield (consts_cur_type, s) |
| |
| |
| class _BcAnalyzerRunner: |
| """Helper to run bcanalyzer and extract output lines. """ |
| def __init__(self, tool_prefix, output_directory): |
| self._args = [path_util.GetBcAnalyzerPath(tool_prefix), '--dump', |
| '--disable-histogram'] |
| self._output_directory = output_directory |
| |
| def RunOnFile(self, obj_file): |
| output = subprocess.check_output(self._args + [obj_file], |
| cwd=self._output_directory) |
| return output.splitlines() |
| |
| |
| # This is a target for BulkForkAndCall(). |
| def RunBcAnalyzerOnIntermediates(target, tool_prefix, output_directory): |
| """Calls bcanalyzer and returns encoded map from path to strings. |
| |
| Args: |
| target: A list of BC file paths. |
| """ |
| assert isinstance(target, list) |
| runner = _BcAnalyzerRunner(tool_prefix, output_directory) |
| strings_by_path = {} |
| for t in target: |
| strings_by_path[t] = [s for _, s in _ParseBcAnalyzer(runner.RunOnFile(t))] |
| # Escape strings by repr() so there will be no special characters to interfere |
| # concurrent.EncodeDictOfLists() and decoding. |
| return concurrent.EncodeDictOfLists(strings_by_path, value_transform=repr) |
| |
| |
| def main(): |
| parser = argparse.ArgumentParser() |
| parser.add_argument('--tool-prefix', required=True) |
| parser.add_argument('--output-directory', default='.') |
| parser.add_argument('--char-width-limit', type=int) |
| parser.add_argument('objects', type=os.path.realpath, nargs='+') |
| |
| args = parser.parse_args() |
| base_path = os.path.normpath(args.output_directory) |
| runner = _BcAnalyzerRunner(args.tool_prefix, args.output_directory) |
| if args.char_width_limit is not None: |
| global _CHAR_WIDTH_LIMIT |
| _CHAR_WIDTH_LIMIT = args.char_width_limit |
| |
| for obj_path in args.objects: |
| rel_path = os.path.relpath(obj_path, base_path) |
| print('File: %s' % rel_path) |
| for cur_type, s in _ParseBcAnalyzer(runner.RunOnFile(obj_path)): |
| print(' char%d[%d]: %r' % (cur_type.width * 8, cur_type.length, s)) |
| print('') |
| |
| |
| if __name__ == '__main__': |
| main() |