| #!/usr/bin/env python3 | 
 | # Copyright 2018 The Chromium Authors | 
 | # Use of this source code is governed by a BSD-style license that can be | 
 | # found in the LICENSE file. | 
 |  | 
 | """Runs bcanalyzer to extract data from LLVM Bitcode (BC) files. | 
 |  | 
 | IsBitcodeFile(): | 
 |   Reads the magic header of a file to quickly decide whether it is a BC file. | 
 |  | 
 | ParseTag(): | 
  Heuristically parses a single-line tag from bcanalyzer dump (exported for
 |   testing). | 
 |  | 
 | RunBcAnalyzerOnIntermediates(): | 
 |   BulkForkAndCall() target: Given BC file [paths], runs (llvm-)bcanalyzer on | 
 |   each path, parses the output, extracts strings, and returns {path: [strings]}. | 
 |  | 
 | This file can also be run stand-alone in order to test out the logic on smaller | 
 | sample sizes. | 
 | """ | 
 |  | 
 | import argparse | 
 | import os | 
 | import re | 
 | import subprocess | 
 |  | 
 | import parallel | 
 | import path_util | 
 |  | 
 |  | 
 | # Upper bound on number of bytes per character in strings. 4-byte / 32-bit | 
 | # strings are rare and are likely confused with 32-bit int arrays. So by | 
 | # default, only accept up to 2-byte / 16-bit strings. | 
 | _CHAR_WIDTH_LIMIT = 2 | 
 |  | 
 | _RE_SPLIT = re.compile(r'=(\d+)') | 
 | # <TYPE_BLOCK_ID> children tags that should not be counted as types. | 
 | # - <NUMENTRY> is meta data. | 
 | # - <STRUCT_NAME> with the following <STRUCT_NAMED> (or other tag) are counted | 
 | #   as a single type entry. | 
 | _NON_TYPE_TAGS = set(['NUMENTRY', 'STRUCT_NAME']) | 
 |  | 
 | # Use bit-fields for tag types: 1 => Opening tag, 2 => Closed tag. | 
 | OPENING_TAG = 1 | 
 | CLOSING_TAG = 2 | 
 | SELF_CLOSING_TAG = OPENING_TAG | CLOSING_TAG | 
 |  | 
 |  | 
 | def _IsOpeningTag(tag_type): | 
 |   return tag_type & 1 | 
 |  | 
 |  | 
 | def _IsClosingTag(tag_type): | 
 |   return tag_type & 2 | 
 |  | 
 |  | 
 | def IsBitcodeFile(path): | 
 |   try: | 
 |     with open(path, 'rb') as f: | 
 |       return f.read(4) == b'BC\xc0\xde' | 
 |   except IOError: | 
 |     return False | 
 |  | 
 |  | 
 | def ParseTag(line): | 
 |   """Heuristically parses a single-line tag from bcanalyzer dump. | 
 |  | 
 |   Since input data are machine-generated, so we only need "good enough" parsing | 
 |   logic that favors simplicity. For example, '</FOO/>' is accepted. | 
 |  | 
 |   Args: | 
 |     line: Stripped line that may have a single-line tag with trailing text. | 
 |  | 
 |   Returns: | 
 |     (tag_type, tag, attrib_pos) if successful, else (None) * 3. Details: | 
 |     tag_type: One of {OPENING_TAG, CLOSING_TAG, SELF_CLOSING_TAG}. | 
 |     tag: The tag name. | 
 |     attrib_pos: Position in |line| to start parsing attributes. | 
 |   """ | 
 |   # <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4> | 
 |   #     ==> (OPENING_TAG, 'TYPE_BLOCK_ID', 14). | 
 |   # <ARRAY abbrevid=9 op0=1 op1=7/> Trailing text! | 
 |   #     ==> (SELF_CLOSING_TAG, 'ARRAY', 6). | 
 |   # </TYPE_BLOCK_ID> | 
 |   #     ==> (CLOSING_TAG, 'TYPE_BLOCK_ID', 15). | 
 |  | 
 |   # Assumes |line| is stripped, i.e., so no indent and no trailing new line. | 
 |   if len(line) < 2 or line[0] != '<': | 
 |     return (None, None, None) | 
 |   tag_type, pos = (CLOSING_TAG, 2) if line[1] == '/' else (OPENING_TAG, 1) | 
 |   for i in range(pos, len(line)): | 
 |     if not line[i].isalnum() and line[i] != '_': | 
 |       if i == pos or not line[i] in ' >/': | 
 |         break | 
 |       end = line.find('>', i) | 
 |       if end < 0: | 
 |         break | 
 |       if line[end - 1] == '/': | 
 |         return (SELF_CLOSING_TAG, line[pos:i], i) | 
 |       return (tag_type, line[pos:i], i) | 
 |   return (None, None, None) | 
 |  | 
 |  | 
 | def _ParseOpItems(line, pos): | 
 |   """Heuristically extracts op0=# op1=# ... values from a single-line tag.""" | 
 |   # <SETTYPE abbrevid=4 op0=42/> | 
 |   #         ^ pos = 8 | 
 |   #     ==> iter([42]). | 
 |   # <CSTRING abbrevid=11 op0=84 op1=101 op2=115 op3=116 op4=56 op5=97/> | 
 |   #         ^ pos = 8 | 
 |   #     ==> iter([84, 101, 115, 116, 56, 97]). | 
 |   # <STRING abbrevid=9 op0=1 op1=0 op2=0 op3=1 op4=1 op5=0/> | 
 |   #        ^ pos = 7 | 
 |   #     ==> iter([1, 0, 0, 1, 1, 0]). | 
 |   # <DATA op0=8412 op1=101 op2=1150 op3=116 op4=5200 op5=98 op6=0/> | 
 |   #      ^ pos = 5 | 
 |   #     ==> iter([8412, 101, 1150, 116, 5200, 98, 0]). | 
 |  | 
 |   # In particular, skip 'abbrevid=#'. | 
 |   start = line.index(' op', pos) | 
 |   end = line.index('>', start) | 
 |   for t in _RE_SPLIT.finditer(line[start:end]): | 
 |     yield int(t.group(1)) | 
 |  | 
 |  | 
 | # Emits uint16 values as a stream of 2 bytes (little-endian). | 
 | def _UnpackUint16ListToBytes(items): | 
 |   for item in items: | 
 |     yield item & 0xFF | 
 |     yield (item >> 8) & 0xFF | 
 |  | 
 |  | 
 | # Emits uint32 values as a stream of 4 bytes (little-endian). | 
 | def _UnpackUint32ListToBytes(items): | 
 |   for item in items: | 
 |     yield item & 0xFF | 
 |     yield (item >> 8) & 0xFF | 
 |     yield (item >> 16) & 0xFF | 
 |     yield (item >> 24) & 0xFF | 
 |  | 
 |  | 
 | class _BcIntArrayType: | 
 |   """The specs of an integer array type.""" | 
 |  | 
 |   # Lookup table to map from width to an unpacker that splits ints into bytes. | 
 |   _UNPACKER_MAP = { | 
 |     1: iter, | 
 |     2: _UnpackUint16ListToBytes, | 
 |     4: _UnpackUint32ListToBytes | 
 |   } | 
 |  | 
 |   def __init__(self, length, width): | 
 |     # Number of elements in the array. | 
 |     self.length = length | 
 |     # Number of bytes per element. | 
 |     self.width = width | 
 |  | 
 |   def ParseOpItemsAsBytes(self, line, attrib_pos, add_null_at_end): | 
 |     """Reads op0=# op=# ... values and returns them as a list of bytes. | 
 |  | 
 |     Interprets each op0=# op1=# ... value as a |self.width|-byte integer, splits | 
 |     them into component bytes (little-endian), and returns the result as string. | 
 |  | 
 |     Args: | 
 |       line: Stripped line of single-line tag with op0=# op1=# ... data. | 
 |       attrib_pos: Position in |line| where attribute list starts. | 
 |       add_null_add_end: Whether to append |'\x00' * self.width|. | 
 |     """ | 
 |     items = _ParseOpItems(line, attrib_pos) | 
 |     unpacker = _BcIntArrayType._UNPACKER_MAP[self.width] | 
 |     s = bytes(unpacker(items)) | 
 |     if add_null_at_end: | 
 |       s += b'\x00' * self.width | 
 |     # Rather stringent check to ensure exact size match. | 
 |     assert len(s) == self.length * self.width | 
 |     return s | 
 |  | 
 |  | 
 | class _BcTypeInfo: | 
 |   """Stateful parser of <TYPE_BLOCK_ID>, specialized for integer arrays.""" | 
 |  | 
 |   # <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4> | 
 |   #   <NUMENTRY op0=9/>                  # Type ids should be in [0, 8]. | 
 |   #   <INTEGER op0=8/>                   # Type id = 0: int8. | 
 |   #   <POINTER abbrevid=4 op0=0 op1=0/>  # Type id = 1: Pointer to type id 0 | 
 |   #                                      #    ==> int8*. | 
 |   #   <ARRAY abbrevid=9 op0=4 op1=0/>    # Type id = 2: Array with 4 elements | 
 |   #                                      # of type id 0 ==> int8[4] | 
 |   #   <STRUCT_NAME op0=115 op1=116 op2=114/>  # Joins next Tag. | 
 |   #   <STRUCT_NAMED abbrevid=8 op0=0 op1=1/>  # Type id = 3: Struct (unused). | 
 |   #   <FUNCTION abbrevid=5 op0=0 op1=12/>  # Type id = 4: Function (unused). | 
 |   #   <INTEGER op0=16/>                  # Type id = 5: int16. | 
 |   #   <POINTER abbrevid=4 op0=5 op1=0/>  # Type id = 6: Pointer to type id 5 | 
 |   #                                      #    ==> int16*. | 
 |   #   <INTEGER op0=32/>                  # Type id = 7: int32. | 
 |   #   <ARRAY abbrevid=9 op0=5 op1=4/>    # Type id = 8: Array with 4 elements | 
 |   #                                      # of type id 5 ==> int16[4] | 
 |   # <TYPE_BLOCK_ID> | 
 |  | 
 |   def __init__(self): | 
 |     # Auto-incrementing current type id. | 
 |     self.cur_type_id = 0 | 
 |     # Maps from type id (of an integer) to number of bits. | 
 |     self.int_types = {} | 
 |     # Maps from type id (of an integer array) to _BcIntArrayType. | 
 |     self.int_array_types = {} | 
 |  | 
 |   def Feed(self, line, tag, attrib_pos): | 
 |     """Parses a single-line tag and store integer and integer array types. | 
 |  | 
 |     Args: | 
 |       line: Stripped line of single-line tag with op0=# op1=# ... data. | 
 |       tag: The tag type in |line| (child tag of <TYPE_BLOCK_ID>). | 
 |       attrib_pos: Position in |line| where attribute list starts. | 
 |     """ | 
 |     if tag in _NON_TYPE_TAGS: | 
 |       return | 
 |     if tag == 'INTEGER': | 
 |       num_bits = next(_ParseOpItems(line, attrib_pos))  # op0. | 
 |       self.int_types[self.cur_type_id] = num_bits | 
 |     elif tag == 'ARRAY': | 
 |       [size, item_type_id] = list(_ParseOpItems(line, attrib_pos))  # op0, op1. | 
 |       bits = self.int_types.get(item_type_id) | 
 |       if bits is not None:  # |bits| can be None for non-int arrays. | 
 |         self.int_array_types[self.cur_type_id] = _BcIntArrayType( | 
 |             size, bits // 8) | 
 |     self.cur_type_id += 1 | 
 |  | 
 |   def GetArrayType(self, idx): | 
 |     return self.int_array_types.get(idx) | 
 |  | 
 |  | 
 | def _ParseBcAnalyzer(lines): | 
 |   """A generator to extract bytes() from bcanalyzer dump of a BC file.""" | 
 |  | 
 |   # ... | 
 |   # <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4> | 
 |   #    ... (See above; parsed by _BcTypeInfo) | 
 |   # <TYPE_BLOCK_ID> | 
 |   # ... | 
 |   # <CONSTANTS_BLOCK NumWords=93 BlockCodeSize=4> | 
 |   #   <SETTYPE abbrevid=4 op0=1/>  # Current type id := 1 ==> int8*. | 
 |   #   <CE_INBOUNDS_GEP op0=3 op1=4 op2=1 op3=12 op4=57 op5=12 op6=57/> | 
 |   #   <SETTYPE abbrevid=4 op0=2/>  # Current type id := 2 ==> int8[4]. | 
 |   #   <CSTRING abbrevid=11 op0=70 op1=111 op2=111/> record string = 'Foo' | 
 |   #   <STRING abbrevid=11 op0=70 op1=111 op2=111 op3=1/>  # {'F','o','o',1}. | 
 |   #   <SETTYPE abbrevid=6 op0=7/>    # Current type id := 7 ==> int32. | 
 |   #   <INTEGER abbrevid=5 op0=2000/> # Stores 1000. | 
 |   #   <INTEGER abbrevid=5 op0=2001/> # Stores -1000. | 
 |   #   <SETTYPE abbrevid=4 op0=8/>    # Current type id := 8 ==> int16[4]. | 
 |   #   <NULL/> | 
 |   #   <DATA abbrevid=11 op0=8400 op1=10100 op2=11500 op3=0/> | 
 |   # </CONSTANTS_BLOCK> | 
 |   # ... | 
 |  | 
 |   # Notes: | 
 |   # - Only parse first <TYPE_BLOCK_ID> and first <CONSTANTS_BLOCK>. | 
 |   # - <CONSTANTS_BLOCK> is stateful: A "current type id" exists, and that's set | 
 |   #   by <SETTYPE>, with op0= referring to type id. | 
 |   #   - For array lengths one needs to refer to the corresponding <ARRAY>. | 
 |   # - Strings / arrays are in <CSTRING>, <STRING>, and <DATA>. | 
 |   #   - abbrevid=# is redundant (repeats tag type) and unused | 
 |   #   - Character data are stored in op0=# op1=# ..., one per character. These | 
 |   #     values should fit in the proper range, and can be fairly large. | 
 |   #   - <CSTRING> has implicit 0 at end. | 
 |   #   - Data lengths agree with the length in the matching <ARRAY> entry. | 
 |   #   - "record string" text is not very useful: It only appears if all | 
 |   #     characters are printable. | 
 |   # - Signed vs. unsigned types are undistinguished. | 
 |   #   - In <INTEGER>, the op0= value is stored as 2 * abs(x) + (signed ? 0 : 1). | 
 |   #   - In <ARRAY> of int, values are coerced to unsigned type. | 
 |   # - Strings and int arrays are undistinguished. | 
 |   #   - <CSTRING>: If an uint8 array happens to end with 0, then this gets used! | 
 |   # - Arrays (or integers) of all-0 appear as <NULL/>. Presumably this gets | 
 |   #   placed into .bss section. | 
 |  | 
 |   STATE_VOID = 0 | 
 |   STATE_TYPE_BLOCK = 1 | 
 |   STATE_CONST_BLOCK = 2 | 
 |   state = STATE_VOID | 
 |  | 
 |   type_info = None | 
 |   consts_cur_type = None | 
 |  | 
 |   # State machine to parse the *first* <TYPE_BLOCK_ID> to initialize | 
 |   # |type_info|, then the *first* <CONSTANTS_BLOCK> to yield strings. | 
 |   for line in lines: | 
 |     line = line.lstrip() | 
 |     (tag_type, tag, attrib_pos) = ParseTag(line) | 
 |     if tag_type is None: | 
 |       continue | 
 |     if state == STATE_VOID: | 
 |       if _IsOpeningTag(tag_type): | 
 |         if tag == 'TYPE_BLOCK_ID': | 
 |           if type_info is None: | 
 |             state = STATE_TYPE_BLOCK | 
 |             type_info = _BcTypeInfo() | 
 |         elif tag == 'CONSTANTS_BLOCK': | 
 |           if type_info is not None: | 
 |             state = STATE_CONST_BLOCK | 
 |  | 
 |     elif state == STATE_TYPE_BLOCK: | 
 |       if _IsClosingTag(tag_type) and tag == 'TYPE_BLOCK_ID': | 
 |         state = STATE_VOID | 
 |       else: | 
 |         type_info.Feed(line, tag, attrib_pos) | 
 |  | 
 |     elif state == STATE_CONST_BLOCK: | 
 |       if _IsClosingTag(tag_type) and tag == 'CONSTANTS_BLOCK': | 
 |         # Skip remaining data, including subsequent <CONSTANTS_BLOCK>s. | 
 |         break | 
 |       if tag == 'SETTYPE': | 
 |         try: | 
 |           consts_cur_type_id = next(_ParseOpItems(line, attrib_pos))  # op0. | 
 |         except StopIteration: | 
 |           return | 
 |         consts_cur_type = type_info.GetArrayType(consts_cur_type_id) | 
 |       elif consts_cur_type and consts_cur_type.width <= _CHAR_WIDTH_LIMIT: | 
 |         if tag in ['CSTRING', 'STRING', 'DATA']: | 
 |           # Exclude 32-bit / 4-byte strings since they're rarely used, and are | 
 |           # likely confused with 32-bit int arrays. | 
 |           s = consts_cur_type.ParseOpItemsAsBytes(line, attrib_pos, | 
 |                                                   tag == 'CSTRING') | 
 |           yield (consts_cur_type, s) | 
 |  | 
 |  | 
 | class _BcAnalyzerRunner: | 
 |   """Helper to run bcanalyzer and extract output lines. """ | 
 |  | 
 |   def __init__(self, output_directory): | 
 |     self._args = [ | 
 |         path_util.GetBcAnalyzerPath(), '--dump', '--disable-histogram' | 
 |     ] | 
 |     self._output_directory = output_directory | 
 |  | 
 |   def RunOnFile(self, obj_file): | 
 |     output = subprocess.check_output( | 
 |         self._args + [obj_file], cwd=self._output_directory).decode('ascii') | 
 |     return output.splitlines() | 
 |  | 
 |  | 
 | # This is a target for BulkForkAndCall(). | 
 | def RunBcAnalyzerOnIntermediates(target, output_directory): | 
 |   """Calls bcanalyzer and returns encoded map from path to strings. | 
 |  | 
 |   Args: | 
 |     target: A list of BC file paths. | 
 |   """ | 
 |   assert isinstance(target, list) | 
 |   runner = _BcAnalyzerRunner(output_directory) | 
 |   strings_by_path = {} | 
 |   for t in target: | 
 |     strings_by_path[t] = [s for _, s in _ParseBcAnalyzer(runner.RunOnFile(t))] | 
 |   # Escape strings by repr() so there will be no special characters to interfere | 
 |   # parallel.EncodeDictOfLists() and decoding. | 
 |   return parallel.EncodeDictOfLists(strings_by_path, value_transform=repr) | 
 |  | 
 |  | 
 | def main(): | 
 |   parser = argparse.ArgumentParser() | 
 |   parser.add_argument('--output-directory', default='.') | 
 |   parser.add_argument('--char-width-limit', type=int) | 
 |   parser.add_argument('objects', type=os.path.realpath, nargs='+') | 
 |  | 
 |   args = parser.parse_args() | 
 |   base_path = os.path.normpath(args.output_directory) | 
 |   runner = _BcAnalyzerRunner(args.output_directory) | 
 |   if args.char_width_limit is not None: | 
 |     global _CHAR_WIDTH_LIMIT | 
 |     _CHAR_WIDTH_LIMIT = args.char_width_limit | 
 |  | 
 |   for obj_path in args.objects: | 
 |     rel_path = os.path.relpath(obj_path, base_path) | 
 |     print('File: %s' % rel_path) | 
 |     for cur_type, s in _ParseBcAnalyzer(runner.RunOnFile(obj_path)): | 
 |       print('    char%d[%d]: %r' % (cur_type.width * 8, cur_type.length, s)) | 
 |     print('') | 
 |  | 
 |  | 
 | if __name__ == '__main__': | 
 |   main() |