| # Copyright 2022 The Chromium Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| """Utilities for parsing Dalvik bytecode.""" |
| |
| import collections |
| import struct |
| |
| # Dalvik bytecode specs, copied from the first two columns of the table in: |
| # https://source.android.com/docs/core/runtime/dalvik-bytecode#instructions |
| # with minor modifications (comments truncated). |
| _DALVIK_BYTECODE_SPECS = """00 10x nop |
| 01 12x move vA, vB |
| 02 22x move/from16 vAA, vBBBB |
| 03 32x move/16 vAAAA, vBBBB |
| 04 12x move-wide vA, vB |
| 05 22x move-wide/from16 vAA, vBBBB |
| 06 32x move-wide/16 vAAAA, vBBBB |
| 07 12x move-object vA, vB |
| 08 22x move-object/from16 vAA, vBBBB |
| 09 32x move-object/16 vAAAA, vBBBB |
| 0a 11x move-result vAA |
| 0b 11x move-result-wide vAA |
| 0c 11x move-result-object vAA |
| 0d 11x move-exception vAA |
| 0e 10x return-void |
| 0f 11x return vAA |
| 10 11x return-wide vAA |
| 11 11x return-object vAA |
| 12 11n const/4 vA, #+B |
| 13 21s const/16 vAA, #+BBBB |
| 14 31i const vAA, #+BBBBBBBB |
| 15 21h const/high16 vAA, #+BBBB0000 |
| 16 21s const-wide/16 vAA, #+BBBB |
| 17 31i const-wide/32 vAA, #+BBBBBBBB |
| 18 51l const-wide vAA, #+BBBBBBBBBBBBBBBB |
| 19 21h const-wide/high16 vAA, #+BBBB000000000000 |
| 1a 21c const-string vAA, string@BBBB |
| 1b 31c const-string/jumbo vAA, string@BBBBBBBB |
| 1c 21c const-class vAA, type@BBBB |
| 1d 11x monitor-enter vAA |
| 1e 11x monitor-exit vAA |
| 1f 21c check-cast vAA, type@BBBB |
| 20 22c instance-of vA, vB, type@CCCC |
| 21 12x array-length vA, vB |
| 22 21c new-instance vAA, type@BBBB |
| 23 22c new-array vA, vB, type@CCCC |
| 24 35c filled-new-array {vC, vD, vE, vF, vG}, type@BBBB |
| 25 3rc filled-new-array/range {vCCCC .. vNNNN}, type@BBBB |
| 26 31t fill-array-data vAA, +BBBBBBBB (with supplemental data...) |
| 27 11x throw vAA |
| 28 10t goto +AA |
| 29 20t goto/16 +AAAA |
| 2a 30t goto/32 +AAAAAAAA |
| 2b 31t packed-switch vAA, +BBBBBBBB (with supplemental data...) |
| 2c 31t sparse-switch vAA, +BBBBBBBB (with supplemental data...) |
| 2d..31 23x cmpkind vAA, vBB, vCC |
| 2d: cmpl-float (lt bias) |
| 2e: cmpg-float (gt bias) |
| 2f: cmpl-double (lt bias) |
| 30: cmpg-double (gt bias) |
| 31: cmp-long |
| 32..37 22t if-test vA, vB, +CCCC |
| 32: if-eq |
| 33: if-ne |
| 34: if-lt |
| 35: if-ge |
| 36: if-gt |
| 37: if-le |
| 38..3d 21t if-testz vAA, +BBBB |
| 38: if-eqz |
| 39: if-nez |
| 3a: if-ltz |
| 3b: if-gez |
| 3c: if-gtz |
| 3d: if-lez |
| 3e..43 10x (unused) |
| 44..51 23x arrayop vAA, vBB, vCC |
| 44: aget |
| 45: aget-wide |
| 46: aget-object |
| 47: aget-boolean |
| 48: aget-byte |
| 49: aget-char |
| 4a: aget-short |
| 4b: aput |
| 4c: aput-wide |
| 4d: aput-object |
| 4e: aput-boolean |
| 4f: aput-byte |
| 50: aput-char |
| 51: aput-short |
| 52..5f 22c iinstanceop vA, vB, field@CCCC |
| 52: iget |
| 53: iget-wide |
| 54: iget-object |
| 55: iget-boolean |
| 56: iget-byte |
| 57: iget-char |
| 58: iget-short |
| 59: iput |
| 5a: iput-wide |
| 5b: iput-object |
| 5c: iput-boolean |
| 5d: iput-byte |
| 5e: iput-char |
| 5f: iput-short |
| 60..6d 21c sstaticop vAA, field@BBBB |
| 60: sget |
| 61: sget-wide |
| 62: sget-object |
| 63: sget-boolean |
| 64: sget-byte |
| 65: sget-char |
| 66: sget-short |
| 67: sput |
| 68: sput-wide |
| 69: sput-object |
| 6a: sput-boolean |
| 6b: sput-byte |
| 6c: sput-char |
| 6d: sput-short |
| 6e..72 35c invoke-kind {vC, vD, vE, vF, vG}, meth@BBBB |
| 6e: invoke-virtual |
| 6f: invoke-super |
| 70: invoke-direct |
| 71: invoke-static |
| 72: invoke-interface |
| 73 10x (unused) |
| 74..78 3rc invoke-kind/range {vCCCC .. vNNNN}, meth@BBBB |
| 74: invoke-virtual/range |
| 75: invoke-super/range |
| 76: invoke-direct/range |
| 77: invoke-static/range |
| 78: invoke-interface/range |
| 79..7a 10x (unused) |
| 7b..8f 12x unop vA, vB |
| 7b: neg-int |
| 7c: not-int |
| 7d: neg-long |
| 7e: not-long |
| 7f: neg-float |
| 80: neg-double |
| 81: int-to-long |
| 82: int-to-float |
| 83: int-to-double |
| 84: long-to-int |
| 85: long-to-float |
| 86: long-to-double |
| 87: float-to-int |
| 88: float-to-long |
| 89: float-to-double |
| 8a: double-to-int |
| 8b: double-to-long |
| 8c: double-to-float |
| 8d: int-to-byte |
| 8e: int-to-char |
| 8f: int-to-short |
| 90..af 23x binop vAA, vBB, vCC |
| 90: add-int |
| 91: sub-int |
| 92: mul-int |
| 93: div-int |
| 94: rem-int |
| 95: and-int |
| 96: or-int |
| 97: xor-int |
| 98: shl-int |
| 99: shr-int |
| 9a: ushr-int |
| 9b: add-long |
| 9c: sub-long |
| 9d: mul-long |
| 9e: div-long |
| 9f: rem-long |
| a0: and-long |
| a1: or-long |
| a2: xor-long |
| a3: shl-long |
| a4: shr-long |
| a5: ushr-long |
| a6: add-float |
| a7: sub-float |
| a8: mul-float |
| a9: div-float |
| aa: rem-float |
| ab: add-double |
| ac: sub-double |
| ad: mul-double |
| ae: div-double |
| af: rem-double |
| b0..cf 12x binop/2addr vA, vB |
| b0: add-int/2addr |
| b1: sub-int/2addr |
| b2: mul-int/2addr |
| b3: div-int/2addr |
| b4: rem-int/2addr |
| b5: and-int/2addr |
| b6: or-int/2addr |
| b7: xor-int/2addr |
| b8: shl-int/2addr |
| b9: shr-int/2addr |
| ba: ushr-int/2addr |
| bb: add-long/2addr |
| bc: sub-long/2addr |
| bd: mul-long/2addr |
| be: div-long/2addr |
| bf: rem-long/2addr |
| c0: and-long/2addr |
| c1: or-long/2addr |
| c2: xor-long/2addr |
| c3: shl-long/2addr |
| c4: shr-long/2addr |
| c5: ushr-long/2addr |
| c6: add-float/2addr |
| c7: sub-float/2addr |
| c8: mul-float/2addr |
| c9: div-float/2addr |
| ca: rem-float/2addr |
| cb: add-double/2addr |
| cc: sub-double/2addr |
| cd: mul-double/2addr |
| ce: div-double/2addr |
| cf: rem-double/2addr |
| d0..d7 22s binop/lit16 vA, vB, #+CCCC |
| d0: add-int/lit16 |
| d1: rsub-int (reverse subtract) |
| d2: mul-int/lit16 |
| d3: div-int/lit16 |
| d4: rem-int/lit16 |
| d5: and-int/lit16 |
| d6: or-int/lit16 |
| d7: xor-int/lit16 |
| d8..e2 22b binop/lit8 vAA, vBB, #+CC |
| d8: add-int/lit8 |
| d9: rsub-int/lit8 |
| da: mul-int/lit8 |
| db: div-int/lit8 |
| dc: rem-int/lit8 |
| dd: and-int/lit8 |
| de: or-int/lit8 |
| df: xor-int/lit8 |
| e0: shl-int/lit8 |
| e1: shr-int/lit8 |
| e2: ushr-int/lit8 |
| e3..f9 10x (unused) |
| fa 45cc invoke-polymorphic {vC, vD, vE, vF, vG}, meth@BBBB, proto@HHHH |
| fb 4rcc invoke-polymorphic/range {vCCCC .. vNNNN}, meth@BBBB, proto@HHHH |
| fc 35c invoke-custom {vC, vD, vE, vF, vG}, call_site@BBBB |
| fd 3rc invoke-custom/range {vCCCC .. vNNNN}, call_site@BBBB |
| fe 21c const-method-handle vAA, method_handle@BBBB |
| ff 21c const-method-type vAA, proto@BBBB |
| """ |
| |
# Description of one Dalvik opcode:
#   op: Opcode value, in [0, 255].
#   size: Instruction size in bytes (first digit of |format|, times 2).
#   format: Instruction format ID, e.g., '21h'.
#   name: Mnemonic, e.g., 'const/high16', or None if the opcode is unused.
#   params: Operand syntax, e.g., 'vAA, #+BBBB0000' ('' if the instruction
#     takes no operands), or None if the opcode is unused.
DalvikByteCode = collections.namedtuple('DalvikByteCode',
                                        'op,size,format,name,params')


def _ParseByteCodeSpecs(specs=None):
  """Parses a Dalvik bytecode spec table into a DalvikByteCode array.

  A trailing ' (...)' comment is dropped from each line, after which a line is
  one of:
    * Standalone op, e.g.: '15 21h const/high16 vAA, #+BBBB0000'.
    * Op range, e.g.: 'b0..cf 12x binop/2addr vA, vB'. The format and params
      apply to every op in the range; per-op names come from sub-entries.
    * Range sub-entry, e.g.: 'b0: add-int/2addr'. Names one op of the range
      that is currently open.
  Lines that only have an op (or op range) and a format, e.g., '73 10x' left
  from '73 10x (unused)', describe unused ops: |name| and |params| stay None.

  Args:
    specs: Text of the spec table. If None, _DALVIK_BYTECODE_SPECS is parsed.

  Returns:
    A list of 256 DalvikByteCode, indexed by opcode.

  Raises:
    AssertionError: If a line is malformed, a sub-entry lies outside the open
      range, or some opcode has no spec.
  """
  if specs is None:
    specs = _DALVIK_BYTECODE_SPECS
  format_map = [None] * 256
  name_map = [None] * 256
  params_map = [None] * 256
  (op_lo, op_hi) = (None, None)
  for line in specs.splitlines():
    comment_pos = line.find(' (')
    if comment_pos >= 0:
      line = line[:comment_pos]
    assert len(line) >= 5, 'Malformed spec line: %r' % line
    if line[2] == ':':
      # Inside op range, e.g.: 'b0: add-int/2addr'.
      # ['b0', 'add-int/2addr'].
      toks = line.split(': ')
      assert len(toks) == 2, 'Malformed sub-entry: %r' % line
      op = int(toks[0], 16)
      # Check for an open range explicitly: comparing None with an int would
      # raise TypeError instead of a meaningful assertion failure.
      assert op_lo is not None and op_lo <= op <= op_hi, (
          'Sub-entry outside of op range: %r' % line)
      name_map[op] = toks[1]  # 'add-int/2addr'.
      if op == op_hi:
        op_lo = op_hi = None
    elif line[2:4] == '..':
      # Define op range, e.g.: 'b0..cf 12x binop/2addr vA, vB'.
      # ['b0..cf', '12x', 'binop/2addr', 'vA, vB'].
      toks = line.split(maxsplit=3)
      # (0xb0, 0xcf).
      (op_lo, op_hi) = (int(t, 16) for t in toks[0].split('..'))
      is_used = len(toks) > 2
      # A range can have a name but no params; mirror the standalone case
      # and use '' instead of indexing past the end of |toks|.
      range_params = toks[3] if len(toks) >= 4 else ''  # 'vA, vB'.
      for op in range(op_lo, op_hi + 1):
        format_map[op] = toks[1]  # '12x'.
        if is_used:
          params_map[op] = range_params
    else:
      # Standalone op, e.g.: '15 21h const/high16 vAA, #+BBBB0000'.
      # ['15', '21h', 'const/high16', 'vAA, #+BBBB0000'].
      toks = line.split(maxsplit=3)
      op = int(toks[0], 16)
      format_map[op] = toks[1]  # '21h'.
      if len(toks) > 2:  # If not unused.
        name_map[op] = toks[2]  # 'const/high16'.
        # 'vAA, #+BBBB0000'.
        params_map[op] = toks[3] if len(toks) >= 4 else ''

  ret = []
  for op in range(256):
    op_format = format_map[op]
    assert op_format is not None, 'No spec for opcode 0x%02x' % op
    # First character of the format is the size in 2-byte code units.
    size = int(op_format[0]) * 2  # '21h' -> 4.
    ret.append(
        DalvikByteCode(op, size, op_format, name_map[op], params_map[op]))
  return ret
| |
| |
# List of 256 DalvikByteCode entries, indexed by opcode, parsed once from
# _DALVIK_BYTECODE_SPECS when this module is loaded.
DALVIK_INSTRUCTIONS = _ParseByteCodeSpecs()
| |
| |
def Split(insns):
  """Yields each Dalvik instruction in |insns| as its own slice of bytes.

  Only the work needed to find instruction boundaries is done; callers are
  expected to filter and/or disassemble the yielded bytes themselves.
  Supplemental data of 31t instructions {fill-array-data, packed-switch,
  sparse-switch} is assumed to be located at the end of |insns|. It is
  detected and not yielded.

  Args:
    insns: Even-length bytearray data containing valid Dalvik code.

  Yields:
    Slices of |insns|, one per instruction, in order of appearance.
  """
  code_end = len(insns)
  assert code_end % 2 == 0
  cursor = 0
  while cursor < code_end:
    spec = DALVIK_INSTRUCTIONS[insns[cursor]]
    next_cursor = cursor + spec.size
    piece = insns[cursor:next_cursor]
    if spec.format == '31t':
      # The 4 bytes at offset 2 hold the position of the supplemental data,
      # in 2-byte units relative to the start of this instruction. Code ends
      # where the earliest supplemental data begins.
      (rel_units, ) = struct.unpack_from('<L', piece, 2)
      data_start = cursor + rel_units * 2
      if data_start < code_end:
        code_end = data_start
    yield piece
    cursor = next_cursor
  # Anything at or past |code_end| is supplemental data and is not emitted.