tools/binary_size/libsupersize/dalvik_bytecode.py - chromium/src - Git at Google

 # Copyright 2022 The Chromium Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 """Utilities for parsing Dalvik bytecode."""

 import collections
 import struct

 # Dalvik bytecode specs, copied from the first two columns of the table in:
 #   https://source.android.com/docs/core/runtime/dalvik-bytecode#instructions
 # with minor modification (truncating comments).
 #
 # Line shapes consumed by _ParseByteCodeSpecs():
 #   'OP FORMAT NAME PARAMS'       Standalone opcode (OP is two hex digits).
 #   'LO..HI FORMAT GROUP PARAMS'  Inclusive opcode range sharing FORMAT/PARAMS.
 #   'OP: NAME'                    Names one opcode inside the preceding range.
 # Everything from the first ' (' onward is dropped as a comment, so '(unused)'
 # entries end up with a format only. The leading digit of FORMAT is doubled to
 # obtain the instruction size in bytes.
 _DALVIK_BYTECODE_SPECS = """00 10x  nop
 01 12x  move vA, vB
 02 22x  move/from16 vAA, vBBBB
 03 32x  move/16 vAAAA, vBBBB
 04 12x  move-wide vA, vB
 05 22x  move-wide/from16 vAA, vBBBB
 06 32x  move-wide/16 vAAAA, vBBBB
 07 12x  move-object vA, vB
 08 22x  move-object/from16 vAA, vBBBB
 09 32x  move-object/16 vAAAA, vBBBB
 0a 11x  move-result vAA
 0b 11x  move-result-wide vAA
 0c 11x  move-result-object vAA
 0d 11x  move-exception vAA
 0e 10x  return-void
 0f 11x  return vAA
 10 11x  return-wide vAA
 11 11x  return-object vAA
 12 11n  const/4 vA, #+B
 13 21s  const/16 vAA, #+BBBB
 14 31i  const vAA, #+BBBBBBBB
 15 21h  const/high16 vAA, #+BBBB0000
 16 21s  const-wide/16 vAA, #+BBBB
 17 31i  const-wide/32 vAA, #+BBBBBBBB
 18 51l  const-wide vAA, #+BBBBBBBBBBBBBBBB
 19 21h  const-wide/high16 vAA, #+BBBB000000000000
 1a 21c  const-string vAA, string@BBBB
 1b 31c  const-string/jumbo vAA, string@BBBBBBBB
 1c 21c  const-class vAA, type@BBBB
 1d 11x  monitor-enter vAA
 1e 11x  monitor-exit vAA
 1f 21c  check-cast vAA, type@BBBB
 20 22c  instance-of vA, vB, type@CCCC
 21 12x  array-length vA, vB
 22 21c  new-instance vAA, type@BBBB
 23 22c  new-array vA, vB, type@CCCC
 24 35c  filled-new-array {vC, vD, vE, vF, vG}, type@BBBB
 25 3rc  filled-new-array/range {vCCCC .. vNNNN}, type@BBBB
 26 31t  fill-array-data vAA, +BBBBBBBB (with supplemental data...)
 27 11x  throw vAA
 28 10t  goto +AA
 29 20t  goto/16 +AAAA
 2a 30t  goto/32 +AAAAAAAA
 2b 31t  packed-switch vAA, +BBBBBBBB (with supplemental data...)
 2c 31t  sparse-switch vAA, +BBBBBBBB (with supplemental data...)
 2d..31 23x  cmpkind vAA, vBB, vCC
 2d: cmpl-float (lt bias)
 2e: cmpg-float (gt bias)
 2f: cmpl-double (lt bias)
 30: cmpg-double (gt bias)
 31: cmp-long
 32..37 22t  if-test vA, vB, +CCCC
 32: if-eq
 33: if-ne
 34: if-lt
 35: if-ge
 36: if-gt
 37: if-le
 38..3d 21t  if-testz vAA, +BBBB
 38: if-eqz
 39: if-nez
 3a: if-ltz
 3b: if-gez
 3c: if-gtz
 3d: if-lez
 3e..43 10x  (unused)
 44..51 23x  arrayop vAA, vBB, vCC
 44: aget
 45: aget-wide
 46: aget-object
 47: aget-boolean
 48: aget-byte
 49: aget-char
 4a: aget-short
 4b: aput
 4c: aput-wide
 4d: aput-object
 4e: aput-boolean
 4f: aput-byte
 50: aput-char
 51: aput-short
 52..5f 22c  iinstanceop vA, vB, field@CCCC
 52: iget
 53: iget-wide
 54: iget-object
 55: iget-boolean
 56: iget-byte
 57: iget-char
 58: iget-short
 59: iput
 5a: iput-wide
 5b: iput-object
 5c: iput-boolean
 5d: iput-byte
 5e: iput-char
 5f: iput-short
 60..6d 21c  sstaticop vAA, field@BBBB
 60: sget
 61: sget-wide
 62: sget-object
 63: sget-boolean
 64: sget-byte
 65: sget-char
 66: sget-short
 67: sput
 68: sput-wide
 69: sput-object
 6a: sput-boolean
 6b: sput-byte
 6c: sput-char
 6d: sput-short
 6e..72 35c  invoke-kind {vC, vD, vE, vF, vG}, meth@BBBB
 6e: invoke-virtual
 6f: invoke-super
 70: invoke-direct
 71: invoke-static
 72: invoke-interface
 73 10x  (unused)
 74..78 3rc  invoke-kind/range {vCCCC .. vNNNN}, meth@BBBB
 74: invoke-virtual/range
 75: invoke-super/range
 76: invoke-direct/range
 77: invoke-static/range
 78: invoke-interface/range
 79..7a 10x  (unused)
 7b..8f 12x  unop vA, vB
 7b: neg-int
 7c: not-int
 7d: neg-long
 7e: not-long
 7f: neg-float
 80: neg-double
 81: int-to-long
 82: int-to-float
 83: int-to-double
 84: long-to-int
 85: long-to-float
 86: long-to-double
 87: float-to-int
 88: float-to-long
 89: float-to-double
 8a: double-to-int
 8b: double-to-long
 8c: double-to-float
 8d: int-to-byte
 8e: int-to-char
 8f: int-to-short
 90..af 23x  binop vAA, vBB, vCC
 90: add-int
 91: sub-int
 92: mul-int
 93: div-int
 94: rem-int
 95: and-int
 96: or-int
 97: xor-int
 98: shl-int
 99: shr-int
 9a: ushr-int
 9b: add-long
 9c: sub-long
 9d: mul-long
 9e: div-long
 9f: rem-long
 a0: and-long
 a1: or-long
 a2: xor-long
 a3: shl-long
 a4: shr-long
 a5: ushr-long
 a6: add-float
 a7: sub-float
 a8: mul-float
 a9: div-float
 aa: rem-float
 ab: add-double
 ac: sub-double
 ad: mul-double
 ae: div-double
 af: rem-double
 b0..cf 12x  binop/2addr vA, vB
 b0: add-int/2addr
 b1: sub-int/2addr
 b2: mul-int/2addr
 b3: div-int/2addr
 b4: rem-int/2addr
 b5: and-int/2addr
 b6: or-int/2addr
 b7: xor-int/2addr
 b8: shl-int/2addr
 b9: shr-int/2addr
 ba: ushr-int/2addr
 bb: add-long/2addr
 bc: sub-long/2addr
 bd: mul-long/2addr
 be: div-long/2addr
 bf: rem-long/2addr
 c0: and-long/2addr
 c1: or-long/2addr
 c2: xor-long/2addr
 c3: shl-long/2addr
 c4: shr-long/2addr
 c5: ushr-long/2addr
 c6: add-float/2addr
 c7: sub-float/2addr
 c8: mul-float/2addr
 c9: div-float/2addr
 ca: rem-float/2addr
 cb: add-double/2addr
 cc: sub-double/2addr
 cd: mul-double/2addr
 ce: div-double/2addr
 cf: rem-double/2addr
 d0..d7 22s  binop/lit16 vA, vB, #+CCCC
 d0: add-int/lit16
 d1: rsub-int (reverse subtract)
 d2: mul-int/lit16
 d3: div-int/lit16
 d4: rem-int/lit16
 d5: and-int/lit16
 d6: or-int/lit16
 d7: xor-int/lit16
 d8..e2 22b  binop/lit8 vAA, vBB, #+CC
 d8: add-int/lit8
 d9: rsub-int/lit8
 da: mul-int/lit8
 db: div-int/lit8
 dc: rem-int/lit8
 dd: and-int/lit8
 de: or-int/lit8
 df: xor-int/lit8
 e0: shl-int/lit8
 e1: shr-int/lit8
 e2: ushr-int/lit8
 e3..f9 10x  (unused)
 fa 45cc invoke-polymorphic {vC, vD, vE, vF, vG}, meth@BBBB, proto@HHHH
 fb 4rcc invoke-polymorphic/range {vCCCC .. vNNNN}, meth@BBBB, proto@HHHH
 fc 35c  invoke-custom {vC, vD, vE, vF, vG}, call_site@BBBB
 fd 3rc  invoke-custom/range {vCCCC .. vNNNN}, call_site@BBBB
 fe 21c  const-method-handle vAA, method_handle@BBBB
 ff 21c  const-method-type vAA, proto@BBBB
 """

 # Describes one opcode:
 #   op:     Opcode value (0..255).
 #   size:   Instruction size in bytes.
 #   format: Instruction format ID, e.g., '21h'.
 #   name:   Mnemonic, e.g., 'const/high16'; None for unused opcodes.
 #   params: Operand template, e.g., 'vAA, #+BBBB0000'; None for unused opcodes.
 DalvikByteCode = collections.namedtuple(
     typename='DalvikByteCode', field_names='op,size,format,name,params')


 def _ParseByteCodeSpecs():
   """Parses _DALVIK_BYTECODE_SPECS into a list of DalvikByteCode.

   Each line first loses any trailing comment (text from the first ' (' onward)
   and surrounding whitespace; lines that end up empty are skipped, so the table
   may be indented or padded with blank lines. Remaining lines are one of:
     * 'b0..cf 12x  binop/2addr vA, vB': Opens an opcode range and assigns the
       format and params to every opcode in it.
     * 'b0: add-int/2addr': Names one opcode of the currently open range.
     * '15 21h  const/high16 vAA, #+BBBB0000': A standalone opcode.

   Returns:
     A list of 256 DalvikByteCode tuples indexed by opcode. |size| is in bytes,
     i.e., twice the leading digit of |format|. Unused opcodes have None for
     |name| and |params|.
   """
   format_map = [None] * 256
   name_map = [None] * 256
   params_map = [None] * 256
   op_lo = op_hi = None
   for line in _DALVIK_BYTECODE_SPECS.splitlines():
     comment_pos = line.find(' (')
     if comment_pos >= 0:
       line = line[:comment_pos]
     # Tolerate indentation, trailing spaces, and blank / comment-only lines.
     line = line.strip()
     if not line:
       continue
     assert len(line) >= 5, line
     if line[2] == ':':
       # Inside op range, e.g.: 'b0: add-int/2addr'.
       # ['b0', 'add-int/2addr'].
       toks = line.split(': ')
       assert len(toks) == 2, line
       op = int(toks[0], 16)
       assert op_lo is not None and op_lo <= op <= op_hi, line
       name_map[op] = toks[1]  # 'add-int/2addr'.
       if op == op_hi:
         # Last member seen: close the range.
         op_lo = op_hi = None
     elif line[2:4] == '..':
       # Define op range, e.g.: 'b0..cf 12x  binop/2addr vA, vB'.
       # ['b0..cf', '12x', 'binop/2addr', 'vA, vB'].
       toks = line.split(maxsplit=3)
       # (0xb0, 0xcf).
       (op_lo, op_hi) = (int(t, 16) for t in toks[0].split('..'))
       params = None  # Stays None for unused ranges.
       if len(toks) > 2:
         # Same fallback as standalone ops when the group has no operands.
         params = toks[3] if len(toks) >= 4 else ''  # 'vA, vB'.
       for op in range(op_lo, op_hi + 1):
         format_map[op] = toks[1]  # '12x'.
         params_map[op] = params
     else:
       # Standalone op, e.g.: '15 21h  const/high16 vAA, #+BBBB0000'.
       # ['15', '21h', 'const/high16', 'vAA, #+BBBB0000'].
       toks = line.split(maxsplit=3)
       op = int(toks[0], 16)
       format_map[op] = toks[1]  # '21h'.
       if len(toks) > 2:  # If not unused.
         name_map[op] = toks[2]  # 'const/high16'.
         params_map[op] = toks[3] if len(toks) >= 4 else ''

   ret = []
   for op in range(256):
     fmt = format_map[op]
     assert fmt is not None, 'Missing spec for opcode 0x%02x' % op
     size = int(fmt[0]) * 2  # '21h' -> 4.
     ret.append(DalvikByteCode(op, size, fmt, name_map[op], params_map[op]))
   return ret


 # Opcode table built once at import time: DALVIK_INSTRUCTIONS[op] is the
 # DalvikByteCode entry for opcode |op| (0x00..0xff).
 DALVIK_INSTRUCTIONS = _ParseByteCodeSpecs()


 def Split(insns):
   """Yields the raw bytes of each Dalvik instruction found in |insns|.

   No disassembly is done here: the caller receives one slice per instruction
   and decides what to filter or decode. Supplemental data referenced by 31t
   instructions {fill-array-data, packed-switch, sparse-switch} is assumed to
   sit at the end of |insns|; iteration stops at the earliest such data, so none
   of it is emitted.

   Args:
     insns: Even-length bytearray data containing valid Dalvik code.

   Yields:
     Slices of |insns|, one per instruction, in order of appearance.
   """
   code_end = len(insns)
   assert code_end % 2 == 0
   cursor = 0
   while cursor < code_end:
     spec = DALVIK_INSTRUCTIONS[insns[cursor]]
     next_cursor = cursor + spec.size
     piece = insns[cursor:next_cursor]
     if spec.format == '31t':
       # The 32-bit operand that follows the first two bytes is the distance, in
       # 2-byte units, from this instruction to its supplemental data. Code
       # cannot extend past that point, so shrink the end marker accordingly.
       (data_delta, ) = struct.unpack_from('<L', piece, 2)
       data_start = cursor + 2 * data_delta
       if data_start < code_end:
         code_end = data_start
     yield piece
     cursor = next_cursor
   # Bytes at and beyond |code_end| are supplemental data: not emitted.
	# Copyright 2022 The Chromium Authors
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.
	"""Utilities for parsing Dalvik bytecode."""

	import collections
	import struct

	# Dalvik bytecode specs, copied from the first two columns of the table in:
	# https://source.android.com/docs/core/runtime/dalvik-bytecode#instructions
	# with minor modification (truncating comments).
	#
	# Line shapes consumed by _ParseByteCodeSpecs():
	#   'OP FORMAT NAME PARAMS'       Standalone opcode (OP is two hex digits).
	#   'LO..HI FORMAT GROUP PARAMS'  Inclusive opcode range sharing FORMAT/PARAMS.
	#   'OP: NAME'                    Names one opcode inside the preceding range.
	# Everything from the first ' (' onward is dropped as a comment, so '(unused)'
	# entries end up with a format only. The leading digit of FORMAT is doubled to
	# obtain the instruction size in bytes.
	_DALVIK_BYTECODE_SPECS = """00 10x nop
	01 12x move vA, vB
	02 22x move/from16 vAA, vBBBB
	03 32x move/16 vAAAA, vBBBB
	04 12x move-wide vA, vB
	05 22x move-wide/from16 vAA, vBBBB
	06 32x move-wide/16 vAAAA, vBBBB
	07 12x move-object vA, vB
	08 22x move-object/from16 vAA, vBBBB
	09 32x move-object/16 vAAAA, vBBBB
	0a 11x move-result vAA
	0b 11x move-result-wide vAA
	0c 11x move-result-object vAA
	0d 11x move-exception vAA
	0e 10x return-void
	0f 11x return vAA
	10 11x return-wide vAA
	11 11x return-object vAA
	12 11n const/4 vA, #+B
	13 21s const/16 vAA, #+BBBB
	14 31i const vAA, #+BBBBBBBB
	15 21h const/high16 vAA, #+BBBB0000
	16 21s const-wide/16 vAA, #+BBBB
	17 31i const-wide/32 vAA, #+BBBBBBBB
	18 51l const-wide vAA, #+BBBBBBBBBBBBBBBB
	19 21h const-wide/high16 vAA, #+BBBB000000000000
	1a 21c const-string vAA, string@BBBB
	1b 31c const-string/jumbo vAA, string@BBBBBBBB
	1c 21c const-class vAA, type@BBBB
	1d 11x monitor-enter vAA
	1e 11x monitor-exit vAA
	1f 21c check-cast vAA, type@BBBB
	20 22c instance-of vA, vB, type@CCCC
	21 12x array-length vA, vB
	22 21c new-instance vAA, type@BBBB
	23 22c new-array vA, vB, type@CCCC
	24 35c filled-new-array {vC, vD, vE, vF, vG}, type@BBBB
	25 3rc filled-new-array/range {vCCCC .. vNNNN}, type@BBBB
	26 31t fill-array-data vAA, +BBBBBBBB (with supplemental data...)
	27 11x throw vAA
	28 10t goto +AA
	29 20t goto/16 +AAAA
	2a 30t goto/32 +AAAAAAAA
	2b 31t packed-switch vAA, +BBBBBBBB (with supplemental data...)
	2c 31t sparse-switch vAA, +BBBBBBBB (with supplemental data...)
	2d..31 23x cmpkind vAA, vBB, vCC
	2d: cmpl-float (lt bias)
	2e: cmpg-float (gt bias)
	2f: cmpl-double (lt bias)
	30: cmpg-double (gt bias)
	31: cmp-long
	32..37 22t if-test vA, vB, +CCCC
	32: if-eq
	33: if-ne
	34: if-lt
	35: if-ge
	36: if-gt
	37: if-le
	38..3d 21t if-testz vAA, +BBBB
	38: if-eqz
	39: if-nez
	3a: if-ltz
	3b: if-gez
	3c: if-gtz
	3d: if-lez
	3e..43 10x (unused)
	44..51 23x arrayop vAA, vBB, vCC
	44: aget
	45: aget-wide
	46: aget-object
	47: aget-boolean
	48: aget-byte
	49: aget-char
	4a: aget-short
	4b: aput
	4c: aput-wide
	4d: aput-object
	4e: aput-boolean
	4f: aput-byte
	50: aput-char
	51: aput-short
	52..5f 22c iinstanceop vA, vB, field@CCCC
	52: iget
	53: iget-wide
	54: iget-object
	55: iget-boolean
	56: iget-byte
	57: iget-char
	58: iget-short
	59: iput
	5a: iput-wide
	5b: iput-object
	5c: iput-boolean
	5d: iput-byte
	5e: iput-char
	5f: iput-short
	60..6d 21c sstaticop vAA, field@BBBB
	60: sget
	61: sget-wide
	62: sget-object
	63: sget-boolean
	64: sget-byte
	65: sget-char
	66: sget-short
	67: sput
	68: sput-wide
	69: sput-object
	6a: sput-boolean
	6b: sput-byte
	6c: sput-char
	6d: sput-short
	6e..72 35c invoke-kind {vC, vD, vE, vF, vG}, meth@BBBB
	6e: invoke-virtual
	6f: invoke-super
	70: invoke-direct
	71: invoke-static
	72: invoke-interface
	73 10x (unused)
	74..78 3rc invoke-kind/range {vCCCC .. vNNNN}, meth@BBBB
	74: invoke-virtual/range
	75: invoke-super/range
	76: invoke-direct/range
	77: invoke-static/range
	78: invoke-interface/range
	79..7a 10x (unused)
	7b..8f 12x unop vA, vB
	7b: neg-int
	7c: not-int
	7d: neg-long
	7e: not-long
	7f: neg-float
	80: neg-double
	81: int-to-long
	82: int-to-float
	83: int-to-double
	84: long-to-int
	85: long-to-float
	86: long-to-double
	87: float-to-int
	88: float-to-long
	89: float-to-double
	8a: double-to-int
	8b: double-to-long
	8c: double-to-float
	8d: int-to-byte
	8e: int-to-char
	8f: int-to-short
	90..af 23x binop vAA, vBB, vCC
	90: add-int
	91: sub-int
	92: mul-int
	93: div-int
	94: rem-int
	95: and-int
	96: or-int
	97: xor-int
	98: shl-int
	99: shr-int
	9a: ushr-int
	9b: add-long
	9c: sub-long
	9d: mul-long
	9e: div-long
	9f: rem-long
	a0: and-long
	a1: or-long
	a2: xor-long
	a3: shl-long
	a4: shr-long
	a5: ushr-long
	a6: add-float
	a7: sub-float
	a8: mul-float
	a9: div-float
	aa: rem-float
	ab: add-double
	ac: sub-double
	ad: mul-double
	ae: div-double
	af: rem-double
	b0..cf 12x binop/2addr vA, vB
	b0: add-int/2addr
	b1: sub-int/2addr
	b2: mul-int/2addr
	b3: div-int/2addr
	b4: rem-int/2addr
	b5: and-int/2addr
	b6: or-int/2addr
	b7: xor-int/2addr
	b8: shl-int/2addr
	b9: shr-int/2addr
	ba: ushr-int/2addr
	bb: add-long/2addr
	bc: sub-long/2addr
	bd: mul-long/2addr
	be: div-long/2addr
	bf: rem-long/2addr
	c0: and-long/2addr
	c1: or-long/2addr
	c2: xor-long/2addr
	c3: shl-long/2addr
	c4: shr-long/2addr
	c5: ushr-long/2addr
	c6: add-float/2addr
	c7: sub-float/2addr
	c8: mul-float/2addr
	c9: div-float/2addr
	ca: rem-float/2addr
	cb: add-double/2addr
	cc: sub-double/2addr
	cd: mul-double/2addr
	ce: div-double/2addr
	cf: rem-double/2addr
	d0..d7 22s binop/lit16 vA, vB, #+CCCC
	d0: add-int/lit16
	d1: rsub-int (reverse subtract)
	d2: mul-int/lit16
	d3: div-int/lit16
	d4: rem-int/lit16
	d5: and-int/lit16
	d6: or-int/lit16
	d7: xor-int/lit16
	d8..e2 22b binop/lit8 vAA, vBB, #+CC
	d8: add-int/lit8
	d9: rsub-int/lit8
	da: mul-int/lit8
	db: div-int/lit8
	dc: rem-int/lit8
	dd: and-int/lit8
	de: or-int/lit8
	df: xor-int/lit8
	e0: shl-int/lit8
	e1: shr-int/lit8
	e2: ushr-int/lit8
	e3..f9 10x (unused)
	fa 45cc invoke-polymorphic {vC, vD, vE, vF, vG}, meth@BBBB, proto@HHHH
	fb 4rcc invoke-polymorphic/range {vCCCC .. vNNNN}, meth@BBBB, proto@HHHH
	fc 35c invoke-custom {vC, vD, vE, vF, vG}, call_site@BBBB
	fd 3rc invoke-custom/range {vCCCC .. vNNNN}, call_site@BBBB
	fe 21c const-method-handle vAA, method_handle@BBBB
	ff 21c const-method-type vAA, proto@BBBB
	"""

	# Describes one opcode:
	#   op:     Opcode value (0..255).
	#   size:   Instruction size in bytes.
	#   format: Instruction format ID, e.g., '21h'.
	#   name:   Mnemonic, e.g., 'const/high16'; None for unused opcodes.
	#   params: Operand template, e.g., 'vAA, #+BBBB0000'; None for unused opcodes.
	DalvikByteCode = collections.namedtuple(
	    typename='DalvikByteCode', field_names='op,size,format,name,params')


	def _ParseByteCodeSpecs():
	  """Parses _DALVIK_BYTECODE_SPECS into a list of DalvikByteCode.

	  Each line first loses any trailing comment (text from the first ' (' onward)
	  and surrounding whitespace; lines that end up empty are skipped, so the
	  table may be indented or padded with blank lines. Remaining lines are one of:
	    * 'b0..cf 12x binop/2addr vA, vB': Opens an opcode range and assigns the
	      format and params to every opcode in it.
	    * 'b0: add-int/2addr': Names one opcode of the currently open range.
	    * '15 21h const/high16 vAA, #+BBBB0000': A standalone opcode.

	  Returns:
	    A list of 256 DalvikByteCode tuples indexed by opcode. |size| is in bytes,
	    i.e., twice the leading digit of |format|. Unused opcodes have None for
	    |name| and |params|.
	  """
	  format_map = [None] * 256
	  name_map = [None] * 256
	  params_map = [None] * 256
	  op_lo = op_hi = None
	  for line in _DALVIK_BYTECODE_SPECS.splitlines():
	    comment_pos = line.find(' (')
	    if comment_pos >= 0:
	      line = line[:comment_pos]
	    # Tolerate indentation, trailing spaces, and blank / comment-only lines.
	    line = line.strip()
	    if not line:
	      continue
	    assert len(line) >= 5, line
	    if line[2] == ':':
	      # Inside op range, e.g.: 'b0: add-int/2addr'.
	      # ['b0', 'add-int/2addr'].
	      toks = line.split(': ')
	      assert len(toks) == 2, line
	      op = int(toks[0], 16)
	      assert op_lo is not None and op_lo <= op <= op_hi, line
	      name_map[op] = toks[1]  # 'add-int/2addr'.
	      if op == op_hi:
	        # Last member seen: close the range.
	        op_lo = op_hi = None
	    elif line[2:4] == '..':
	      # Define op range, e.g.: 'b0..cf 12x binop/2addr vA, vB'.
	      # ['b0..cf', '12x', 'binop/2addr', 'vA, vB'].
	      toks = line.split(maxsplit=3)
	      # (0xb0, 0xcf).
	      (op_lo, op_hi) = (int(t, 16) for t in toks[0].split('..'))
	      params = None  # Stays None for unused ranges.
	      if len(toks) > 2:
	        # Same fallback as standalone ops when the group has no operands.
	        params = toks[3] if len(toks) >= 4 else ''  # 'vA, vB'.
	      for op in range(op_lo, op_hi + 1):
	        format_map[op] = toks[1]  # '12x'.
	        params_map[op] = params
	    else:
	      # Standalone op, e.g.: '15 21h const/high16 vAA, #+BBBB0000'.
	      # ['15', '21h', 'const/high16', 'vAA, #+BBBB0000'].
	      toks = line.split(maxsplit=3)
	      op = int(toks[0], 16)
	      format_map[op] = toks[1]  # '21h'.
	      if len(toks) > 2:  # If not unused.
	        name_map[op] = toks[2]  # 'const/high16'.
	        params_map[op] = toks[3] if len(toks) >= 4 else ''

	  ret = []
	  for op in range(256):
	    fmt = format_map[op]
	    assert fmt is not None, 'Missing spec for opcode 0x%02x' % op
	    size = int(fmt[0]) * 2  # '21h' -> 4.
	    ret.append(DalvikByteCode(op, size, fmt, name_map[op], params_map[op]))
	  return ret


	# Opcode table built once at import time: DALVIK_INSTRUCTIONS[op] is the
	# DalvikByteCode entry for opcode |op| (0x00..0xff).
	DALVIK_INSTRUCTIONS = _ParseByteCodeSpecs()


	def Split(insns):
	  """Splits Dalvik code into a series of instruction bytes.

	  The minimalistic approach avoids wasted work. It's up to the caller to
	  filter and/or disassemble emitted bytes. It is assumed that supplemental
	  data (from 31t instructions {fill-array-data, packed-switch, sparse-switch})
	  are found at the end of `insns`. These are detected and omitted.

	  Args:
	    insns: Even-length bytearray data containing valid Dalvik code.

	  Yields:
	    Slices of `insns`, one per instruction, in order of appearance.
	  """
	  pos_end = len(insns)
	  assert pos_end % 2 == 0
	  pos = 0
	  while pos < pos_end:
	    instr = DALVIK_INSTRUCTIONS[insns[pos]]
	    size = instr.size
	    chunk = insns[pos:pos + size]
	    # Instructions with supplemental data contain a relative offset (in 2-byte
	    # units) to where the data starts, which indicates where code ends.
	    if instr.format == '31t':
	      offset = struct.unpack_from('<L', chunk, 2)[0]
	      pos_end = min(pos_end, pos + offset * 2)
	    yield chunk
	    pos += size
	  # Do not emit supplemental data.