# blob: c1cc429eeb6987a3b9bfc823f144cab909e3847e
#
# Copyright 2024 WebAssembly Community Group participants
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Wasm extractor for testcases generated by the ClusterFuzz run.py script. This is
general enough to also handle Fuzzilli output.
Usage:
extract_wasms.py INFILE.js OUTFILE
That will find embedded wasm files in INFILE.js, of the form
new Uint8Array([..wasm_contents..]);
and extract them into OUTFILE.0.wasm, OUTFILE.1.wasm, etc. It also emits
OUTFILE.js which will no longer contain the embedded contents, after which the
script can be run as
d8 OUTFILE.js -- OUTFILE.0.wasm
That is, the embedded file can now be provided as a filename argument.
'''
import re
import sys
# How many wasm filenames have been handed out so far. The next file uses this
# value as its index, so numbering starts at 0.
file_counter = 0


def get_wasm_filename():
    '''Return the filename for the next extracted wasm, e.g. OUT.0.wasm.

    Each call consumes one index: the module-level counter is advanced so that
    the following call yields the next number. The prefix is the module-level
    `out`.
    '''
    global file_counter
    index = file_counter
    file_counter = index + 1
    return f'{out}.{index}.wasm'
# Command-line arguments: the JS testcase to scan, and the prefix from which
# every output filename is built (OUT.js, OUT.0.wasm, ...).
if len(sys.argv) < 3:
    # Exit with a readable usage line (non-zero status) rather than letting the
    # indexing below fail with an IndexError traceback.
    sys.exit(f'usage: {sys.argv[0]} INFILE.js OUTFILE')
in_js = sys.argv[1]
out = sys.argv[2]

# Read the whole testcase into memory; the substitution below operates on it as
# a single string.
with open(in_js) as f:
    js = f.read()
def repl(match):
    '''Substitution callback: pull a wasm binary out of a matched Uint8Array.

    `match` covers text of the form

      new Uint8Array([..binary data as numbers..])

    with the comma-separated numbers in capture group 1. When those numbers
    decode to bytes that start with the wasm magic (a zero byte followed by
    "asm"), the bytes are written to the next wasm output file and a
    placeholder expression is returned. In every other case the matched text
    is returned untouched, so that ordinary JavaScript Typed Arrays (which
    Fuzzilli output can contain) are left alone.
    '''
    original = match.group(0)
    tokens = match.group(1).split(',')

    try:
        # Base 0 lets int() accept both decimal and 0x-prefixed hex. bytes()
        # additionally rejects any value outside 0..255.
        payload = bytes([int(token, 0) for token in tokens])
    except ValueError:
        # Something in the list is not a byte value, so this is not wasm.
        return original

    # Only content that begins with the wasm magic number is extracted.
    if not payload.startswith(b'\0asm'):
        return original

    # It is wasm: store the decoded bytes in their own file.
    wasm_path = get_wasm_filename()
    with open(wasm_path, 'wb') as wasm_file:
        wasm_file.write(payload)

    # The Uint8Array expression becomes undefined + a comment.
    return 'undefined /* extracted wasm */'
# Scan the JS for every
#
#   new Uint8Array([..values..])
#
# whose values look like decimal or hex numbers, e.g. [100, 200] or
# [0x61, 0x6D, 0x6a]. repl() decides per match whether the content is wasm: if
# so it is written to its own file and the expression is replaced, otherwise
# the text is kept as it was.
array_pattern = re.compile(r'new Uint8Array\(\[([\d,x a-fA-F]+)\]\)')
js = array_pattern.sub(repl, js)

# Emit the rewritten JS under the same output prefix, as OUT.js.
with open(f'{out}.js', 'w') as js_file:
    js_file.write(js)