import glob
import os
import platform
import re
import statistics
import subprocess
import sys
import tarfile
import tempfile
import unittest

from scripts.test import shared
from . import utils


def get_build_dir():
    # wasm-opt is in the bin/ dir, and the build dir is one above it,
    # and contains bin/ and lib/.
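    # For example, /path/to/build/bin/wasm-opt yields /path/to/build.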
    return os.path.dirname(os.path.dirname(shared.WASM_OPT[0]))


# Windows is not yet supported.
@unittest.skipIf(platform.system() == 'Windows', 'Windows is not yet supported')
class ClusterFuzz(utils.BinaryenTestCase):
    @classmethod
    def setUpClass(cls):
        # Bundle up our ClusterFuzz package, and unbundle it to a directory.
        # Keep the directory alive in a class var.
        cls.temp_dir = tempfile.TemporaryDirectory()
        cls.clusterfuzz_dir = cls.temp_dir.name
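
        # BINARYEN_CLUSTER_FUZZ_BUNDLE can point at an existing bundle.tgz,
        # which skips creating a new bundle below.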
        bundle = os.environ.get('BINARYEN_CLUSTER_FUZZ_BUNDLE')
        if bundle:
            print(f'Using existing bundle: {bundle}')
        else:
            print('Making a new bundle')
            bundle = os.path.join(cls.clusterfuzz_dir, 'bundle.tgz')
            cmd = [shared.in_binaryen('scripts', 'bundle_clusterfuzz.py')]
            cmd.append(bundle)
            cmd.append(f'--build-dir={get_build_dir()}')
            shared.run_process(cmd)

        print('Unpacking bundle')
        tar = tarfile.open(bundle, "r:gz")
        tar.extractall(path=cls.clusterfuzz_dir)
        tar.close()
        print('Ready')

    # Test our bundler for ClusterFuzz.
    def test_bundle(self):
        # The bundle should contain certain files:
        # 1. run.py, the main entry point.
        self.assertTrue(os.path.exists(os.path.join(self.clusterfuzz_dir, 'run.py')))
        # 2. scripts/fuzz_shell.js, the JS testcase shell.
        self.assertTrue(os.path.exists(os.path.join(self.clusterfuzz_dir, 'scripts', 'fuzz_shell.js')))
        # 3. bin/wasm-opt, the wasm-opt binary from a static build.
        wasm_opt = os.path.join(self.clusterfuzz_dir, 'bin', 'wasm-opt')
        self.assertTrue(os.path.exists(wasm_opt))

        # See that we can execute the bundled wasm-opt. It should be able to
        # print out its version.
        out = subprocess.check_output([wasm_opt, '--version'], text=True)
        self.assertIn('wasm-opt version ', out)
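        # (We only check the prefix: the version output may contain extra
        # build information after the version number.)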

    # Generate N testcases, using run.py from a temp dir, and outputting to a
    # testcase dir.
    def generate_testcases(self, N, testcase_dir):
        proc = subprocess.run([sys.executable,
                               os.path.join(self.clusterfuzz_dir, 'run.py'),
                               f'--output_dir={testcase_dir}',
                               f'--no_of_files={N}'],
                              text=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        self.assertEqual(proc.returncode, 0)

        # We should have logged the creation of N testcases.
        self.assertEqual(proc.stdout.count('Created testcase:'), N)

        # We should have actually created them.
        for i in range(0, N + 2):
            fuzz_file = os.path.join(testcase_dir, f'fuzz-binaryen-{i}.js')
            flags_file = os.path.join(testcase_dir, f'flags-binaryen-{i}.js')
            # We actually emit the range [1, N], so 0 or N+1 should not
            # exist.
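            # (For example, with N=10 we scan i in [0, 11] but expect files
            # only for i in [1, 10].)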
            if 1 <= i <= N:
                self.assertTrue(os.path.exists(fuzz_file))
                self.assertTrue(os.path.exists(flags_file))
            else:
                self.assertFalse(os.path.exists(fuzz_file))
                self.assertFalse(os.path.exists(flags_file))

        return proc

    # Test the bundled run.py script.
    def test_run_py(self):
        temp_dir = tempfile.TemporaryDirectory()
        N = 10
        proc = self.generate_testcases(N, temp_dir.name)

        # run.py should report no errors or warnings to stderr, except for
        # those we know are safe (we cannot test this in generate_testcases,
        # because the caller could do something like set BINARYEN_PASS_DEBUG,
        # which generates intentional stderr warnings).
        SAFE_WARNINGS = [
            # When we randomly pick no passes to run, this is shown.
            'warning: no passes specified, not doing any work',
            # MemoryPacking warns on some things.
            'warning: active memory segments have overlap, which prevents some optimizations.',
        ]
        stderr = proc.stderr
        for safe in SAFE_WARNINGS:
            stderr = stderr.replace(safe, '')
        stderr = stderr.strip()
        self.assertEqual(stderr, '')

    def test_fuzz_passes(self):
        # We should see interesting passes being run in run.py. This is *NOT*
        # a deterministic test, since the number of passes run is random (we
        # just let run.py run normally, to simulate the real environment), so
        # flakes are possible here. However, we do the check in a way that
        # the statistical likelihood of a flake is insignificant.
        # Specifically, we just check that we see a different number of
        # passes run in two different invocations, which is enough to prove
        # that we are running different passes each time. And the number of
        # passes is on average over 100 here (10 testcases, and each runs
        # 0-20 passes or so).
        temp_dir = tempfile.TemporaryDirectory()
        N = 10

        # Try many times to see a different number, to make flakes even less
        # likely. In the worst case, if there were only two possible numbers
        # of passes run, each with equal probability, then the chance of all
        # 100 iterations agreeing is 2^-99: even failing once a second, we
        # could go for billions of billions of years without a flake. (And if
        # there are only two numbers with *non*-equal probability, then
        # something is very wrong, and we'd like to see errors.)
        seen_num_passes = set()
        for i in range(100):
            os.environ['BINARYEN_PASS_DEBUG'] = '1'
            try:
                proc = self.generate_testcases(N, temp_dir.name)
            finally:
                del os.environ['BINARYEN_PASS_DEBUG']

            num_passes = proc.stderr.count('running pass')
            print(f'num passes: {num_passes}')
            seen_num_passes.add(num_passes)
            if len(seen_num_passes) > 1:
                return

        raise Exception(f'We only ever saw {seen_num_passes} passes run')

    def test_file_contents(self):
        # As with test_fuzz_passes, this is nondeterministic, but
        # statistically it is almost impossible to get a flake here.
        temp_dir = tempfile.TemporaryDirectory()
        N = 100
        self.generate_testcases(N, temp_dir.name)

        # To check for interesting wasm file contents, we'll note how many
        # struct.news appear (a signal that we are emitting WasmGC, and also
        # a non-trivial number of them), the sizes of the wasm files, and the
        # exports.
        seen_struct_news = []
        seen_sizes = []
        seen_exports = []
        # Second wasm files are also emitted sometimes.
        seen_second_sizes = []

        # The number of struct.news appears in the metrics report like this:
        #
        #   StructNew    : 18
        #
        struct_news_regex = re.compile(r'StructNew\s+:\s+(\d+)')
        # The number of exports appears in the metrics report like this:
        #
        #   [exports]    : 1
        #
        exports_regex = re.compile(r'\[exports\]\s+:\s+(\d+)')

        for i in range(1, N + 1):
            fuzz_file = os.path.join(temp_dir.name, f'fuzz-binaryen-{i}.js')
            flags_file = os.path.join(temp_dir.name, f'flags-binaryen-{i}.js')

            # The flags file must contain exactly --wasm-staging.
            with open(flags_file) as f:
                self.assertEqual(f.read(), '--wasm-staging')

            # Extract the wasm file(s) from the JS. Make sure not to pick up
            # stale files from a previous iteration.
            for f in glob.glob('extracted*'):
                os.unlink(f)
            extractor = shared.in_binaryen('scripts', 'clusterfuzz', 'extract_wasms.py')
            subprocess.check_call([sys.executable, extractor, fuzz_file, 'extracted'])
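            # (The extractor names the wasm files it finds with the given
            # prefix: extracted.0.wasm, extracted.1.wasm, and so on.)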
            # One wasm file must always exist, and must be valid.
            binary_file = 'extracted.0.wasm'
            assert os.path.exists(binary_file)
            metrics = subprocess.check_output(
                shared.WASM_OPT + ['-all', '--metrics', binary_file, '-q'],
                text=True)

            # Update with what we see.
            struct_news = re.findall(struct_news_regex, metrics)
            if not struct_news:
                # No line is emitted when --metrics sees no struct.news.
                struct_news = ['0']
            # Metrics should contain one line for StructNews.
            self.assertEqual(len(struct_news), 1)
            seen_struct_news.append(int(struct_news[0]))

            seen_sizes.append(os.path.getsize(binary_file))

            exports = re.findall(exports_regex, metrics)
            # Metrics should contain one line for exports.
            self.assertEqual(len(exports), 1)
            seen_exports.append(int(exports[0]))

            # Sometimes a second wasm file should exist, and it must be valid
            # too.
            second_binary_file = 'extracted.1.wasm'
            if os.path.exists(second_binary_file):
                subprocess.check_call(
                    shared.WASM_OPT + ['-all', second_binary_file, '-q'])
                # Note its size (we leave detailed metrics for the first one;
                # they are generated by the same logic in run.py, so just
                # verifying some valid second wasms are emitted, of random
                # sizes, is enough).
                seen_second_sizes.append(os.path.getsize(second_binary_file))
        print()
        print('struct.news are distributed as ~ mean 15, stddev 24, median 10')
        # Given that, with 100 samples we are incredibly likely to see an
        # interesting number at least once. It is also incredibly unlikely
        # for the stdev to be zero.
        print(f'mean struct.news: {statistics.mean(seen_struct_news)}')
        print(f'stdev struct.news: {statistics.stdev(seen_struct_news)}')
        print(f'median struct.news: {statistics.median(seen_struct_news)}')
        self.assertGreaterEqual(max(seen_struct_news), 10)
        self.assertGreater(statistics.stdev(seen_struct_news), 0)
        print()

        print('sizes are distributed as ~ mean 2933, stddev 2011, median 2510')
        print(f'mean sizes: {statistics.mean(seen_sizes)}')
        print(f'stdev sizes: {statistics.stdev(seen_sizes)}')
        print(f'median sizes: {statistics.median(seen_sizes)}')
        self.assertGreaterEqual(max(seen_sizes), 1000)
        self.assertGreater(statistics.stdev(seen_sizes), 0)
        print()

        print('exports are distributed as ~ mean 9, stddev 6, median 8')
        print(f'mean exports: {statistics.mean(seen_exports)}')
        print(f'stdev exports: {statistics.stdev(seen_exports)}')
        print(f'median exports: {statistics.median(seen_exports)}')
        self.assertGreaterEqual(max(seen_exports), 8)
        self.assertGreater(statistics.stdev(seen_exports), 0)
        print()

        # Second files appear in ~ 1/3 of testcases.
        print('number of second wasms should be around 33 +- 8')
        print(f'number of second wasms: {len(seen_second_sizes)}')
        assert seen_second_sizes, 'must see at least one second wasm'
        print('second sizes are distributed as ~ mean 2933, stddev 2011, median 2510')
        print(f'mean sizes: {statistics.mean(seen_second_sizes)}')
        print(f'stdev sizes: {statistics.stdev(seen_second_sizes)}')
        print(f'median sizes: {statistics.median(seen_second_sizes)}')
        # Relax the assert on the max seen second size compared to the max
        # seen primary size, as we see fewer of these. 500 is still proof of
        # an interesting wasm file.
        self.assertGreaterEqual(max(seen_second_sizes), 500)
        self.assertGreater(statistics.stdev(seen_second_sizes), 0)
        print()

        # To check for interesting JS file contents, we'll note how many
        # times we build and run the wasm, and other things like JSPI.
        seen_builds = []
        seen_calls = []
        seen_second_builds = []
        seen_JSPIs = []
        seen_initial_contents = []

        # Initial contents are noted in comments like this:
        #
        #   /* using initial content 42.wasm */
        #
        # Note that we may see more than one in a file, as we may have more
        # than one wasm in each testcase: each wasm has a chance.
        initial_content_regex = re.compile(r'[/][*] using initial content ([^ ]+) [*][/]')

        # Some calls to callExports come with a random seed, so we have
        # either
        #
        #   callExports();
        #   callExports(123456);
        #
        call_exports_regex = re.compile(r'callExports[(](\d*)[)]')
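        # (Note that the capture group matches the empty string for a bare
        # callExports(), which lets us spot the unseeded call below.)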

        for i in range(1, N + 1):
            fuzz_file = os.path.join(temp_dir.name, f'fuzz-binaryen-{i}.js')
            with open(fuzz_file) as f:
                js = f.read()
            seen_builds.append(js.count('build(binary);'))
            seen_calls.append(re.findall(call_exports_regex, js))
            seen_second_builds.append(js.count('build(secondBinary);'))

            # If JSPI is enabled, the async and await keywords should be
            # enabled (uncommented).
            if 'JSPI = 1' in js:
                seen_JSPIs.append(1)
                assert '/* async */' not in js
                assert '/* await */' not in js
            else:
                seen_JSPIs.append(0)
                assert '/* async */' in js
                assert '/* await */' in js

            seen_initial_contents.append(re.findall(initial_content_regex, js))

        # There is always one build and one call (those are in the default
        # fuzz_shell.js), and we add a couple of operations, each with equal
        # probability to be a build or a call, so over the 100 testcases here
        # we have an overwhelming probability to see at least one extra build
        # and one extra call.
        print('JS builds are distributed as ~ mean 4, stddev 5, median 2')
        print(f'mean JS builds: {statistics.mean(seen_builds)}')
        print(f'stdev JS builds: {statistics.stdev(seen_builds)}')
        print(f'median JS builds: {statistics.median(seen_builds)}')
        # Assert on at least 2 builds, which means we added at least one to
        # the default build that always exists, as mentioned above.
        self.assertGreaterEqual(max(seen_builds), 2)
        self.assertGreater(statistics.stdev(seen_builds), 0)
        print()

        # Generate the counts of seen calls, for convenience. We convert
        #   [['11', '22'], [], ['99']]
        # into
        #   [2, 0, 1]
        num_seen_calls = [len(x) for x in seen_calls]
        print('Num JS calls are distributed as ~ mean 4, stddev 5, median 2')
        print(f'mean JS calls: {statistics.mean(num_seen_calls)}')
        print(f'stdev JS calls: {statistics.stdev(num_seen_calls)}')
        print(f'median JS calls: {statistics.median(num_seen_calls)}')
        self.assertGreaterEqual(max(num_seen_calls), 2)
        self.assertGreater(statistics.stdev(num_seen_calls), 0)

        # The initial callExports have no seed (that makes the first,
        # default, callExports behave deterministically, so we can compare to
        # wasm-opt --fuzz-exec etc.), and all subsequent ones must have a
        # seed.
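        # For example, a testcase might contain
        #
        #   callExports();
        #   callExports(3919523817);
        #   callExports(179261214);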
        seeds = []
        for calls in seen_calls:
            if calls:
                self.assertEqual(calls[0], '')
                for other in calls[1:]:
                    self.assertNotEqual(other, '')
                    seeds.append(int(other))
        # The seeds are random numbers in 0..2^32-1, so overlap between them
        # should be incredibly unlikely. Allow a few percent of such overlap
        # just to avoid extremely rare errors.
        num_seeds = len(seeds)
        num_unique_seeds = len(set(seeds))
        print(f'unique JS call seeds: {num_unique_seeds} (should be almost {num_seeds})')
        self.assertGreaterEqual(num_unique_seeds / num_seeds, 0.95)
        print()
        print()

        # Second wasm files are more rarely added, only 1/3 of the time or
        # so, but over 100 samples we are still overwhelmingly likely to see
        # one.
        print('JS second builds are distributed as ~ mean 1.8, stddev 2.2, median 1')
        print(f'mean JS second builds: {statistics.mean(seen_second_builds)}')
        print(f'stdev JS second builds: {statistics.stdev(seen_second_builds)}')
        print(f'median JS second builds: {statistics.median(seen_second_builds)}')
        self.assertGreaterEqual(max(seen_second_builds), 2)
        self.assertGreater(statistics.stdev(seen_second_builds), 0)
        print()

        # JSPI is done 1/4 of the time or so.
        print('JSPIs are distributed as ~ mean 0.25')
        print(f'mean JSPIs: {statistics.mean(seen_JSPIs)}')
        self.assertEqual(min(seen_JSPIs), 0)
        self.assertEqual(max(seen_JSPIs), 1)
        print()

        # Flatten the data to help some of the below, from
        #   [['a.wasm', 'b.wasm'], ['c.wasm']]
        # into
        #   ['a.wasm', 'b.wasm', 'c.wasm']
        flat_initial_contents = [item for items in seen_initial_contents for item in items]
        # Initial contents appear 50% of the time for each wasm file, and
        # each testcase has 1.333 wasm files on average.
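        # (That is, 0.5 * 1.333 = 0.67 or so initial contents per testcase,
        # consistent with the mean of 0.68 printed below.)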
        print('Initial contents are distributed as ~ mean 0.68')
        print(f'mean initial contents: {len(flat_initial_contents) / N}')
        # Initial contents should be mostly unique (we have many, many
        # testcases and we pick just 100 or so). And we must see more than
        # one unique one.
        unique_initial_contents = set(flat_initial_contents)
        print(f'unique initial contents: {len(unique_initial_contents)} should be almost equal to {len(flat_initial_contents)}')
        self.assertGreater(len(unique_initial_contents), 1)

        # Not all testcases have initial contents.
        num_initial_contents = [len(items) for items in seen_initial_contents]
        self.assertEqual(min(num_initial_contents), 0)
        # Some do (this is redundant given that the set of unique initial
        # contents was asserted on before, so this just confirms/checks that).
        self.assertGreaterEqual(max(num_initial_contents), 1)
        print()

        # Execute the files in V8. Almost all should execute properly (some
        # small number may trap during startup, say on a segment out of
        # bounds).
        if shared.V8:
            valid_executions = 0
            for i in range(1, N + 1):
                fuzz_file = os.path.join(temp_dir.name, f'fuzz-binaryen-{i}.js')
                cmd = [shared.V8, '--wasm-staging', fuzz_file]
                proc = subprocess.run(cmd, stdout=subprocess.PIPE)
                # An execution is valid if we exited without error, and if we
                # managed to run some code before exiting (modules with no
                # exports will be considered "invalid" here, but that is very
                # rare, and in a sense they are actually not useful anyhow).
                if proc.returncode == 0 and b'[fuzz-exec] calling ' in proc.stdout:
                    valid_executions += 1

            print('Valid executions are distributed as ~ mean 0.99')
            print(f'mean valid executions: {valid_executions / N}')
            # Assert on having at least half execute properly. Given the true
            # mean is 0.99, for half of 100 to fail is incredibly unlikely.
            self.assertGreater(valid_executions, N / 2)

        print()
# "zzz" in test name so that this runs last. If it runs first, it can be
# confusing as it appears next to the logging of which bundle we use (see
# setUpClass).
def test_zzz_bundle_build_dir(self):
cmd = [shared.in_binaryen('scripts', 'bundle_clusterfuzz.py')]
cmd.append('bundle.tgz')
# Test that we notice the --build-dir flag. Here we pass an invalid
# value, so we should error.
cmd.append('--build-dir=foo_bar')
failed = False
try:
subprocess.check_call(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except subprocess.CalledProcessError:
# Expected error.
failed = True
self.assertTrue(failed)
# Test with a valid --build-dir.
cmd.pop()
cmd.append(f'--build-dir={get_build_dir()}')
subprocess.check_call(cmd)