# Copyright 2015 WebAssembly Community Group participants
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import difflib
import math
import os
import os.path
import sys
import parallel_runner
import proc
# Set to True to disable execution via thread pool
single_threaded = False
class Result:
"""Result from a single test that was run."""
def __init__(self, test, success, output):
self.test = test
self.success = success
self.output = output
def __str__(self):
return '%s %s%s%s' % ('SUCCEEDED' if self.success else 'FAILED',
self.test, '\n' if self.output else '',
self.output.decode('utf-8'))
def __bool__(self):
return self.success
# py2 compat
__nonzero__ = __bool__
def __lt__(self, other):
"""Sort by test name so that the output files can be compared
easily."""
return self.test < other.test
def similarity(self, other):
"""Compare output similarity, returning a float in the range [0,1]."""
# Even quick_ratio is fairly slow on big inputs, so compare only the
# start of each output.
max_size = 1024
return difflib.SequenceMatcher(None, self.output[:max_size],
other.output[:max_size]).quick_ratio()
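# Rough illustration of the metric used by Result.similarity (values depend
# on the inputs; difflib's quick_ratio returns a float in [0, 1]):
#
#   difflib.SequenceMatcher(None, b'abcd', b'abce').quick_ratio()  # ~0.75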
class Tester(object):
"""Test runner."""
def __init__(self, command_ctor, outname_ctor, outdir, extras):
"""Command-line constructor accepting input and output file names."""
if outdir:
assert os.path.isdir(
outdir), 'Expected output directory %s' % outdir
self.command_ctor = command_ctor
self.outname_ctor = outname_ctor
self.outdir = outdir
self.extras = extras
@staticmethod
def setlimits():
# Set maximum CPU time to 90 seconds in child process
try:
import resource
resource.setrlimit(resource.RLIMIT_CPU, (90, 90))
except: # noqa
pass
def __call__(self, test_file):
"""Execute a single test."""
basename = os.path.basename(test_file)
if self.outdir:
outfile = self.outname_ctor(self.outdir, test_file, self.extras)
else:
outfile = ''
should_log = sys.platform != 'darwin'
try:
output = proc.check_output(
self.command_ctor(test_file, outfile, self.extras),
stderr=proc.STDOUT,
cwd=self.outdir or os.getcwd(),
# preexec_fn is not supported on Windows
preexec_fn=Tester.setlimits
if sys.platform != 'win32' else None,
should_log=should_log)
return Result(test=basename, success=True, output=output)
except proc.CalledProcessError as e:
return Result(test=basename, success=False, output=e.output)
def parse_exclude_files(fails, config_attributes):
"""Returns a sorted list of exclusions which match the attributes.
Parse the files containing tests to exclude (i.e. expected fails).
* Each line may contain a comma-separated list of attributes restricting
the test configurations which are expected to fail. (e.g. JS engine
or optimization level).
* A test is only excluded if the configuration has all the attributes
specified in the exclude line.
* Lines which have no attributes will match everything.
* Lines which specify only one attribute (e.g. engine) will match all
configurations with that attribute (e.g. both opt levels with that
engine).
For more details and an example, see test/run_known_gcc_test_failures.txt
"""
excludes = {} # maps name of excluded test to file from whence it came
config_attributes = set(config_attributes) if config_attributes else set()
def parse_line(line):
line = line.strip()
if '#' in line:
line = line[:line.index('#')].strip()
tokens = line.split()
return tokens
for excludefile in fails:
f = open(excludefile)
for line in f:
tokens = parse_line(line)
if not tokens:
continue
if len(tokens) > 1:
attributes = set(tokens[1].split(','))
if not attributes.issubset(config_attributes):
continue
test = tokens[0]
if test in excludes:
print('ERROR: duplicate exclude: [%s]' % line)
print('Files: %s and %s' % (excludes[test], excludefile))
sys.exit(1)
excludes[test] = excludefile
f.close()
return sorted(excludes.keys())
class TriangularArray:
"""Indexed with two commutable keys."""
def __init__(self):
self.arr = {}
def canonicalize(self, key):
return (min(key[0], key[1]), max(key[0], key[1]))
def __getitem__(self, key):
return self.arr[self.canonicalize(key)]
def __setitem__(self, key, value):
k = self.canonicalize(key)
# Support single insertion only; inserting the same key twice would
# indicate a bug in the caller.
assert k not in self.arr, 'Double insertion of key %s' % str(k)
self.arr[k] = value
def __iter__(self):
return iter(self.arr.items())
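# Usage sketch for TriangularArray (illustrative keys): the two keys commute,
# so both orderings refer to the same entry.
#
#   arr = TriangularArray()
#   arr[('a.c.js', 'b.c.js')] = 0.5
#   assert arr[('b.c.js', 'a.c.js')] == 0.5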
class SimilarityGroup:
"""Group of similar results."""
def __init__(self, tests, similarities):
self.tests = sorted(tests)
self.similarities = [100. * s for s in similarities]
self.average = (sum(self.similarities) /
len(self.similarities) if self.similarities else 0.)
squared_diffs = [(s - self.average)**2 for s in self.similarities]
self.stddev = (math.sqrt(sum(squared_diffs) / len(squared_diffs))
if self.similarities else 0.)
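# Worked example for SimilarityGroup (made-up numbers): similarities
# [0.9, 1.0] become percentages [90.0, 100.0], giving average 95.0 and
# (population) standard deviation 5.0.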
def similarity(results, cutoff):
"""List of lists of result test names with similar outputs."""
similarities = TriangularArray()
for x in range(0, len(results)):
for y in range(x + 1, len(results)):
rx = results[x]
ry = results[y]
similarities[(rx.test, ry.test)] = rx.similarity(ry)
# A maximum clique would be better suited to group similarities, but this
# silly traversal is simpler and seems to do the job pretty well.
similar_groups = []
worklist = set()
for k, v in similarities:
if v > cutoff:
worklist.add(k[0])
worklist.add(k[1])
for result in results:
test = result.test
if test in worklist:
worklist.remove(test)
group_tests = [test]
group_similarities = []
for other_result in results:
other_test = other_result.test
if other_test in worklist:
similar = similarities[(test, other_test)]
if similar > cutoff:
worklist.remove(other_test)
group_tests.append(other_test)
group_similarities.append(similar)
if len(group_tests) > 1:
# A test's closest matches may already have been claimed by a more
# similar group, which can leave this group with only a single entry;
# only keep groups containing more than one test.
similar_groups.append(
SimilarityGroup(tests=group_tests,
similarities=group_similarities))
assert len(worklist) == 0, 'Failed emptying worklist %s' % worklist
# Put all the ungrouped tests into their own group.
grouped = set()
for group in similar_groups:
for test in group.tests:
grouped.add(test)
uniques = list(set([r.test for r in results]) - grouped)
if uniques:
s = [similarities[(uniques[0], u)] for u in uniques[1:]]
similar_groups.append(SimilarityGroup(tests=uniques, similarities=s))
return similar_groups
def make_blocking(fileno):
try:
from fcntl import fcntl, F_GETFL, F_SETFL
flags = fcntl(fileno, F_GETFL)
if flags & os.O_NONBLOCK:
fcntl(fileno, F_SETFL, flags & ~os.O_NONBLOCK)
print('make_blocking old flags %s' % hex(flags))
except ImportError:
pass
def execute(tester, inputs, fails, exclusions=None, attributes=None):
"""Execute tests in parallel, output results, return failure count."""
if exclusions:
input_exclusions = parse_exclude_files(exclusions, None)
inputs = [
i for i in inputs if os.path.basename(i) not in input_exclusions
]
sys.stdout.write('Executing tests.\n')
if single_threaded:
results = map(tester, inputs)
else:
runner = parallel_runner.ParallelRunner()
results = runner.map(tester, inputs)
sys.stdout.flush()
sys.stdout.write('Done.\n')
results = sorted(results)
successes = [r for r in results if r]
failures = [r for r in results if not r]
# For some reason stdout/stderr are always non-blocking at this point;
# restore blocking mode before writing the results below.
make_blocking(sys.stdout.fileno())
make_blocking(sys.stderr.fileno())
sys.stdout.write('\nResults:\n')
for result in results:
sys.stdout.flush()
sys.stdout.write(str(result) + '\n\n')
if not fails:
sys.stdout.write('\n'.join([
'Ran %s tests.' % len(results),
'Got %s successes.' % len(successes),
'Got %s failures.' % len(failures)
]) + '\n')
if failures:
sys.stdout.write('Unexpected failures:\n')
for f in failures:
sys.stdout.write('\t%s\n' % f.test)
return len(failures)
input_expected_failures = parse_exclude_files(fails, attributes)
expected_failures = [
t for t in failures if t.test in input_expected_failures
]
unexpected_failures = [
t for t in failures if t.test not in input_expected_failures
]
unexpected_successes = [
t for t in successes if t.test in input_expected_failures
]
similarity_cutoff = 0.9
# Calculating similarity is pretty expensive. If too many tests are
# failing, it can take minutes, and most of them are probably failing for
# the same fundamental reason. Skip in that case.
failure_cutoff = 0.5
max_failure_count = max(1, len(inputs) * failure_cutoff)
def similar_failures(label, failures):
if len(failures) > max_failure_count:
print('Too many %s failures to show similarity' % label)
return []
return similarity(failures, similarity_cutoff)
similar_expected_failures = similar_failures('expected', expected_failures)
similar_unexpected_failures = similar_failures('unexpected',
unexpected_failures)
def show_similar_failures(label, similar, failures):
for s in similar:
tests = ' '.join(s.tests)
if s.average >= similarity_cutoff * 100.:
sys.stdout.write(
('\nSimilar %s failures, '
'average %s%% similarity with stddev %s: '
'%s\n') % (label, s.average, s.stddev, tests))
sample = [f for f in failures if f.test == s.tests[0]][0]
sys.stdout.write('Sample failure: %s\n' % sample)
else:
sys.stdout.write(
('\nUngrouped %s failures, '
'average %s%% similarity with stddev %s: '
'%s\n') % (label, s.average, s.stddev, tests))
show_similar_failures('expected', similar_expected_failures,
expected_failures)
show_similar_failures('unexpected', similar_unexpected_failures,
unexpected_failures)
if expected_failures:
sys.stdout.write('Expected failures:\n')
for f in expected_failures:
sys.stdout.write('\t%s\n' % f.test)
if unexpected_failures:
sys.stdout.write('Unexpected failures:\n')
for f in unexpected_failures:
sys.stdout.write('\t%s\n' % f.test)
if unexpected_successes:
sys.stdout.write('Unexpected successes:\n')
for f in unexpected_successes:
sys.stdout.write('\t%s\n' % f.test)
sys.stdout.write('\n'.join([
'\n',
'Ran %s tests.' % len(results),
'Got %s successes.' % len(successes),
'Got %s failures.' % len(failures),
'Expected %s failures.' % len(input_expected_failures),
'Got %s expected failures in %s similarity groups.' %
(len(expected_failures), len(similar_expected_failures)),
'Got %s unexpected failures in %s similarity groups.' %
(len(unexpected_failures), len(similar_unexpected_failures)),
'Got %s unexpected successes.' % len(unexpected_successes), '\n'
]))
return len(unexpected_failures) + len(unexpected_successes)
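# Usage sketch (hypothetical command and output-name constructors; the tool
# name and paths below are purely illustrative):
#
#   def command(infile, outfile, extras):
#       return ['mycompiler', infile, '-o', outfile]
#
#   def outname(outdir, infile, extras):
#       return os.path.join(outdir, os.path.basename(infile) + '.o')
#
#   tester = Tester(command, outname, outdir='/tmp/out', extras=None)
#   # outdir must already exist; execute() returns the failure count.
#   failed = execute(tester, inputs=['a.c', 'b.c'], fails=[])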