# Copyright 2015 WebAssembly Community Group participants
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import difflib
import math
import os
import os.path
import sys
import parallel_runner
import proc
# Set to True to disable execution via thread pool
single_threaded = False
class Result:
"""Result from a single test that was run."""
def __init__(self, test, success, output):
self.test = test
self.success = success
self.output = output
def __str__(self):
return '%s %s%s%s' % ('SUCCEEDED' if self.success else 'FAILED',
self.test, '\n' if self.output else '',
self.output.decode('utf-8'))
def __bool__(self):
return self.success
# py2 compat
__nonzero__ = __bool__
def __lt__(self, other):
"""Sort by test name so that the output files can be compared
easily."""
return self.test < other.test
def similarity(self, other):
"""Compare output similarity, returning a float in the range [0,1]."""
# Even quick_ratio is fairly slow on big inputs, so compare only the
# start of each output.
max_size = 1024
return difflib.SequenceMatcher(None, self.output[:max_size],
other.output[:max_size]).quick_ratio()
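# Rough illustration of the metric used by Result.similarity (values depend
# on the inputs; difflib's quick_ratio returns a float in [0, 1]):
#
#   difflib.SequenceMatcher(None, b'abcd', b'abce').quick_ratio()  # ~0.75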
class Tester(object):
"""Test runner."""
def __init__(self, command_ctor, outname_ctor, outdir, extras):
"""Command-line constructor accepting input and output file names."""
if outdir:
assert os.path.isdir(
outdir), 'Expected output directory %s' % outdir
self.command_ctor = command_ctor
self.outname_ctor = outname_ctor
self.outdir = outdir
self.extras = extras
@staticmethod
def setlimits():
# Set maximum CPU time to 90 seconds in child process
try:
import resource
resource.setrlimit(resource.RLIMIT_CPU, (90, 90))
except: # noqa
pass
def __call__(self, test_file):
"""Execute a single test."""
basename = os.path.basename(test_file)
if self.outdir:
outfile = self.outname_ctor(self.outdir, test_file, self.extras)
else:
outfile = ''
should_log = sys.platform != 'darwin'
try:
output = proc.check_output(
self.command_ctor(test_file, outfile, self.extras),
stderr=proc.STDOUT,
cwd=self.outdir or os.getcwd(),
# preexec_fn is not supported on Windows
preexec_fn=Tester.setlimits
if sys.platform != 'win32' else None,
should_log=should_log)
return Result(test=basename, success=True, output=output)
except proc.CalledProcessError as e:
return Result(test=basename, success=False, output=e.output)
def parse_exclude_files(fails, config_attributes):
"""Returns a sorted list of exclusions which match the attributes.
Parse the files containing tests to exclude (i.e. expected fails).
* Each line may contain a comma-separated list of attributes restricting
the test configurations which are expected to fail. (e.g. JS engine
or optimization level).
* A test is only excluded if the configuration has all the attributes
specified in the exclude line.
* Lines which have no attributes will match everything.
* Lines which specify only one attribute (e.g. engine) will match all
configurations with that attribute (e.g. both opt levels with that
engine).
For more details and an example, see test/run_known_gcc_test_failures.txt
"""
excludes = {} # maps name of excluded test to file from whence it came
config_attributes = set(config_attributes) if config_attributes else set()
def parse_line(line):
line = line.strip()
if '#' in line:
line = line[:line.index('#')].strip()
tokens = line.split()
return tokens
for excludefile in fails:
f = open(excludefile)
for line in f:
tokens = parse_line(line)
if not tokens:
continue
if len(tokens) > 1:
attributes = set(tokens[1].split(','))
if not attributes.issubset(config_attributes):
continue
test = tokens[0]
if test in excludes:
print('ERROR: duplicate exclude: [%s]' % line)
print('Files: %s and %s' % (excludes[test], excludefile))
sys.exit(1)
excludes[test] = excludefile
f.close()
return sorted(excludes.keys())
class TriangularArray:
"""Indexed with two commutable keys."""
def __init__(self):
self.arr = {}
def canonicalize(self, key):
return (min(key[0], key[1]), max(key[0], key[1]))
def __getitem__(self, key):
return self.arr[self.canonicalize(key)]
def __setitem__(self, key, value):
k = self.canonicalize(key)
# Support single insertion only; inserting the same key twice would
# indicate a bug in the caller.
assert k not in self.arr, 'Double insertion of key %s' % str(k)
self.arr[k] = value
def __iter__(self):
return iter(self.arr.items())
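# Usage sketch for TriangularArray (illustrative keys): the two keys commute,
# so both orderings refer to the same entry.
#
#   arr = TriangularArray()
#   arr[('a.c.js', 'b.c.js')] = 0.5
#   assert arr[('b.c.js', 'a.c.js')] == 0.5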
class SimilarityGroup:
"""Group of similar results."""
def __init__(self, tests, similarities):
self.tests = sorted(tests)
self.similarities = [100. * s for s in similarities]
self.average = (sum(self.similarities) /
len(self.similarities) if self.similarities else 0.)
squared_diffs = [(s - self.average)**2 for s in self.similarities]
self.stddev = (math.sqrt(sum(squared_diffs) / len(squared_diffs))
if self.similarities else 0.)
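# Worked example for SimilarityGroup (made-up numbers): similarities
# [0.9, 1.0] become percentages [90.0, 100.0], giving average 95.0 and
# (population) standard deviation 5.0.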
def similarity(results, cutoff):
"""List of lists of result test names with similar outputs."""
similarities = TriangularArray()
for x in range(0, len(results)):
for y in range(x + 1, len(results)):
rx = results[x]
ry = results[y]
similarities[(rx.test, ry.test)] = rx.similarity(ry)
# A maximum clique would be better suited to group similarities, but this
# silly traversal is simpler and seems to do the job pretty well.
similar_groups = []
worklist = set()
for k, v in similarities:
if v > cutoff:
worklist.add(k[0])
worklist.add(k[1])
for result in results:
test = result.test
if test in worklist:
worklist.remove(test)
group_tests = [test]
group_similarities = []
for other_result in results:
other_test = other_result.test
if other_test in worklist:
similar = similarities[(test, other_test)]
if similar > cutoff:
worklist.remove(other_test)
group_tests.append(other_test)
group_similarities.append(similar)
if len(group_tests) > 1:
# A test's closest matches may already have been claimed by a more
# similar group, which can leave this group with only a single entry;
# only keep groups containing more than one test.
similar_groups.append(
SimilarityGroup(tests=group_tests,
similarities=group_similarities))
assert len(worklist) == 0, 'Failed emptying worklist %s' % worklist
# Put all the ungrouped tests into their own group.
grouped = set()
for group in similar_groups:
for test in group.tests:
grouped.add(test)
uniques = list(set([r.test for r in results]) - grouped)
if uniques:
s = [similarities[(uniques[0], u)] for u in uniques[1:]]
similar_groups.append(SimilarityGroup(tests=uniques, similarities=s))
return similar_groups
def make_blocking(fileno):
try:
from fcntl import fcntl, F_GETFL, F_SETFL
flags = fcntl(fileno, F_GETFL)
if flags & os.O_NONBLOCK:
fcntl(fileno, F_SETFL, flags & ~os.O_NONBLOCK)
print('make_blocking old flags %s' % hex(flags))
except ImportError:
pass
def execute(tester, inputs, fails, exclusions=None, attributes=None):
"""Execute tests in parallel, output results, return failure count."""
if exclusions:
input_exclusions = parse_exclude_files(exclusions, None)
inputs = [
i for i in inputs if os.path.basename(i) not in input_exclusions
]
sys.stdout.write('Executing tests.\n')
if single_threaded:
results = map(tester, inputs)
else:
runner = parallel_runner.ParallelRunner()
results = runner.map(tester, inputs)
sys.stdout.flush()
sys.stdout.write('Done.\n')
results = sorted(results)
successes = [r for r in results if r]
failures = [r for r in results if not r]
# For some reason stdout/stderr are always non-blocking at this point;
# restore blocking mode before writing the results below.
make_blocking(sys.stdout.fileno())
make_blocking(sys.stderr.fileno())
sys.stdout.write('\nResults:\n')
for result in results:
sys.stdout.flush()
sys.stdout.write(str(result) + '\n\n')
if not fails:
sys.stdout.write('\n'.join([
'Ran %s tests.' % len(results),
'Got %s successes.' % len(successes),
'Got %s failures.' % len(failures)
]) + '\n')
if failures:
sys.stdout.write('Unexpected failures:\n')
for f in failures:
sys.stdout.write('\t%s\n' % f.test)
return len(failures)
input_expected_failures = parse_exclude_files(fails, attributes)
expected_failures = [
t for t in failures if t.test in input_expected_failures
]
unexpected_failures = [
t for t in failures if t.test not in input_expected_failures
]
unexpected_successes = [
t for t in successes if t.test in input_expected_failures
]
similarity_cutoff = 0.9
# Calculating similarity is pretty expensive. If too many tests are
# failing, it can take minutes, and most of them are probably failing for
# the same fundamental reason. Skip in that case.
failure_cutoff = 0.5
max_failure_count = max(1, len(inputs) * failure_cutoff)
def similar_failures(label, failures):
if len(failures) > max_failure_count:
print('Too many %s failures to show similarity' % label)
return []
return similarity(failures, similarity_cutoff)
similar_expected_failures = similar_failures('expected', expected_failures)
similar_unexpected_failures = similar_failures('unexpected',
unexpected_failures)
def show_similar_failures(label, similar, failures):
for s in similar:
tests = ' '.join(s.tests)
if s.average >= similarity_cutoff * 100.:
sys.stdout.write(
('\nSimilar %s failures, '
'average %s%% similarity with stddev %s: '
'%s\n') % (label, s.average, s.stddev, tests))
sample = [f for f in failures if f.test == s.tests[0]][0]
sys.stdout.write('Sample failure: %s\n' % sample)
else:
sys.stdout.write(
('\nUngrouped %s failures, '
'average %s%% similarity with stddev %s: '
'%s\n') % (label, s.average, s.stddev, tests))
show_similar_failures('expected', similar_expected_failures,
expected_failures)
show_similar_failures('unexpected', similar_unexpected_failures,
unexpected_failures)
if expected_failures:
sys.stdout.write('Expected failures:\n')
for f in expected_failures:
sys.stdout.write('\t%s\n' % f.test)
if unexpected_failures:
sys.stdout.write('Unexpected failures:\n')
for f in unexpected_failures:
sys.stdout.write('\t%s\n' % f.test)
if unexpected_successes:
sys.stdout.write('Unexpected successes:\n')
for f in unexpected_successes:
sys.stdout.write('\t%s\n' % f.test)
sys.stdout.write('\n'.join([
'\n',
'Ran %s tests.' % len(results),
'Got %s successes.' % len(successes),
'Got %s failures.' % len(failures),
'Expected %s failures.' % len(input_expected_failures),
'Got %s expected failures in %s similarity groups.' %
(len(expected_failures), len(similar_expected_failures)),
'Got %s unexpected failures in %s similarity groups.' %
(len(unexpected_failures), len(similar_unexpected_failures)),
'Got %s unexpected successes.' % len(unexpected_successes), '\n'
]))
return len(unexpected_failures) + len(unexpected_successes)
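# Usage sketch (hypothetical command and output-name constructors; the tool
# name and paths below are purely illustrative):
#
#   def command(infile, outfile, extras):
#       return ['mycompiler', infile, '-o', outfile]
#
#   def outname(outdir, infile, extras):
#       return os.path.join(outdir, os.path.basename(infile) + '.o')
#
#   tester = Tester(command, outname, outdir='/tmp/out', extras=None)
#   # outdir must already exist; execute() returns the failure count.
#   failed = execute(tester, inputs=['a.c', 'b.c'], fails=[])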