#!/usr/bin/env python
# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Runs telemetry benchmarks on representative story tag.
This script is a wrapper around run_performance_tests.py to capture the
values of performance metrics and compare them with the acceptable limits
in order to prevent regressions.
Arguments used for this script are the same as run_performance_tests.py.
The name and some functionalities of this script should be adjusted for
use with other benchmarks.
"""
from __future__ import print_function
import argparse
import csv
import json
import numpy as np
import os
import sys
import time
import common
import run_performance_tests
# AVG_ERROR_MARGIN determines how much higher the measured frame_times
# average may be than the recorded upper limit (as a multiplier of that
# limit) before the story is considered a failure.
AVG_ERROR_MARGIN = 1.1
# CI stands for confidence interval. The "ci_095" values recorded in the
# data are the ranges between the upper and lower 95% confidence bounds.
# CI_ERROR_MARGIN is the maximum acceptable ratio of the measured ci_095 to
# the recorded one.
# TODO(behdadb) crbug.com/1052054
CI_ERROR_MARGIN = 1.5
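# For example (illustrative numbers): with a recorded upper limit avg of
# 20.0 ms, a measured average above 20.0 * 1.1 = 22.0 ms counts as a
# regression, and with a recorded ci_095 of 4.0 ms, a measured ci_095 above
# 4.0 * 1.5 = 6.0 ms marks the story as too noisy.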
METRIC_NAME = 'frame_times'
class ResultRecorder(object):
def __init__(self):
self.fails = 0
self.tests = 0
self.start_time = time.time()
self.output = {}
self.return_code = 0
self._failed_stories = set()
# _noisy_control_stories tracks control stories that failed because of
# high noise values.
self._noisy_control_stories = set()
def set_tests(self, output):
self.output = output
self.fails = output['num_failures_by_type'].get('FAIL', 0)
self.tests = self.fails + output['num_failures_by_type'].get('PASS', 0)
def add_failure(self, name, benchmark, is_control=False):
self.output['tests'][benchmark][name]['actual'] = 'FAIL'
self.output['tests'][benchmark][name]['is_unexpected'] = True
self._failed_stories.add(name)
self.fails += 1
if is_control:
self._noisy_control_stories.add(name)
def remove_failure(self, name, benchmark, is_control=False,
invalidation_reason=None):
self.output['tests'][benchmark][name]['actual'] = 'PASS'
self.output['tests'][benchmark][name]['is_unexpected'] = False
self._failed_stories.remove(name)
self.fails -= 1
if is_control:
self._noisy_control_stories.remove(name)
if invalidation_reason:
self.add_invalidation_reason(name, benchmark, invalidation_reason)
def invalidate_failures(self, benchmark):
# Invalidates all recorded failures; used when the control test is noisy.
for story in self._failed_stories.copy():
print(story + ' [Invalidated Failure]: The story failed but was ' +
'invalidated as a result of noisy control test.')
self.remove_failure(story, benchmark, False, 'Noisy control test')
def add_invalidation_reason(self, name, benchmark, reason):
self.output['tests'][benchmark][name]['invalidation_reason'] = reason
@property
def failed_stories(self):
return self._failed_stories
@property
def is_control_stories_noisy(self):
return len(self._noisy_control_stories) > 0
def get_output(self, return_code):
self.output['seconds_since_epoch'] = time.time() - self.start_time
self.output['num_failures_by_type']['PASS'] = self.tests - self.fails
self.output['num_failures_by_type']['FAIL'] = self.fails
if return_code == 1:
self.output['interrupted'] = True
plural = lambda n, s, p: '%d %s' % (n, p if n != 1 else s)
tests = lambda n: plural(n, 'test', 'tests')
print('[ PASSED ] ' + tests(self.tests - self.fails) + '.')
if self.fails > 0:
print('[ FAILED ] ' + tests(self.fails) + '.')
self.return_code = 1
return (self.output, self.return_code)
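# A minimal usage sketch of ResultRecorder (the file name, story, and
# benchmark below are hypothetical, and the story is assumed to be present
# in the loaded results):
#
#   recorder = ResultRecorder()
#   with open('test_results.json') as results_file:
#     recorder.set_tests(json.load(results_file))
#   recorder.add_failure('story_name', 'rendering.desktop')
#   output, return_code = recorder.get_output(0)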
class RenderingRepresentativePerfTest(object):
def __init__(self, initialization_for_tests=False):
self.return_code = 0
# ResultRecorders keyed by rerun: True for the re-run of failed stories,
# False for the initial run.
self.result_recorder = {
True: ResultRecorder(),
False: ResultRecorder()
}
if initialization_for_tests is True:
return
self.options = parse_arguments()
print(self.options)
self.benchmark = self.options.benchmarks
out_dir_path = os.path.dirname(self.options.isolated_script_test_output)
re_run_output_dir = os.path.join(out_dir_path, 're_run_failures')
self.output_path = {
True: os.path.join(
re_run_output_dir, self.benchmark, 'test_results.json'),
False: os.path.join(out_dir_path, self.benchmark, 'test_results.json')
}
self.results_path = {
True: os.path.join(
re_run_output_dir, self.benchmark, 'perf_results.csv'),
False: os.path.join(out_dir_path, self.benchmark, 'perf_results.csv')
}
re_run_test_output = os.path.join(re_run_output_dir,
os.path.basename(self.options.isolated_script_test_output))
self.set_platform_specific_attributes()
# The values used as upper limits are the 99th percentile of the avg and
# ci_095 frame_times recorded by the dashboard over the past 200 revisions.
# If the value measured here exceeds the upper limit by at least 10 percent
# of that limit [AVG_ERROR_MARGIN], it is considered a failure.
# crbug.com/953895
with open(
os.path.join(os.path.dirname(__file__),
'representative_perf_test_data',
'representatives_frame_times_upper_limit.json')
) as bound_data:
self.upper_limit_data = json.load(bound_data)[self.platform]
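# Illustrative shape of upper_limit_data, inferred from how it is used
# below (the story name and numbers are hypothetical):
#   {'story_name': {'avg': 20.0, 'ci_095': 4.0,
#                   'cpu_wall_time_ratio': 0.4, 'control': True}}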
self.args = list(sys.argv)
# The first run uses all stories with the representative story tag; the
# re-run uses only the failed stories.
self.args.extend(['--story-tag-filter', self.story_tag])
self.re_run_args = replace_arg_values(list(sys.argv), [
('--isolated-script-test-output', re_run_test_output)])
def parse_csv_results(self, csv_obj):
""" Parses the raw CSV data
Convers the csv_obj into an array of valid values for averages and
confidence intervals based on the described upper_limits.
Args:
csv_obj: An array of rows (dict) describing the CSV results
Raturns:
A dictionary which has the stories as keys and an array of confidence
intervals and valid averages as data.
"""
values_per_story = {}
for row in csv_obj:
# For now, only frame_times is used to test the representatives'
# performance; cpu_wall_time_ratio is used for validation.
if row['name'] != METRIC_NAME and row['name'] != 'cpu_wall_time_ratio':
continue
story_name = row['stories']
if (story_name not in self.upper_limit_data):
continue
if story_name not in values_per_story:
values_per_story[story_name] = {
'averages': [],
'ci_095': [],
'cpu_wall_time_ratio': []
}
if row['name'] == METRIC_NAME and row['avg'] != '' and row['count'] != 0:
values_per_story[story_name]['ci_095'].append(float(row['ci_095']))
values_per_story[story_name]['averages'].append(float(row['avg']))
elif row['name'] == 'cpu_wall_time_ratio' and row['avg'] != '':
values_per_story[story_name]['cpu_wall_time_ratio'].append(
float(row['avg']))
return values_per_story
def compare_values(self, values_per_story, rerun=False):
""" Parses the raw CSV data
Compares the values in values_per_story with the upper_limit_data and
determines if the story passes or fails and updates the ResultRecorder.
Args:
values_per_story: An array of rows (dict) descriving the CSV results
rerun: Is this a rerun or initial run
"""
for story_name in values_per_story:
# Experimental stories are measured but never cause the test to fail.
if (self.is_experimental_story(story_name)):
continue
if len(values_per_story[story_name]['ci_095']) == 0:
print(('[ FAILED ] {}/{} has no valid values for {}. Check ' +
'run_benchmark logs for more information.').format(
self.benchmark, story_name, METRIC_NAME))
self.result_recorder[rerun].add_failure(story_name, self.benchmark)
continue
upper_limits = self.upper_limit_data
upper_limit_avg = upper_limits[story_name]['avg']
upper_limit_ci = upper_limits[story_name]['ci_095']
lower_limit_cpu_ratio = upper_limits[story_name]['cpu_wall_time_ratio']
measured_avg = np.mean(np.array(values_per_story[story_name]['averages']))
measured_ci = np.mean(np.array(values_per_story[story_name]['ci_095']))
measured_cpu_ratio = np.mean(np.array(
values_per_story[story_name]['cpu_wall_time_ratio']))
if (measured_ci > upper_limit_ci * CI_ERROR_MARGIN and
self.is_control_story(story_name)):
print(('[ FAILED ] {}/{} {} has higher noise ({:.3f}) ' +
'compared to upper limit ({:.3f})').format(
self.benchmark, story_name, METRIC_NAME, measured_ci,
upper_limit_ci))
self.result_recorder[rerun].add_failure(
story_name, self.benchmark, True)
elif (measured_avg > upper_limit_avg * AVG_ERROR_MARGIN):
if (measured_cpu_ratio >= lower_limit_cpu_ratio):
print(('[ FAILED ] {}/{} higher average {}({:.3f}) compared' +
' to upper limit ({:.3f})').format(self.benchmark, story_name,
METRIC_NAME, measured_avg, upper_limit_avg))
self.result_recorder[rerun].add_failure(story_name, self.benchmark)
else:
print(('[ OK ] {}/{} higher average {}({:.3f}) compared ' +
'to upper limit({:.3f}). Invalidated for low cpu_wall_time_ratio'
).format(self.benchmark, story_name, METRIC_NAME, measured_avg,
upper_limit_avg))
self.result_recorder[rerun].add_invalidation_reason(
story_name, self.benchmark, 'Low cpu_wall_time_ratio')
else:
print(('[ OK ] {}/{} lower average {}({:.3f}) compared ' +
'to upper limit({:.3f}).').format(self.benchmark, story_name,
METRIC_NAME, measured_avg, upper_limit_avg))
def interpret_run_benchmark_results(self, rerun=False):
with open(self.output_path[rerun], 'r+') as resultsFile:
initialOut = json.load(resultsFile)
self.result_recorder[rerun].set_tests(initialOut)
with open(self.results_path[rerun]) as csv_file:
csv_obj = csv.DictReader(csv_file)
values_per_story = self.parse_csv_results(csv_obj)
if not rerun:
# Clear the results of run_benchmark and write the gated perf results.
resultsFile.seek(0)
resultsFile.truncate(0)
self.compare_values(values_per_story, rerun)
def run_perf_tests(self):
self.return_code |= run_performance_tests.main(self.args)
self.interpret_run_benchmark_results(False)
if len(self.result_recorder[False].failed_stories) > 0:
# Failed stories are re-run to make sure the failures are not false
# positives.
print('============ Re-run the failed tests ============')
all_failed_stories = '('+'|'.join(
self.result_recorder[False].failed_stories)+')'
# TODO(crbug.com/1055893): Remove the extra chrome categories after
# investigation of flakes in representative perf tests.
self.re_run_args.extend(
['--story-filter', all_failed_stories, '--pageset-repeat=3',
'--extra-chrome-categories=blink,blink_gc,gpu,v8,viz'])
self.return_code |= run_performance_tests.main(self.re_run_args)
self.interpret_run_benchmark_results(True)
for story_name in self.result_recorder[False].failed_stories.copy():
if story_name not in self.result_recorder[True].failed_stories:
self.result_recorder[False].remove_failure(story_name,
self.benchmark, self.is_control_story(story_name))
if self.result_recorder[False].is_control_stories_noisy:
# In this case all failures are reported as expected, and the number of
# failed stories in output.json will be zero.
self.result_recorder[False].invalidate_failures(self.benchmark)
(
finalOut,
self.return_code
) = self.result_recorder[False].get_output(self.return_code)
with open(self.output_path[False], 'r+') as resultsFile:
json.dump(finalOut, resultsFile, indent=4)
with open(self.options.isolated_script_test_output, 'w') as outputFile:
json.dump(finalOut, outputFile, indent=4)
if self.result_recorder[False].is_control_stories_noisy:
assert self.return_code == 0
print('Control story has high noise. These runs are not reliable!')
return self.return_code
def is_control_story(self, story_name):
# A story tagged as a control story in upper_limit_data is used to detect
# possibly flaky runs and to invalidate the results when it is too noisy.
return self.story_has_attribute_enabled(story_name, 'control')
def is_experimental_story(self, story_name):
# A story tagged as experimental in upper_limit_data is used to gather
# performance results, but the test is never failed because of it.
return self.story_has_attribute_enabled(story_name, 'experimental')
def story_has_attribute_enabled(self, story_name, attribute):
return (attribute in self.upper_limit_data[story_name] and
self.upper_limit_data[story_name][attribute] == True)
def set_platform_specific_attributes(self):
if self.benchmark == 'rendering.desktop':
# Linux does not have its own specific representatives and uses the
# representatives chosen for Windows.
if sys.platform == 'win32' or sys.platform.startswith('linux'):
self.platform = 'win'
self.story_tag = 'representative_win_desktop'
elif sys.platform == 'darwin':
self.platform = 'mac'
self.story_tag = 'representative_mac_desktop'
else:
self.return_code = 1
elif self.benchmark == 'rendering.mobile':
self.platform = 'android'
self.story_tag = 'representative_mobile'
else:
self.return_code = 1
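# A minimal illustration of replace_arg_values (values are hypothetical):
#   replace_arg_values(['--isolated-script-test-output=/tmp/a.json'],
#                      [('--isolated-script-test-output', '/tmp/b.json')])
#   returns ['--isolated-script-test-output=/tmp/b.json'].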
def replace_arg_values(args, key_value_pairs):
for index in range(0, len(args)):
for (key, value) in key_value_pairs:
if args[index].startswith(key):
if '=' in args[index]:
args[index] = key + '=' + value
else:
args[index+1] = value
return args
def main():
test_runner = RenderingRepresentativePerfTest()
if test_runner.return_code == 1:
return 1
return test_runner.run_perf_tests()
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('executable', help='The name of the executable to run.')
parser.add_argument(
'--benchmarks', required=True)
parser.add_argument(
'--isolated-script-test-output', required=True)
parser.add_argument(
'--isolated-script-test-perf-output', required=False)
return parser.parse_known_args()[0]
def main_compile_targets(args):
json.dump([], args.output)
if __name__ == '__main__':
# Conform minimally to the protocol defined by ScriptTest.
if 'compile_targets' in sys.argv:
funcs = {
'run': None,
'compile_targets': main_compile_targets,
}
sys.exit(common.run_script(sys.argv[1:], funcs))
sys.exit(main())