| #!/usr/bin/env python |
| # Copyright 2019 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Runs telemetry benchmarks on representative story tag. |
| |
| This script is a wrapper around run_performance_tests.py to capture the |
| values of performance metrics and compare them with the acceptable limits |
| in order to prevent regressions. |
| |
| Arguments used for this script are the same as run_performance_tests.py. |
| |
| The name and some functionalities of this script should be adjusted for |
| use with other benchmarks. |
| """ |
| |
| from __future__ import print_function |
| |
| import argparse |
| import csv |
| import json |
| import os |
| import sys |
| import time |
| |
| import common |
| import run_performance_tests |
| |
| BENCHMARK = 'rendering.desktop' |
# AVG_ERROR_MARGIN is the maximum acceptable increase, in milliseconds, of
# the measured average frame time over the recorded upper limit.
AVG_ERROR_MARGIN = 2.0
# CI stands for confidence interval. The "ci_095" values recorded in the
# data are the ranges between the upper and lower 95% confidence bounds.
# CI_ERROR_MARGIN is the maximum acceptable ratio of the measured ci_095
# to the recorded one.
CI_ERROR_MARGIN = 2.0
| |
| class ResultRecorder(object): |
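  """Aggregates pass/fail results in the JSON test results format.

  Tracks the pass/fail state of each story and produces the output
  dictionary that is written to isolated_script_test_output.
  """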
| def __init__(self): |
| self.fails = 0 |
| self.tests = 0 |
| self.start_time = time.time() |
| self.output = {} |
| self.return_code = 0 |
| self._failed_stories = set() |
| |
| def set_tests(self, output): |
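    """Initializes the counters from run_benchmark's test results."""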
| self.output = output |
    # Either key may be absent from num_failures_by_type, so default to 0.
    num_failures = output['num_failures_by_type']
    self.fails = num_failures.get('FAIL', 0)
    self.tests = self.fails + num_failures.get('PASS', 0)
| |
| def add_failure(self, name): |
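    """Marks the given story as failed in the output."""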
| self.output['tests'][BENCHMARK][name]['actual'] = 'FAIL' |
| self.output['tests'][BENCHMARK][name]['is_unexpected'] = True |
| self._failed_stories.add(name) |
| self.fails += 1 |
| |
| def remove_failure(self, name): |
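    """Marks the given story as passing again (e.g. after a re-run)."""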
| self.output['tests'][BENCHMARK][name]['actual'] = 'PASS' |
| self.output['tests'][BENCHMARK][name]['is_unexpected'] = False |
| self._failed_stories.remove(name) |
| self.fails -= 1 |
| |
| @property |
| def failed_stories(self): |
| return self._failed_stories |
| |
| def get_output(self, return_code): |
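    """Finalizes the recorded results and returns (output, return_code)."""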
    # In the JSON test results format, 'seconds_since_epoch' is the start
    # time of the run, not its duration.
    self.output['seconds_since_epoch'] = self.start_time
| self.output['num_failures_by_type']['PASS'] = self.tests - self.fails |
| if self.fails > 0: |
| self.output['num_failures_by_type']['FAIL'] = self.fails |
| if return_code == 1: |
| self.output['interrupted'] = True |
| |
    def tests(n):
      return '%d %s' % (n, 'test' if n == 1 else 'tests')
| |
| print('[ PASSED ] ' + tests(self.tests - self.fails) + '.') |
| if self.fails > 0: |
| print('[ FAILED ] ' + tests(self.fails) + '.') |
| self.return_code = 1 |
| |
| return (self.output, self.return_code) |
| |
| def interpret_run_benchmark_results(upper_limit_data, |
| isolated_script_test_output): |
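  """Compares the benchmark results against the recorded upper limits.

  Reads run_benchmark's perf_results.csv, checks each representative
  story's frame_times avg and ci_095 against upper_limit_data, and
  records a failure for any story that exceeds its limits. Returns the
  populated ResultRecorder.
  """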
| out_dir_path = os.path.dirname(isolated_script_test_output) |
| output_path = os.path.join(out_dir_path, BENCHMARK, 'test_results.json') |
| result_recorder = ResultRecorder() |
| |
| with open(output_path, 'r+') as resultsFile: |
| initialOut = json.load(resultsFile) |
| result_recorder.set_tests(initialOut) |
| |
| results_path = os.path.join(out_dir_path, BENCHMARK, 'perf_results.csv') |
| marked_stories = set() |
| |
| with open(results_path) as csv_file: |
| reader = csv.DictReader(csv_file) |
| for row in reader: |
      # For now, only frame_times is used to evaluate the representatives'
      # performance.
| if row['name'] != 'frame_times': |
| continue |
| story_name = row['stories'] |
| if (story_name in marked_stories or story_name not in |
| upper_limit_data): |
| continue |
| marked_stories.add(story_name) |
| |
| upper_limit_avg = upper_limit_data[story_name]['avg'] |
| upper_limit_ci = upper_limit_data[story_name]['ci_095'] |
| |
      # csv.DictReader yields every field as a string, so the 'count'
      # column is compared against '0' rather than the integer 0.
      if row['avg'] == '' or row['count'] == '0':
        print('[ FAILED ] ' + BENCHMARK + '/' + story_name +
              ' has no values for ' + row['name'] +
              '. Check run_benchmark logs for more information.')
        result_recorder.add_failure(story_name)
      elif float(row['ci_095']) > upper_limit_ci * CI_ERROR_MARGIN:
        print('[ FAILED ] ' + BENCHMARK + '/' + story_name +
              ' has noisier ' + row['name'] + ' (' + row['ci_095'] +
              ') compared to the upper limit (' + str(upper_limit_ci) + ').')
        result_recorder.add_failure(story_name)
      elif float(row['avg']) > upper_limit_avg + AVG_ERROR_MARGIN:
        print('[ FAILED ] ' + BENCHMARK + '/' + story_name +
              ' has a higher average ' + row['name'] + ' (' + row['avg'] +
              ') compared to the upper limit (' + str(upper_limit_avg) + ').')
        result_recorder.add_failure(story_name)
      else:
        print('[ OK ] ' + BENCHMARK + '/' + story_name +
              ' average ' + row['name'] + ' (' + row['avg'] +
              ') is within the upper limit (' + str(upper_limit_avg) + ').')
| |
    # Clear run_benchmark's output; main() later writes the gated results
    # to this file in its place.
| resultsFile.seek(0) |
| resultsFile.truncate(0) |
| |
| return result_recorder |
| |
| def replace_arg_values(args, key_value_pairs): |
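  """Replaces the values of the given flags in an argument list.

  Handles both the '--flag=value' and the '--flag value' forms. For
  example, replacing ('--out', 'b') in ['--out=a'] yields ['--out=b'].
  """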
| for index in range(0, len(args)): |
| for (key, value) in key_value_pairs: |
| if args[index].startswith(key): |
| if '=' in args[index]: |
| args[index] = key + '=' + value |
| else: |
| args[index+1] = value |
| return args |
| |
| def main(): |
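  """Runs the benchmark with the gated story tag, then re-runs failures."""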
| overall_return_code = 0 |
| |
  # Linux does not have its own specific representatives
  # and uses the representatives chosen for Windows.
| if sys.platform == 'win32' or sys.platform.startswith('linux'): |
| platform = 'win' |
| story_tag = 'representative_win_desktop' |
| elif sys.platform == 'darwin': |
| platform = 'mac' |
| story_tag = 'representative_mac_desktop' |
  else:
    print('This script is only supported on Windows, Linux, and Mac.')
    return 1
| |
| options = parse_arguments() |
  args = sys.argv
  # Copy the arguments before extending them, so that the re-run below
  # does not inherit the story-tag filter added to `args`.
  re_run_args = list(sys.argv)
  args.extend(['--story-tag-filter', story_tag])
| |
| overall_return_code = run_performance_tests.main(args) |
| |
  # The values used as the upper limits are the 99th percentiles of the
  # avg and ci_095 frame_times recorded by the dashboard over the past 200
  # revisions. A measured average that exceeds its upper limit by at least
  # 2 ms (AVG_ERROR_MARGIN) is considered a failure. See crbug.com/953895.
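  # The JSON maps each platform to per-story limits; this shape is inferred
  # from its use below (the numbers here are made-up examples):
  #   {"win": {"<story>": {"avg": 16.6, "ci_095": 0.5}}, "mac": {...}}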
| with open( |
| os.path.join(os.path.dirname(__file__), |
| 'representative_perf_test_data', |
| 'representatives_frame_times_upper_limit.json') |
| ) as bound_data: |
| upper_limit_data = json.load(bound_data) |
| |
| out_dir_path = os.path.dirname(options.isolated_script_test_output) |
| output_path = os.path.join(out_dir_path, BENCHMARK, 'test_results.json') |
| result_recorder = interpret_run_benchmark_results(upper_limit_data[platform], |
| options.isolated_script_test_output) |
| |
| with open(output_path, 'r+') as resultsFile: |
| if len(result_recorder.failed_stories) > 0: |
      # Re-run the failed stories to make sure the failures are not false
      # positives.
      print('============ Re-run the failed tests ============')
      all_failed_stories = '(' + '|'.join(result_recorder.failed_stories) + ')'
| re_run_args.extend( |
| ['--story-filter', all_failed_stories, '--pageset-repeat=3']) |
| |
| re_run_isolated_script_test_dir = os.path.join(out_dir_path, |
| 're_run_failures') |
| re_run_isolated_script_test_output = os.path.join( |
| re_run_isolated_script_test_dir, |
| os.path.basename(options.isolated_script_test_output)) |
| re_run_isolated_script_test_perf_output = os.path.join( |
| re_run_isolated_script_test_dir, |
| os.path.basename(options.isolated_script_test_perf_output)) |
| |
| re_run_args = replace_arg_values(re_run_args, [ |
| ('--isolated-script-test-output', re_run_isolated_script_test_output), |
| ('--isolated-script-test-perf-output', |
| re_run_isolated_script_test_perf_output) |
| ]) |
| |
| overall_return_code |= run_performance_tests.main(re_run_args) |
| re_run_result_recorder = interpret_run_benchmark_results( |
| upper_limit_data[platform], re_run_isolated_script_test_output) |
| |
| for story_name in result_recorder.failed_stories.copy(): |
| if story_name not in re_run_result_recorder.failed_stories: |
| result_recorder.remove_failure(story_name) |
| |
| ( |
| finalOut, |
| overall_return_code |
| ) = result_recorder.get_output(overall_return_code) |
| |
| json.dump(finalOut, resultsFile, indent=4) |
| |
| with open(options.isolated_script_test_output, 'w') as outputFile: |
| json.dump(finalOut, outputFile, indent=4) |
| |
| return overall_return_code |
| |
| def parse_arguments(): |
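  """Parses only the arguments this wrapper needs.

  Unknown arguments are left alone, since the full sys.argv is forwarded
  to run_performance_tests.py.
  """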
| parser = argparse.ArgumentParser() |
| parser.add_argument('executable', help='The name of the executable to run.') |
| parser.add_argument( |
| '--isolated-script-test-output', required=True) |
| parser.add_argument( |
| '--isolated-script-test-perf-output', required=False) |
| return parser.parse_known_args()[0] |
| |
| def main_compile_targets(args): |
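  """Writes the (empty) list of compile targets to args.output."""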
| json.dump([], args.output) |
| |
| if __name__ == '__main__': |
| # Conform minimally to the protocol defined by ScriptTest. |
| if 'compile_targets' in sys.argv: |
| funcs = { |
| 'run': None, |
| 'compile_targets': main_compile_targets, |
| } |
| sys.exit(common.run_script(sys.argv[1:], funcs)) |
| sys.exit(main()) |