| #!/usr/bin/env python |
| # Copyright 2019 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Runs telemetry benchmarks on representative story tag. |
| |
| This script is a wrapper around run_performance_tests.py to capture the |
| values of performance metrics and compare them with the acceptable limits |
| in order to prevent regressions. |
| |
| Arguments used for this script are the same as run_performance_tests.py. |
| |
The name and some functionality of this script should be adjusted for
use with other benchmarks.
| """ |
| |
| from __future__ import print_function |
| |
import argparse
import csv
import json
import os
import sys
import time

import numpy as np

import common
import run_performance_tests
| |
# AVG_ERROR_MARGIN determines how much the measured frame_times average may
# exceed the recorded upper limit (expressed as a multiplier of that limit).
AVG_ERROR_MARGIN = 1.1
# CI stands for confidence interval. The "ci_095" values recorded in the data
# are the ranges between the upper and lower 95% CIs. CI_ERROR_MARGIN is the
# maximum acceptable ratio of the measured ci_095 to the recorded one.
# TODO(behdadb) crbug.com/1052054
CI_ERROR_MARGIN = 1.5
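# Illustrative arithmetic with hypothetical numbers: for a story whose recorded
# upper limits are avg = 20.0 and ci_095 = 4.0, the measured average fails only
# if it exceeds 20.0 * AVG_ERROR_MARGIN = 22.0, and a control story counts as
# noisy only if its measured ci_095 exceeds 4.0 * CI_ERROR_MARGIN = 6.0 (see
# compare_values() below).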
| |
| METRIC_NAME = 'frame_times' |
| |
| class ResultRecorder(object): |
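  """Records per-story pass/fail results and produces the final test output.

  Wraps the test results dictionary loaded from test_results.json (with
  'tests' and 'num_failures_by_type' entries) and updates it as stories are
  failed, re-validated or invalidated.
  """
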
| def __init__(self): |
| self.fails = 0 |
| self.tests = 0 |
| self.start_time = time.time() |
| self.output = {} |
| self.return_code = 0 |
    self._failed_stories = set()
    # _noisy_control_stories keeps track of control tests that failed because
    # of high noise values.
    self._noisy_control_stories = set()
| |
| def set_tests(self, output): |
| self.output = output |
| self.fails = output['num_failures_by_type'].get('FAIL', 0) |
| self.tests = self.fails + output['num_failures_by_type'].get('PASS', 0) |
| |
| def add_failure(self, name, benchmark, is_control=False): |
| self.output['tests'][benchmark][name]['actual'] = 'FAIL' |
| self.output['tests'][benchmark][name]['is_unexpected'] = True |
| self._failed_stories.add(name) |
| self.fails += 1 |
| if is_control: |
| self._noisy_control_stories.add(name) |
| |
| def remove_failure(self, name, benchmark, is_control=False, |
| invalidation_reason=None): |
| self.output['tests'][benchmark][name]['actual'] = 'PASS' |
| self.output['tests'][benchmark][name]['is_unexpected'] = False |
| self._failed_stories.remove(name) |
| self.fails -= 1 |
| if is_control: |
| self._noisy_control_stories.remove(name) |
| if invalidation_reason: |
| self.add_invalidation_reason(name, benchmark, invalidation_reason) |
| |
| def invalidate_failures(self, benchmark): |
    # Invalidates the recorded failures when the control test is noisy.
    for story in self._failed_stories.copy():
      print(story + ' [Invalidated Failure]: The story failed but was ' +
            'invalidated as a result of a noisy control test.')
| self.remove_failure(story, benchmark, False, 'Noisy control test') |
| |
| def add_invalidation_reason(self, name, benchmark, reason): |
| self.output['tests'][benchmark][name]['invalidation_reason'] = reason |
| |
| @property |
| def failed_stories(self): |
| return self._failed_stories |
| |
| @property |
| def is_control_stories_noisy(self): |
| return len(self._noisy_control_stories) > 0 |
| |
| def get_output(self, return_code): |
| self.output['seconds_since_epoch'] = time.time() - self.start_time |
| self.output['num_failures_by_type']['PASS'] = self.tests - self.fails |
| self.output['num_failures_by_type']['FAIL'] = self.fails |
| if return_code == 1: |
| self.output['interrupted'] = True |
| |
| plural = lambda n, s, p: '%d %s' % (n, p if n != 1 else s) |
| tests = lambda n: plural(n, 'test', 'tests') |
| |
| print('[ PASSED ] ' + tests(self.tests - self.fails) + '.') |
| if self.fails > 0: |
| print('[ FAILED ] ' + tests(self.fails) + '.') |
| self.return_code = 1 |
| |
| return (self.output, self.return_code) |
| |
| class RenderingRepresentativePerfTest(object): |
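  """Runs the representative rendering stories and gates them on upper limits.

  Wraps run_performance_tests.py: runs the stories selected by the platform's
  representative story tag, compares the measured frame_times against the
  recorded upper limits, and re-runs failed stories once to filter out false
  positives.
  """
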
| def __init__(self, initialization_for_tests=False): |
| self.return_code = 0 |
    # One ResultRecorder for the initial run (False) and one for the rerun
    # (True) of failed stories.
| self.result_recorder = { |
| True: ResultRecorder(), |
| False: ResultRecorder() |
| } |
| |
    if initialization_for_tests:
| return |
| |
| self.options = parse_arguments() |
    print(self.options)
| |
| self.benchmark = self.options.benchmarks |
| out_dir_path = os.path.dirname(self.options.isolated_script_test_output) |
| re_run_output_dir = os.path.join(out_dir_path, 're_run_failures') |
| |
| self.output_path = { |
| True: os.path.join( |
| re_run_output_dir, self.benchmark, 'test_results.json'), |
| False: os.path.join(out_dir_path, self.benchmark, 'test_results.json') |
| } |
| self.results_path = { |
| True: os.path.join( |
| re_run_output_dir, self.benchmark, 'perf_results.csv'), |
| False: os.path.join(out_dir_path, self.benchmark, 'perf_results.csv') |
| } |
| |
| re_run_test_output = os.path.join(re_run_output_dir, |
| os.path.basename(self.options.isolated_script_test_output)) |
| |
| self.set_platform_specific_attributes() |
| |
    # The values used as the upper limit are the 99th percentile of the
    # avg and ci_095 frame_times recorded by the dashboard over the past 200
    # revisions. If the value measured here exceeds the upper limit by at
    # least 10 percent (AVG_ERROR_MARGIN) of that limit, the story is
    # considered a failure. crbug.com/953895
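    # Illustrative shape of the loaded upper limit data for one platform (the
    # field names come from how the values are read in compare_values(); the
    # numbers are made up):
    #   {
    #     "some_story": {"avg": 20.0, "ci_095": 4.0,
    #                    "cpu_wall_time_ratio": 0.5, "control": true},
    #     ...
    #   }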
| with open( |
| os.path.join(os.path.dirname(__file__), |
| 'representative_perf_test_data', |
| 'representatives_frame_times_upper_limit.json') |
| ) as bound_data: |
| self.upper_limit_data = json.load(bound_data)[self.platform] |
| |
| self.args = list(sys.argv) |
    # The first run uses all stories in the representative story tag, but the
    # rerun uses only the failed stories.
| self.args.extend(['--story-tag-filter', self.story_tag]) |
| |
| self.re_run_args = replace_arg_values(list(sys.argv), [ |
| ('--isolated-script-test-output', re_run_test_output)]) |
| |
| def parse_csv_results(self, csv_obj): |
| """ Parses the raw CSV data |
| Convers the csv_obj into an array of valid values for averages and |
| confidence intervals based on the described upper_limits. |
| |
| Args: |
| csv_obj: An array of rows (dict) describing the CSV results |
| |
| Raturns: |
| A dictionary which has the stories as keys and an array of confidence |
| intervals and valid averages as data. |
| """ |
| values_per_story = {} |
| for row in csv_obj: |
      # For now only frame_times is used for testing representatives'
      # performance, and cpu_wall_time_ratio is used for validation.
| if row['name'] != METRIC_NAME and row['name'] != 'cpu_wall_time_ratio': |
| continue |
| story_name = row['stories'] |
| if (story_name not in self.upper_limit_data): |
| continue |
| if story_name not in values_per_story: |
| values_per_story[story_name] = { |
| 'averages': [], |
| 'ci_095': [], |
| 'cpu_wall_time_ratio': [] |
| } |
| |
| if row['name'] == METRIC_NAME and row['avg'] != '' and row['count'] != 0: |
| values_per_story[story_name]['ci_095'].append(float(row['ci_095'])) |
| values_per_story[story_name]['averages'].append(float(row['avg'])) |
| elif row['name'] == 'cpu_wall_time_ratio' and row['avg'] != '': |
| values_per_story[story_name]['cpu_wall_time_ratio'].append( |
| float(row['avg'])) |
| |
| return values_per_story |
| |
| def compare_values(self, values_per_story, rerun=False): |
| """ Parses the raw CSV data |
| Compares the values in values_per_story with the upper_limit_data and |
| determines if the story passes or fails and updates the ResultRecorder. |
| |
| Args: |
| values_per_story: An array of rows (dict) descriving the CSV results |
| rerun: Is this a rerun or initial run |
| """ |
| for story_name in values_per_story: |
      # Experimental stories are not considered for failing the tests.
| if (self.is_experimental_story(story_name)): |
| continue |
| if len(values_per_story[story_name]['ci_095']) == 0: |
| print(('[ FAILED ] {}/{} has no valid values for {}. Check ' + |
| 'run_benchmark logs for more information.').format( |
| self.benchmark, story_name, METRIC_NAME)) |
| self.result_recorder[rerun].add_failure(story_name, self.benchmark) |
| continue |
| |
| upper_limits = self.upper_limit_data |
| upper_limit_avg = upper_limits[story_name]['avg'] |
| upper_limit_ci = upper_limits[story_name]['ci_095'] |
| lower_limit_cpu_ratio = upper_limits[story_name]['cpu_wall_time_ratio'] |
| measured_avg = np.mean(np.array(values_per_story[story_name]['averages'])) |
| measured_ci = np.mean(np.array(values_per_story[story_name]['ci_095'])) |
| measured_cpu_ratio = np.mean(np.array( |
| values_per_story[story_name]['cpu_wall_time_ratio'])) |
| |
| if (measured_ci > upper_limit_ci * CI_ERROR_MARGIN and |
| self.is_control_story(story_name)): |
| print(('[ FAILED ] {}/{} {} has higher noise ({:.3f}) ' + |
| 'compared to upper limit ({:.3f})').format( |
| self.benchmark, story_name, METRIC_NAME, measured_ci, |
| upper_limit_ci)) |
| self.result_recorder[rerun].add_failure( |
| story_name, self.benchmark, True) |
| elif (measured_avg > upper_limit_avg * AVG_ERROR_MARGIN): |
| if (measured_cpu_ratio >= lower_limit_cpu_ratio): |
| print(('[ FAILED ] {}/{} higher average {}({:.3f}) compared' + |
| ' to upper limit ({:.3f})').format(self.benchmark, story_name, |
| METRIC_NAME, measured_avg, upper_limit_avg)) |
| self.result_recorder[rerun].add_failure(story_name, self.benchmark) |
| else: |
| print(('[ OK ] {}/{} higher average {}({:.3f}) compared ' + |
| 'to upper limit({:.3f}). Invalidated for low cpu_wall_time_ratio' |
| ).format(self.benchmark, story_name, METRIC_NAME, measured_avg, |
| upper_limit_avg)) |
| self.result_recorder[rerun].add_invalidation_reason( |
| story_name, self.benchmark, 'Low cpu_wall_time_ratio') |
| else: |
| print(('[ OK ] {}/{} lower average {}({:.3f}) compared ' + |
| 'to upper limit({:.3f}).').format(self.benchmark, story_name, |
| METRIC_NAME, measured_avg, upper_limit_avg)) |
| |
| def interpret_run_benchmark_results(self, rerun=False): |
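    """Loads the benchmark output for a run and records pass/fail per story.

    Reads test_results.json and perf_results.csv for the given run, feeds the
    parsed CSV values into compare_values() and, on the initial run, clears
    test_results.json so that the gated results can be written back later.
    """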
| with open(self.output_path[rerun], 'r+') as resultsFile: |
| initialOut = json.load(resultsFile) |
| self.result_recorder[rerun].set_tests(initialOut) |
| |
| with open(self.results_path[rerun]) as csv_file: |
| csv_obj = csv.DictReader(csv_file) |
| values_per_story = self.parse_csv_results(csv_obj) |
| |
| if not rerun: |
        # Clear the run_benchmark results; the gated perf results are written
        # back to this file later.
| resultsFile.seek(0) |
| resultsFile.truncate(0) |
| |
| self.compare_values(values_per_story, rerun) |
| |
| def run_perf_tests(self): |
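    """Runs the benchmark, re-runs any failed stories and writes the output.

    Returns the accumulated return code of the benchmark runs and the gating
    step.
    """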
| self.return_code |= run_performance_tests.main(self.args) |
| self.interpret_run_benchmark_results(False) |
| |
| if len(self.result_recorder[False].failed_stories) > 0: |
      # For failed stories we run the tests again to make sure the failures
      # are not false positives.
| print('============ Re_run the failed tests ============') |
      all_failed_stories = '(' + '|'.join(
          self.result_recorder[False].failed_stories) + ')'
| # TODO(crbug.com/1055893): Remove the extra chrome categories after |
| # investigation of flakes in representative perf tests. |
| self.re_run_args.extend( |
| ['--story-filter', all_failed_stories, '--pageset-repeat=3', |
| '--extra-chrome-categories=blink,blink_gc,gpu,v8,viz']) |
| self.return_code |= run_performance_tests.main(self.re_run_args) |
| self.interpret_run_benchmark_results(True) |
| |
| for story_name in self.result_recorder[False].failed_stories.copy(): |
| if story_name not in self.result_recorder[True].failed_stories: |
| self.result_recorder[False].remove_failure(story_name, |
| self.benchmark, self.is_control_story(story_name)) |
| |
| if self.result_recorder[False].is_control_stories_noisy: |
      # In this case all failures are reported as expected, and the number of
      # failed stories in output.json will be zero.
| self.result_recorder[False].invalidate_failures(self.benchmark) |
| |
| ( |
| finalOut, |
| self.return_code |
| ) = self.result_recorder[False].get_output(self.return_code) |
| |
| with open(self.output_path[False], 'r+') as resultsFile: |
| json.dump(finalOut, resultsFile, indent=4) |
| with open(self.options.isolated_script_test_output, 'w') as outputFile: |
| json.dump(finalOut, outputFile, indent=4) |
| |
| if self.result_recorder[False].is_control_stories_noisy: |
| assert self.return_code == 0 |
| print('Control story has high noise. These runs are not reliable!') |
| |
| return self.return_code |
| |
| def is_control_story(self, story_name): |
    # A story tagged as a control story in upper_limit_data is used to
    # identify possible flakiness and to invalidate the results.
| return self.story_has_attribute_enabled(story_name, 'control') |
| |
| def is_experimental_story(self, story_name): |
    # A story tagged as experimental in upper_limit_data is used to gather
    # performance results, but the test is not failed as a result of its
    # regressions.
| return self.story_has_attribute_enabled(story_name, 'experimental') |
| |
| def story_has_attribute_enabled(self, story_name, attribute): |
| return (attribute in self.upper_limit_data[story_name] and |
| self.upper_limit_data[story_name][attribute] == True) |
| |
| |
| def set_platform_specific_attributes(self): |
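    """Sets self.platform and self.story_tag based on the benchmark name.

    Sets self.return_code to 1 when the benchmark or platform combination is
    not supported.
    """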
| if self.benchmark == 'rendering.desktop': |
      # Linux does not have its own specific representatives
      # and uses the representatives chosen for Windows.
| if sys.platform == 'win32' or sys.platform.startswith('linux'): |
| self.platform = 'win' |
| self.story_tag = 'representative_win_desktop' |
| elif sys.platform == 'darwin': |
| self.platform = 'mac' |
| self.story_tag = 'representative_mac_desktop' |
| else: |
| self.return_code = 1 |
| elif self.benchmark == 'rendering.mobile': |
| self.platform = 'android' |
| self.story_tag = 'representative_mobile' |
| else: |
| self.return_code = 1 |
| |
| def replace_arg_values(args, key_value_pairs): |
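  """Replaces the values of the given flags in an argument list.

  Handles both the '--flag=value' and the '--flag value' forms. Illustrative
  example with a hypothetical flag name:
    replace_arg_values(['--out', 'a.json'], [('--out', 'b.json')])
    returns ['--out', 'b.json'].
  """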
  for index in range(len(args)):
| for (key, value) in key_value_pairs: |
| if args[index].startswith(key): |
| if '=' in args[index]: |
| args[index] = key + '=' + value |
| else: |
| args[index+1] = value |
| return args |
| |
| def main(): |
| test_runner = RenderingRepresentativePerfTest() |
| if test_runner.return_code == 1: |
| return 1 |
| |
| return test_runner.run_perf_tests() |
| |
| def parse_arguments(): |
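  """Parses only the flags this wrapper needs; other flags pass through.

  The full original argument list (sys.argv) is still forwarded to
  run_performance_tests.py unchanged.
  """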
| parser = argparse.ArgumentParser() |
| parser.add_argument('executable', help='The name of the executable to run.') |
| parser.add_argument( |
| '--benchmarks', required=True) |
| parser.add_argument( |
| '--isolated-script-test-output', required=True) |
| parser.add_argument( |
| '--isolated-script-test-perf-output', required=False) |
| return parser.parse_known_args()[0] |
| |
| def main_compile_targets(args): |
| json.dump([], args.output) |
| |
| if __name__ == '__main__': |
| # Conform minimally to the protocol defined by ScriptTest. |
| if 'compile_targets' in sys.argv: |
| funcs = { |
| 'run': None, |
| 'compile_targets': main_compile_targets, |
| } |
| sys.exit(common.run_script(sys.argv[1:], funcs)) |
| sys.exit(main()) |