#!/usr/bin/env python
# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Runs telemetry benchmarks on representative story tag.

This script is a wrapper around run_performance_tests to capture the
values of performance metrics and compare them with the acceptable limits
in order to prevent regressions.

Arguments used for this script are the same as the ones accepted by
run_performance_tests.

The name and some functionalities of this script should be adjusted for
use with other benchmarks.
"""
from __future__ import print_function
import argparse
import csv
import json
import numpy as np
import os
import sys
import time
import common
import run_performance_tests
# AVG_ERROR_MARGIN is the multiplier applied to a story's recorded upper limit
# for average frame times: a measured average above
# upper limit * AVG_ERROR_MARGIN is treated as a regression.
# CI stands for confidence interval. The "ci_095" values recorded in the data
# are the recorded ranges between the upper and lower CIs. CI_ERROR_MARGIN is
# the maximum acceptable ratio of the measured ci_095 to the recorded one.
# NOTE(review): compare_values reads AVG_ERROR_MARGIN and CI_ERROR_MARGIN, but
# neither assignment is visible in this listing -- confirm both are defined
# at module level before this script is used.
# TODO(behdadb): the description of this TODO is not present in this listing.
# Name of the metric that is gated against the recorded upper limits.
METRIC_NAME = 'frame_times'
class ResultRecorder(object):
    """Bookkeeping for the pass/fail state of one benchmark run.

    Wraps the results dictionary handed to set_tests() and keeps the failure
    counter, the set of failed stories and the set of noisy control stories
    in sync with the per-story entries under output['tests'].
    """

    def __init__(self):
        self.fails = 0
        self.tests = 0
        self.start_time = time.time()
        self.output = {}
        self.return_code = 0
        self._failed_stories = set()
        # Set of _noisy_control_stories keeps track of control tests which
        # failed because of high noise values.
        self._noisy_control_stories = set()

    def set_tests(self, output):
        """Adopts `output` as the results dict and seeds the counters.

        Args:
          output: Results dictionary; must contain 'num_failures_by_type'.
        """
        self.output = output
        self.fails = output['num_failures_by_type'].get('FAIL', 0)
        self.tests = self.fails + output['num_failures_by_type'].get('PASS', 0)

    def add_failure(self, name, benchmark, is_control=False):
        """Marks story `name` of `benchmark` as an unexpected failure.

        Args:
          name: Story name, a key of output['tests'][benchmark].
          benchmark: Benchmark name, a key of output['tests'].
          is_control: True when the story is a control story, in which case
            it is also remembered as a noisy control story.
        """
        self.output['tests'][benchmark][name]['actual'] = 'FAIL'
        self.output['tests'][benchmark][name]['is_unexpected'] = True
        self._failed_stories.add(name)
        self.fails += 1
        if is_control:
            self._noisy_control_stories.add(name)

    def remove_failure(self, name, benchmark, is_control=False,
                       invalidation_reason=None):
        """Reverts a failure previously recorded for story `name`.

        Args:
          name: Story name, a key of output['tests'][benchmark].
          benchmark: Benchmark name, a key of output['tests'].
          is_control: True when the story is a control story; it is then also
            dropped from the noisy control stories.
          invalidation_reason: Optional text stored on the story entry
            explaining why the failure was withdrawn.
        """
        self.output['tests'][benchmark][name]['actual'] = 'PASS'
        self.output['tests'][benchmark][name]['is_unexpected'] = False
        self._failed_stories.discard(name)
        self.fails -= 1
        if is_control:
            self._noisy_control_stories.discard(name)
        if invalidation_reason:
            self.add_invalidation_reason(name, benchmark, invalidation_reason)

    def invalidate_failures(self, benchmark):
        """Withdraws every recorded failure because the control test is noisy."""
        # Iterate over a copy: remove_failure() mutates _failed_stories.
        for story in self._failed_stories.copy():
            print(story + ' [Invalidated Failure]: The story failed but was ' +
                  'invalidated as a result of noisy control test.')
            self.remove_failure(story, benchmark, False, 'Noisy control test')

    def add_invalidation_reason(self, name, benchmark, reason):
        """Stores `reason` on the story entry under 'invalidation_reason'."""
        self.output['tests'][benchmark][name]['invalidation_reason'] = reason

    # Exposed as properties because the callers in this file read them as
    # plain attributes (len(recorder.failed_stories),
    # `if recorder.is_control_stories_noisy:`).
    @property
    def failed_stories(self):
        """The set of story names currently recorded as failed."""
        return self._failed_stories

    @property
    def is_control_stories_noisy(self):
        """True when at least one control story failed for high noise."""
        return len(self._noisy_control_stories) > 0

    @staticmethod
    def _count_label(count):
        # '1 test' for exactly one, 'N tests' otherwise.
        return '%d %s' % (count, 'tests' if count != 1 else 'test')

    def get_output(self, return_code):
        """Finalizes the results dict and prints a pass/fail summary.

        Args:
          return_code: Return code accumulated by the caller; a value of 1
            marks the output as interrupted.

        Returns:
          A (output, return_code) tuple where return_code is 1 if any failure
          remains recorded and the recorder's stored return code otherwise.
        """
        # NOTE(review): despite the key name, the value written here is the
        # time elapsed since this recorder was created.
        self.output['seconds_since_epoch'] = time.time() - self.start_time
        self.output['num_failures_by_type']['PASS'] = self.tests - self.fails
        self.output['num_failures_by_type']['FAIL'] = self.fails
        if return_code == 1:
            self.output['interrupted'] = True

        print('[ PASSED ] ' + self._count_label(self.tests - self.fails) + '.')
        if self.fails > 0:
            print('[ FAILED ] ' + self._count_label(self.fails) + '.')
            self.return_code = 1

        return (self.output, self.return_code)
# Runs the representative stories through run_performance_tests and gates the
# measured values against the recorded upper limits.
# NOTE(review): this listing has lost its indentation and several lines
# (closing braces, continuation lines, branch bodies). The comments below
# describe only what the visible statements do; gaps are called out
# explicitly and need to be restored from the upstream file.
class RenderingRepresentativePerfTest(object):
def __init__(self, initialization_for_tests=False):
# Accumulated exit status; main() aborts when it is 1 after construction.
self.return_code = 0
# result_recorder for rerun, and non rerun
# (keyed by the `rerun` flag: True is the rerun, False the initial run).
self.result_recorder = {
True: ResultRecorder(),
False: ResultRecorder()
# NOTE(review): the closing brace of the dict above and the body of the
# following `if` are missing from this listing. Presumably the constructor
# stops here when initialization_for_tests is True -- confirm.
if initialization_for_tests is True:
self.options = parse_arguments()
print (self.options)
self.benchmark = self.options.benchmarks
# Rerun artifacts go to a 're_run_failures' directory next to the main
# isolated script output.
out_dir_path = os.path.dirname(self.options.isolated_script_test_output)
re_run_output_dir = os.path.join(out_dir_path, 're_run_failures')
# JSON test results written by the benchmark run, keyed by the rerun flag.
# NOTE(review): closing brace missing from this listing.
self.output_path = {
True: os.path.join(
re_run_output_dir, self.benchmark, 'test_results.json'),
False: os.path.join(out_dir_path, self.benchmark, 'test_results.json')
# CSV perf results written by the benchmark run, keyed by the rerun flag.
# NOTE(review): closing brace missing from this listing.
self.results_path = {
True: os.path.join(
re_run_output_dir, self.benchmark, 'perf_results.csv'),
False: os.path.join(out_dir_path, self.benchmark, 'perf_results.csv')
# NOTE(review): the remaining arguments of this os.path.join call are missing
# from this listing.
re_run_test_output = os.path.join(re_run_output_dir,
# NOTE(review): self.platform and self.story_tag are read below but not
# assigned in the visible part of __init__; presumably
# set_platform_specific_attributes() is called around here -- confirm.
# The values used as the upper limit are the 99th percentile of the
# avg and ci_095 frame_times recorded by dashboard in the past 200
# revisions. If the value measured here would be higher than this value at
# least by 10 [AVG_ERROR_MARGIN] percent of upper limit, that would be
# considered a failure.
# NOTE(review): the path expression passed to open() is missing from this
# listing. The loaded JSON is indexed by self.platform.
with open(
) as bound_data:
self.upper_limit_data = json.load(bound_data)[self.platform]
# Arguments for the initial run: the script's own argv plus the story tag.
self.args = list(sys.argv)
# The first run uses all stories in the representative story tag, but for
# rerun we use only the failed stories.
self.args.extend(['--story-tag-filter', self.story_tag])
# Arguments for the rerun: a copy of argv with the isolated script output
# redirected to the rerun location.
self.re_run_args = replace_arg_values(list(sys.argv), [
('--isolated-script-test-output', re_run_test_output)])
def parse_csv_results(self, csv_obj):
    """Parses the raw CSV data.

    Converts csv_obj into per-story lists of valid averages, confidence
    intervals and cpu_wall_time_ratio values. Only rows for METRIC_NAME and
    'cpu_wall_time_ratio' are considered, and only for stories that have an
    entry in self.upper_limit_data.

    Args:
      csv_obj: An iterable of rows (dict) describing the CSV results. Each
        row is read through the keys 'name', 'stories', 'avg', 'ci_095' and
        'count'.

    Returns:
      A dictionary which has the stories as keys and, for each story, a
      dict with the lists 'averages', 'ci_095' and 'cpu_wall_time_ratio'
      (values converted to float).
    """
    values_per_story = {}
    for row in csv_obj:
        metric = row['name']
        # For now only frame_times is used for testing representatives'
        # performance and cpu_wall_time_ratio is used for validation.
        if metric != METRIC_NAME and metric != 'cpu_wall_time_ratio':
            continue
        story_name = row['stories']
        # Stories without recorded limits cannot be gated.
        if story_name not in self.upper_limit_data:
            continue
        if story_name not in values_per_story:
            values_per_story[story_name] = {
                'averages': [],
                'ci_095': [],
                'cpu_wall_time_ratio': [],
            }
        if row['avg'] == '':
            continue
        story_values = values_per_story[story_name]
        if metric == METRIC_NAME:
            # csv.DictReader yields strings, so comparing the raw cell with
            # the integer 0 never filters anything; compare numerically. An
            # unparseable count keeps the row, as the raw comparison did.
            try:
                has_samples = float(row['count']) != 0
            except (TypeError, ValueError):
                has_samples = True
            if has_samples:
                story_values['ci_095'].append(float(row['ci_095']))
                story_values['averages'].append(float(row['avg']))
        else:
            story_values['cpu_wall_time_ratio'].append(float(row['avg']))
    return values_per_story
def compare_values(self, values_per_story, rerun=False):
    """Gates the measured values of every story against its limits.

    Compares the values in values_per_story with the upper_limit_data and
    determines if the story passes or fails and updates the ResultRecorder
    selected by `rerun`.

    Args:
      values_per_story: A dictionary keyed by story name whose values hold
        the lists 'averages', 'ci_095' and 'cpu_wall_time_ratio'.
      rerun: Is this a rerun or initial run.
    """
    recorder = self.result_recorder[rerun]
    for story_name in values_per_story:
        # The experimental stories will not be considered for failing the
        # tests.
        if self.is_experimental_story(story_name):
            continue
        story_values = values_per_story[story_name]
        if len(story_values['ci_095']) == 0:
            print(('[ FAILED ] {}/{} has no valid values for {}. Check ' +
                   'run_benchmark logs for more information.').format(
                       self.benchmark, story_name, METRIC_NAME))
            recorder.add_failure(story_name, self.benchmark)
            # Nothing to average for this story.
            continue

        limits = self.upper_limit_data[story_name]
        upper_limit_avg = limits['avg']
        upper_limit_ci = limits['ci_095']
        lower_limit_cpu_ratio = limits['cpu_wall_time_ratio']
        measured_avg = np.mean(np.array(story_values['averages']))
        measured_ci = np.mean(np.array(story_values['ci_095']))
        measured_cpu_ratio = np.mean(
            np.array(story_values['cpu_wall_time_ratio']))

        if (measured_ci > upper_limit_ci * CI_ERROR_MARGIN and
                self.is_control_story(story_name)):
            # A noisy control story is recorded with is_control=True so the
            # recorder can later invalidate the whole run.
            print(('[ FAILED ] {}/{} {} has higher noise ({:.3f}) ' +
                   'compared to upper limit ({:.3f})').format(
                       self.benchmark, story_name, METRIC_NAME, measured_ci,
                       upper_limit_ci))
            recorder.add_failure(story_name, self.benchmark, True)
        elif measured_avg > upper_limit_avg * AVG_ERROR_MARGIN:
            if measured_cpu_ratio >= lower_limit_cpu_ratio:
                print(('[ FAILED ] {}/{} higher average {}({:.3f}) compared' +
                       ' to upper limit ({:.3f})').format(
                           self.benchmark, story_name, METRIC_NAME,
                           measured_avg, upper_limit_avg))
                recorder.add_failure(story_name, self.benchmark)
            else:
                # A slow average measured with a low cpu_wall_time_ratio is
                # not counted as a failure; only the reason is recorded.
                print(('[ OK ] {}/{} higher average {}({:.3f}) compared ' +
                       'to upper limit({:.3f}). Invalidated for low cpu_wall_time_ratio'
                       ).format(self.benchmark, story_name, METRIC_NAME,
                                measured_avg, upper_limit_avg))
                recorder.add_invalidation_reason(
                    story_name, self.benchmark, 'Low cpu_wall_time_ratio')
        else:
            print(('[ OK ] {}/{} lower average {}({:.3f}) compared ' +
                   'to upper limit({:.3f}).').format(
                       self.benchmark, story_name, METRIC_NAME, measured_avg,
                       upper_limit_avg))
# Loads the JSON test results and the CSV perf results of one run (selected
# by `rerun`), parses the CSV rows and gates the values via compare_values.
# NOTE(review): this listing has lost its indentation and some statements.
# In particular, nothing visible uses `initialOut`, and the body of
# `if not rerun:` is missing; restore both from the upstream file.
def interpret_run_benchmark_results(self, rerun=False):
# Opened read/write ('r+'): presumably because the results file is rewritten
# for the initial run, per the comment below -- confirm.
with open(self.output_path[rerun], 'r+') as resultsFile:
initialOut = json.load(resultsFile)
with open(self.results_path[rerun]) as csv_file:
# DictReader yields one dict per CSV row, keyed by the header names.
csv_obj = csv.DictReader(csv_file)
values_per_story = self.parse_csv_results(csv_obj)
if not rerun:
# Clearing the result of run_benchmark and write the gated perf results
self.compare_values(values_per_story, rerun)
# Runs the benchmark, reruns the stories that failed, writes the final JSON
# results and returns the accumulated return code.
# NOTE(review): this listing has lost its indentation and several statements
# (continuation lines and call heads); the gaps are marked below and must be
# restored from the upstream file.
def run_perf_tests(self):
# Initial run over every story selected by the story tag filter.
self.return_code |= run_performance_tests.main(self.args)
# NOTE(review): no call that fills result_recorder[False] is visible between
# the run above and this check; presumably interpret_run_benchmark_results
# is invoked here -- confirm.
if len(self.result_recorder[False].failed_stories) > 0:
# For failed stories we run_tests again to make sure it's not a false
# positive.
print('============ Re_run the failed tests ============')
# Builds a '(a|b|...)' alternation for --story-filter.
# NOTE(review): the argument of join() is missing from this listing.
all_failed_stories = '('+'|'.join(
# TODO( Remove the extra chrome categories after
# investigation of flakes in representative perf tests.
# NOTE(review): the call this argument list belongs to, and the rest of the
# list, are missing from this listing.
['--story-filter', all_failed_stories, '--pageset-repeat=3',
self.return_code |= run_performance_tests.main(self.re_run_args)
# Iterate over a copy of the failed stories of the initial run and look each
# one up in the rerun's failed stories.
for story_name in self.result_recorder[False].failed_stories.copy():
if story_name not in self.result_recorder[True].failed_stories:
# NOTE(review): the head of this call is missing from this listing.
self.benchmark, self.is_control_story(story_name))
if self.result_recorder[False].is_control_stories_noisy:
# In this case all failures are reported as expected, and the number of
# Failed stories in output.json will be zero.
# NOTE(review): the body of the `if` above and the left-hand side of the
# assignment below (finalOut is used later) are missing from this listing.
) = self.result_recorder[False].get_output(self.return_code)
# NOTE(review): 'r+' does not truncate the file; if the new JSON is shorter
# than the existing content, stale bytes remain after it. Confirm the file is
# emptied beforehand or consider opening with 'w'.
with open(self.output_path[False], 'r+') as resultsFile:
json.dump(finalOut, resultsFile, indent=4)
with open(self.options.isolated_script_test_output, 'w') as outputFile:
json.dump(finalOut, outputFile, indent=4)
if self.result_recorder[False].is_control_stories_noisy:
# NOTE(review): assert statements are stripped under `python -O`.
assert self.return_code == 0
print('Control story has high noise. These runs are not reliable!')
return self.return_code
def is_control_story(self, story_name):
    """Returns whether `story_name` is tagged 'control' in upper_limit_data.

    Control stories are used to identify possible flakes and to invalidate
    the results.
    """
    attribute = 'control'
    return self.story_has_attribute_enabled(story_name, attribute)
def is_experimental_story(self, story_name):
    """Returns whether `story_name` is tagged 'experimental'.

    Experimental stories are still run to gather performance results, but
    their results never fail the test.
    """
    attribute = 'experimental'
    return self.story_has_attribute_enabled(story_name, attribute)
def story_has_attribute_enabled(self, story_name, attribute):
    """Returns whether `attribute` is present and equal to True for a story.

    Raises KeyError when `story_name` has no entry in upper_limit_data.
    """
    story_entry = self.upper_limit_data[story_name]
    if attribute not in story_entry:
        return False
    return story_entry[attribute] == True
def set_platform_specific_attributes(self):
    """Selects the platform key and story tag for the current benchmark.

    Sets self.platform and self.story_tag from self.benchmark and
    sys.platform. For an unsupported benchmark or host platform neither is
    set and self.return_code becomes 1 instead.
    """
    if self.benchmark == 'rendering.desktop':
        # Linux does not have its own specific representatives
        # and uses the representatives chosen for windows.
        if sys.platform == 'win32' or sys.platform.startswith('linux'):
            self.platform = 'win'
            self.story_tag = 'representative_win_desktop'
        elif sys.platform == 'darwin':
            self.platform = 'mac'
            self.story_tag = 'representative_mac_desktop'
        else:
            # Unsupported desktop host platform.
            self.return_code = 1
    # NOTE(review): the benchmark name compared here is an empty string in
    # this listing; it looks truncated (presumably the mobile rendering
    # benchmark) -- confirm against the upstream file.
    elif self.benchmark == '':
        self.platform = 'android'
        self.story_tag = 'representative_mobile'
    else:
        # Unsupported benchmark.
        self.return_code = 1
def replace_arg_values(args, key_value_pairs):
    """Replaces the values of the given command line flags in place.

    Every argument that starts with one of the keys is updated: for the
    'key=value' form the argument itself is rewritten, otherwise the value
    is taken to be the following argument and that one is replaced.

    Args:
      args: List of command line arguments; modified in place.
      key_value_pairs: Iterable of (key, value) tuples, e.g.
        [('--output', '/new/path')].

    Returns:
      The same `args` list, for convenience.
    """
    for index in range(len(args)):
        for key, value in key_value_pairs:
            if not args[index].startswith(key):
                continue
            if '=' in args[index]:
                args[index] = key + '=' + value
            elif index + 1 < len(args):
                args[index + 1] = value
            else:
                # The flag is the last argument and has no value to replace;
                # supply it rather than indexing past the end of the list.
                args.append(value)
    return args
def main():
    """Builds the test runner and runs the perf tests.

    Returns:
      1 when constructing the runner already set its return code to 1,
      otherwise the return code of run_perf_tests().
    """
    runner = RenderingRepresentativePerfTest()
    setup_failed = runner.return_code == 1
    return 1 if setup_failed else runner.run_perf_tests()
def parse_arguments(args=None):
    """Parses the options this wrapper itself needs.

    Unknown arguments are ignored here (parse_known_args), since the full
    command line is forwarded to the benchmark runner separately.

    Args:
      args: Optional list of argument strings. Defaults to sys.argv[1:].

    Returns:
      An argparse.Namespace with `executable`, `benchmarks`,
      `isolated_script_test_output` and `isolated_script_test_perf_output`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('executable', help='The name of the executable to run.')
    parser.add_argument('--benchmarks', required=True)
    parser.add_argument('--isolated-script-test-output', required=True)
    parser.add_argument('--isolated-script-test-perf-output', required=False)
    return parser.parse_known_args(args)[0]
def main_compile_targets(args):
    """Writes an empty JSON list of compile targets to args.output."""
    compile_targets = []
    json.dump(compile_targets, args.output)
if __name__ == '__main__':
    # Conform minimally to the protocol defined by ScriptTest.
    if 'compile_targets' in sys.argv:
        funcs = {
            'run': None,
            'compile_targets': main_compile_targets,
        }
        sys.exit(common.run_script(sys.argv[1:], funcs))
    # Regular invocation: run the perf tests and exit with their return code.
    sys.exit(main())