| #!/usr/bin/env python | 
 | # Copyright 2019 The Chromium Authors. All rights reserved. | 
 | # Use of this source code is governed by a BSD-style license that can be | 
 | # found in the LICENSE file. | 
 |  | 
 | """Runs telemetry benchmarks on representative story tag. | 
 |  | 
 | This script is a wrapper around run_performance_tests.py to capture the | 
 | values of performance metrics and compare them with the acceptable limits | 
 | in order to prevent regressions. | 
 |  | 
 | Arguments used for this script are the same as run_performance_tests.py. | 
 |  | 
 | The name and some functionalities of this script should be adjusted for | 
 | use with other benchmarks. | 
 | """ | 
 |  | 
 | from __future__ import print_function | 
 |  | 
import argparse
import csv
import json
import os
import sys
import time

import numpy as np

import common
import run_performance_tests
 |  | 
# AVG_ERROR_MARGIN determines how much higher the measured frame times can be
# relative to the recorded upper limit (a multiplier of the upper limit).
AVG_ERROR_MARGIN = 1.1
# CI stands for confidence interval. The "ci_095" values recorded in the data
# are the ranges between the upper and lower 95% confidence bounds.
# CI_ERROR_MARGIN is the maximum acceptable ratio of a measured ci_095 to the
# recorded one.
# TODO(behdadb) crbug.com/1052054
CI_ERROR_MARGIN = 1.5
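
# Illustrative example: with a recorded upper limit avg of 20.0 and ci_095 of
# 3.0, a measured average above 20.0 * AVG_ERROR_MARGIN = 22.0 or a measured
# ci_095 above 3.0 * CI_ERROR_MARGIN = 4.5 is treated as out of bounds (see
# compare_values below for how each case is handled).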
 |  | 
 | METRIC_NAME = 'frame_times' | 
 |  | 
 | class ResultRecorder(object): | 
 |   def __init__(self): | 
 |     self.fails = 0 | 
 |     self.tests = 0 | 
 |     self.start_time = time.time() | 
 |     self.output = {} | 
 |     self.return_code = 0 | 
 |     self._failed_stories = set() | 
    # _noisy_control_stories tracks control stories which failed because of
    # high noise values.
    self._noisy_control_stories = set()
 |  | 
 |   def set_tests(self, output): | 
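    # `output` is the test results JSON produced by the benchmark run. The
    # shape relied on here (values are illustrative only):
    #   {'tests': {benchmark: {story: {'actual': 'PASS', ...}}},
    #    'num_failures_by_type': {'PASS': 10, 'FAIL': 2}}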
 |     self.output = output | 
 |     self.fails = output['num_failures_by_type'].get('FAIL', 0) | 
 |     self.tests = self.fails + output['num_failures_by_type'].get('PASS', 0) | 
 |  | 
 |   def add_failure(self, name, benchmark, is_control=False): | 
 |     self.output['tests'][benchmark][name]['actual'] = 'FAIL' | 
 |     self.output['tests'][benchmark][name]['is_unexpected'] = True | 
 |     self._failed_stories.add(name) | 
 |     self.fails += 1 | 
 |     if is_control: | 
 |       self._noisy_control_stories.add(name) | 
 |  | 
 |   def remove_failure(self, name, benchmark, is_control=False, | 
 |                       invalidation_reason=None): | 
 |     self.output['tests'][benchmark][name]['actual'] = 'PASS' | 
 |     self.output['tests'][benchmark][name]['is_unexpected'] = False | 
 |     self._failed_stories.remove(name) | 
 |     self.fails -= 1 | 
 |     if is_control: | 
 |       self._noisy_control_stories.remove(name) | 
 |     if invalidation_reason: | 
 |       self.add_invalidation_reason(name, benchmark, invalidation_reason) | 
 |  | 
 |   def invalidate_failures(self, benchmark): | 
    # Invalidates the recorded failures when a control story is too noisy to
    # be trusted.
 |     for story in self._failed_stories.copy(): | 
 |       print(story + ' [Invalidated Failure]: The story failed but was ' + | 
 |         'invalidated as a result of noisy control test.') | 
 |       self.remove_failure(story, benchmark, False, 'Noisy control test') | 
 |  | 
 |   def add_invalidation_reason(self, name, benchmark, reason): | 
 |     self.output['tests'][benchmark][name]['invalidation_reason'] = reason | 
 |  | 
 |   @property | 
 |   def failed_stories(self): | 
 |     return self._failed_stories | 
 |  | 
 |   @property | 
 |   def is_control_stories_noisy(self): | 
 |     return len(self._noisy_control_stories) > 0 | 
 |  | 
 |   def get_output(self, return_code): | 
 |     self.output['seconds_since_epoch'] = time.time() - self.start_time | 
 |     self.output['num_failures_by_type']['PASS'] = self.tests - self.fails | 
 |     self.output['num_failures_by_type']['FAIL'] = self.fails | 
 |     if return_code == 1: | 
 |       self.output['interrupted'] = True | 
 |  | 
 |     plural = lambda n, s, p: '%d %s' % (n, p if n != 1 else s) | 
 |     tests = lambda n: plural(n, 'test', 'tests') | 
 |  | 
 |     print('[  PASSED  ] ' + tests(self.tests - self.fails) + '.') | 
 |     if self.fails > 0: | 
 |       print('[  FAILED  ] ' + tests(self.fails) + '.') | 
 |       self.return_code = 1 | 
 |  | 
 |     return (self.output, self.return_code) | 
 |  | 
 | class RenderingRepresentativePerfTest(object): | 
 |   def __init__(self, initialization_for_tests=False): | 
 |     self.return_code = 0 | 
    # One ResultRecorder for the re-run (True) and one for the initial run
    # (False).
 |     self.result_recorder = { | 
 |       True: ResultRecorder(), | 
 |       False: ResultRecorder() | 
 |     } | 
 |  | 
 |     if initialization_for_tests is True: | 
 |       return | 
 |  | 
 |     self.options = parse_arguments() | 
    print(self.options)
 |  | 
 |     self.benchmark = self.options.benchmarks | 
 |     out_dir_path = os.path.dirname(self.options.isolated_script_test_output) | 
 |     re_run_output_dir = os.path.join(out_dir_path, 're_run_failures') | 
 |  | 
 |     self.output_path = { | 
 |       True: os.path.join( | 
 |         re_run_output_dir, self.benchmark, 'test_results.json'), | 
 |       False: os.path.join(out_dir_path, self.benchmark, 'test_results.json') | 
 |     } | 
 |     self.results_path = { | 
 |       True: os.path.join( | 
 |         re_run_output_dir, self.benchmark, 'perf_results.csv'), | 
 |       False: os.path.join(out_dir_path, self.benchmark, 'perf_results.csv') | 
 |     } | 
 |  | 
 |     re_run_test_output = os.path.join(re_run_output_dir, | 
 |       os.path.basename(self.options.isolated_script_test_output)) | 
 |  | 
 |     self.set_platform_specific_attributes() | 
 |  | 
    # The values used as the upper limits are the 99th percentile of the avg
    # and ci_095 frame_times recorded by the dashboard over the past 200
    # revisions. A value measured here is considered a failure if it exceeds
    # the upper limit by more than 10 percent (AVG_ERROR_MARGIN).
    # crbug.com/953895
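    # Shape of one entry in the loaded upper_limit_data, keyed by story name.
    # The keys below are the ones read elsewhere in this file; the numbers are
    # illustrative and 'control'/'experimental' are optional booleans:
    #   "story_name": {"avg": 20.0, "ci_095": 3.0, "cpu_wall_time_ratio": 0.4,
    #                  "control": true}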
 |     with open( | 
 |       os.path.join(os.path.dirname(__file__), | 
 |       'representative_perf_test_data', | 
 |       'representatives_frame_times_upper_limit.json') | 
 |     ) as bound_data: | 
 |       self.upper_limit_data = json.load(bound_data)[self.platform] | 
 |  | 
 |     self.args = list(sys.argv) | 
    # The first run uses all stories with the representative story tag; the
    # re-run uses only the failed stories.
 |     self.args.extend(['--story-tag-filter', self.story_tag]) | 
 |  | 
 |     self.re_run_args = replace_arg_values(list(sys.argv), [ | 
 |       ('--isolated-script-test-output', re_run_test_output)]) | 
 |  | 
 |   def parse_csv_results(self, csv_obj): | 
 |     """ Parses the raw CSV data | 
 |     Convers the csv_obj into an array of valid values for averages and | 
 |     confidence intervals based on the described upper_limits. | 
 |  | 
 |     Args: | 
 |       csv_obj: An array of rows (dict) describing the CSV results | 
 |  | 
 |     Raturns: | 
 |       A dictionary which has the stories as keys and an array of confidence | 
 |       intervals and valid averages as data. | 
 |     """ | 
 |     values_per_story = {} | 
 |     for row in csv_obj: | 
 |       # For now only frame_times is used for testing representatives' | 
 |       # performance and cpu_wall_time_ratio is used for validation. | 
 |       if row['name'] != METRIC_NAME and row['name'] != 'cpu_wall_time_ratio': | 
 |         continue | 
 |       story_name = row['stories'] | 
 |       if (story_name not in self.upper_limit_data): | 
 |         continue | 
 |       if story_name not in values_per_story: | 
 |         values_per_story[story_name] = { | 
 |           'averages': [], | 
 |           'ci_095': [], | 
 |           'cpu_wall_time_ratio': [] | 
 |         } | 
 |  | 
 |       if row['name'] == METRIC_NAME and row['avg'] != '' and row['count'] != 0: | 
 |         values_per_story[story_name]['ci_095'].append(float(row['ci_095'])) | 
 |         values_per_story[story_name]['averages'].append(float(row['avg'])) | 
 |       elif row['name'] == 'cpu_wall_time_ratio' and row['avg'] != '': | 
 |         values_per_story[story_name]['cpu_wall_time_ratio'].append( | 
 |           float(row['avg'])) | 
 |  | 
 |     return values_per_story | 
 |  | 
 |   def compare_values(self, values_per_story, rerun=False): | 
 |     """ Parses the raw CSV data | 
 |     Compares the values in values_per_story with the upper_limit_data and | 
 |     determines if the story passes or fails and updates the ResultRecorder. | 
 |  | 
 |     Args: | 
 |       values_per_story: An array of rows (dict) descriving the CSV results | 
 |       rerun: Is this a rerun or initial run | 
 |     """ | 
 |     for story_name in values_per_story: | 
      # Experimental stories are measured but never fail the test.
 |       if (self.is_experimental_story(story_name)): | 
 |         continue | 
 |       if len(values_per_story[story_name]['ci_095']) == 0: | 
 |         print(('[  FAILED  ] {}/{} has no valid values for {}. Check ' + | 
 |           'run_benchmark logs for more information.').format( | 
 |             self.benchmark, story_name, METRIC_NAME)) | 
 |         self.result_recorder[rerun].add_failure(story_name, self.benchmark) | 
 |         continue | 
 |  | 
 |       upper_limits = self.upper_limit_data | 
 |       upper_limit_avg = upper_limits[story_name]['avg'] | 
 |       upper_limit_ci = upper_limits[story_name]['ci_095'] | 
 |       lower_limit_cpu_ratio = upper_limits[story_name]['cpu_wall_time_ratio'] | 
 |       measured_avg = np.mean(np.array(values_per_story[story_name]['averages'])) | 
 |       measured_ci = np.mean(np.array(values_per_story[story_name]['ci_095'])) | 
 |       measured_cpu_ratio = np.mean(np.array( | 
 |         values_per_story[story_name]['cpu_wall_time_ratio'])) | 
 |  | 
 |       if (measured_ci > upper_limit_ci * CI_ERROR_MARGIN and | 
 |         self.is_control_story(story_name)): | 
 |         print(('[  FAILED  ] {}/{} {} has higher noise ({:.3f}) ' + | 
 |           'compared to upper limit ({:.3f})').format( | 
 |             self.benchmark, story_name, METRIC_NAME, measured_ci, | 
 |           upper_limit_ci)) | 
 |         self.result_recorder[rerun].add_failure( | 
 |           story_name, self.benchmark, True) | 
 |       elif (measured_avg > upper_limit_avg * AVG_ERROR_MARGIN): | 
 |         if (measured_cpu_ratio >= lower_limit_cpu_ratio): | 
 |           print(('[  FAILED  ] {}/{} higher average {}({:.3f}) compared' + | 
 |             ' to upper limit ({:.3f})').format(self.benchmark, story_name, | 
 |             METRIC_NAME, measured_avg, upper_limit_avg)) | 
 |           self.result_recorder[rerun].add_failure(story_name, self.benchmark) | 
 |         else: | 
 |           print(('[       OK ] {}/{} higher average {}({:.3f}) compared ' + | 
 |             'to upper limit({:.3f}). Invalidated for low cpu_wall_time_ratio' | 
 |             ).format(self.benchmark, story_name, METRIC_NAME, measured_avg, | 
 |                       upper_limit_avg)) | 
 |           self.result_recorder[rerun].add_invalidation_reason( | 
 |             story_name, self.benchmark, 'Low cpu_wall_time_ratio') | 
 |       else: | 
 |         print(('[       OK ] {}/{} lower average {}({:.3f}) compared ' + | 
 |           'to upper limit({:.3f}).').format(self.benchmark, story_name, | 
 |           METRIC_NAME, measured_avg, upper_limit_avg)) | 
 |  | 
 |   def interpret_run_benchmark_results(self, rerun=False): | 
 |     with open(self.output_path[rerun], 'r+') as resultsFile: | 
 |       initialOut = json.load(resultsFile) | 
 |       self.result_recorder[rerun].set_tests(initialOut) | 
 |  | 
 |       with open(self.results_path[rerun]) as csv_file: | 
 |         csv_obj = csv.DictReader(csv_file) | 
 |         values_per_story = self.parse_csv_results(csv_obj) | 
 |  | 
 |       if not rerun: | 
        # Clear the results of run_benchmark so the gated perf results can be
        # written in their place later.
 |         resultsFile.seek(0) | 
 |         resultsFile.truncate(0) | 
 |  | 
 |     self.compare_values(values_per_story, rerun) | 
 |  | 
 |   def run_perf_tests(self): | 
 |     self.return_code |= run_performance_tests.main(self.args) | 
 |     self.interpret_run_benchmark_results(False) | 
 |  | 
 |     if len(self.result_recorder[False].failed_stories) > 0: | 
      # Failed stories are re-run to make sure the failures are not false
      # positives.
      print('============ Re-run the failed tests ============')
 |       all_failed_stories = '('+'|'.join( | 
 |         self.result_recorder[False].failed_stories)+')' | 
 |       # TODO(crbug.com/1055893): Remove the extra chrome categories after | 
 |       # investigation of flakes in representative perf tests. | 
 |       self.re_run_args.extend( | 
 |         ['--story-filter', all_failed_stories, '--pageset-repeat=3', | 
 |          '--extra-chrome-categories=blink,blink_gc,gpu,v8,viz']) | 
 |       self.return_code |= run_performance_tests.main(self.re_run_args) | 
 |       self.interpret_run_benchmark_results(True) | 
 |  | 
 |       for story_name in self.result_recorder[False].failed_stories.copy(): | 
 |         if story_name not in self.result_recorder[True].failed_stories: | 
 |           self.result_recorder[False].remove_failure(story_name, | 
 |             self.benchmark, self.is_control_story(story_name)) | 
 |  | 
 |     if self.result_recorder[False].is_control_stories_noisy: | 
      # In this case all failures are reported as expected, and the number of
      # failed stories in output.json will be zero.
 |       self.result_recorder[False].invalidate_failures(self.benchmark) | 
 |  | 
 |     ( | 
 |       finalOut, | 
 |       self.return_code | 
 |     ) = self.result_recorder[False].get_output(self.return_code) | 
 |  | 
 |     with open(self.output_path[False], 'r+') as resultsFile: | 
 |       json.dump(finalOut, resultsFile, indent=4) | 
 |     with open(self.options.isolated_script_test_output, 'w') as outputFile: | 
 |       json.dump(finalOut, outputFile, indent=4) | 
 |  | 
 |     if self.result_recorder[False].is_control_stories_noisy: | 
 |       assert self.return_code == 0 | 
 |       print('Control story has high noise. These runs are not reliable!') | 
 |  | 
 |     return self.return_code | 
 |  | 
 |   def is_control_story(self, story_name): | 
    # A story tagged as a control story in upper_limit_data is used to
    # identify possible flakiness and to invalidate the results of noisy runs.
 |     return self.story_has_attribute_enabled(story_name, 'control') | 
 |  | 
 |   def is_experimental_story(self, story_name): | 
    # A story tagged as experimental in upper_limit_data still has its
    # performance results gathered, but the test does not fail because of it.
 |     return self.story_has_attribute_enabled(story_name, 'experimental') | 
 |  | 
 |   def story_has_attribute_enabled(self, story_name, attribute): | 
 |     return (attribute in self.upper_limit_data[story_name] and | 
 |       self.upper_limit_data[story_name][attribute] == True) | 
 |  | 
 |  | 
 |   def set_platform_specific_attributes(self): | 
 |     if self.benchmark == 'rendering.desktop': | 
      # Linux does not have its own specific representatives and uses the
      # representatives chosen for Windows.
 |       if sys.platform == 'win32' or sys.platform.startswith('linux'): | 
 |         self.platform = 'win' | 
 |         self.story_tag = 'representative_win_desktop' | 
 |       elif sys.platform == 'darwin': | 
 |         self.platform = 'mac' | 
 |         self.story_tag = 'representative_mac_desktop' | 
 |       else: | 
 |         self.return_code = 1 | 
 |     elif self.benchmark == 'rendering.mobile': | 
 |       self.platform = 'android' | 
 |       self.story_tag = 'representative_mobile' | 
 |     else: | 
 |       self.return_code = 1 | 
 |  | 
 | def replace_arg_values(args, key_value_pairs): | 
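  """Replaces the values of the given keys in a list of CLI arguments.

  Handles both the '--key=value' and the '--key value' forms. Illustrative
  example:
    replace_arg_values(['--isolated-script-test-output=/tmp/old.json'],
                       [('--isolated-script-test-output', '/tmp/new.json')])
    returns ['--isolated-script-test-output=/tmp/new.json']
  """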
 |   for index in range(0, len(args)): | 
 |     for (key, value) in key_value_pairs: | 
 |       if args[index].startswith(key): | 
 |         if '=' in args[index]: | 
 |           args[index] = key + '=' + value | 
 |         else: | 
 |           args[index+1] = value | 
 |   return args | 
 |  | 
 | def main(): | 
 |   test_runner = RenderingRepresentativePerfTest() | 
 |   if test_runner.return_code == 1: | 
 |     return 1 | 
 |  | 
 |   return test_runner.run_perf_tests() | 
 |  | 
 | def parse_arguments(): | 
 |   parser = argparse.ArgumentParser() | 
 |   parser.add_argument('executable', help='The name of the executable to run.') | 
 |   parser.add_argument( | 
 |       '--benchmarks', required=True) | 
 |   parser.add_argument( | 
 |       '--isolated-script-test-output', required=True) | 
 |   parser.add_argument( | 
 |       '--isolated-script-test-perf-output', required=False) | 
 |   return parser.parse_known_args()[0] | 
 |  | 
 | def main_compile_targets(args): | 
 |   json.dump([], args.output) | 
 |  | 
 | if __name__ == '__main__': | 
 |   # Conform minimally to the protocol defined by ScriptTest. | 
 |   if 'compile_targets' in sys.argv: | 
 |     funcs = { | 
 |       'run': None, | 
 |       'compile_targets': main_compile_targets, | 
 |     } | 
 |     sys.exit(common.run_script(sys.argv[1:], funcs)) | 
 |   sys.exit(main()) |