tools/perf/core/sharding_map_generator.py - chromium/src.git - Git at Google

 # Copyright 2018 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 import collections
 import sys

 import core.path_util

 core.path_util.AddTelemetryToPath()


 def generate_sharding_map(
     benchmarks_to_shard, timing_data, num_shards, debug):
   """Generate sharding map.

     Args:
       benchmarks_to_shard is a list all benchmarks to be sharded. Its
       structure is as follows:
       [{
          "name": "benchmark_1",
          "stories": [ "storyA", "storyB",...],
          "repeat": <number of pageset_repeat>
         },
         {
          "name": "benchmark_2",
          "stories": [ "storyA", "storyB",...],
          "repeat": <number of pageset_repeat>
         },
          ...
       ]

       The "stories" field contains a list of ordered story names. Notes that
       this should match the actual order of how the benchmark stories are
       executed for the sharding algorithm to be effective.

   """
   # Sort the list of benchmarks to be sharded by benchmark's name to make the
   # execution of this algorithm deterministic.
   benchmarks_to_shard.sort(key=lambda entry: entry['name'])

   story_timing_list = _gather_timing_data(
       benchmarks_to_shard, timing_data, True)

   all_stories = {}
   for b in benchmarks_to_shard:
     all_stories[b['name']] = b['stories']

   total_time = sum(p[1] for p in story_timing_list)
   expected_time_per_shard = total_time/num_shards

   total_time_scheduled = 0
   sharding_map = collections.OrderedDict()
   debug_map = collections.OrderedDict()
   min_shard_time = sys.maxint
   min_shard_index = None
   max_shard_time = 0
   max_shard_index = None
   num_stories = len(story_timing_list)
   predicted_shard_timings = []

   # The algorithm below removes all the stories from |story_timing_list| one by
   # one and add them to the current shard until the shard's total time is
   # approximately equals to |expected_time_per_shard|. After that point,
   # it moves to the next shard.
   # For efficient removal of |story_timing_list|'s elements & to keep the
   # ordering of benchmark alphabetically sorted in the shards' assignment, we
   # reverse the |story_timing_list|.
   story_timing_list.reverse()
   num_stories = len(story_timing_list)
   final_shard_index = num_shards - 1
   for i in range(num_shards):
     shard_name = 'shard #%i' % i
     sharding_map[str(i)] = {'benchmarks': collections.OrderedDict()}
     debug_map[shard_name] = collections.OrderedDict()
     time_per_shard = 0
     stories_in_shard = []
     expected_total_time = expected_time_per_shard * (i + 1)
     last_diff = abs(total_time_scheduled - expected_total_time)
     # Keep adding story to the current shard until either:
     # * The absolute difference between the total time of shards so far and
     #   expected total time is minimal.
     # * The shard is final shard, and there is no more stories to add.
     #
     # Note: we do check for the final shard in case due to rounding error,
     # the last_diff can be minimal even if we don't add all the stories to the
     # final shard.
     while story_timing_list:
       candidate_story, candidate_story_duration = story_timing_list[-1]
       new_diff = abs(total_time_scheduled + candidate_story_duration -
                      expected_total_time)
       if new_diff < last_diff or i == final_shard_index:
         story_timing_list.pop()
         total_time_scheduled += candidate_story_duration
         time_per_shard += candidate_story_duration
         stories_in_shard.append(candidate_story)
         debug_map[shard_name][candidate_story] = candidate_story_duration
         last_diff = abs(total_time_scheduled - expected_total_time)
         _add_benchmarks_to_shard(sharding_map, i, stories_in_shard, all_stories)
       else:
         break
     # Double time_per_shard to account for reference benchmark run.
     debug_map[shard_name]['expected_total_time'] = time_per_shard * 2
     if time_per_shard > max_shard_time:
       max_shard_time = time_per_shard
       max_shard_index = i
     if time_per_shard < min_shard_time:
       min_shard_time = time_per_shard
       min_shard_index = i

     predicted_shard_timings.append((shard_name, time_per_shard * 2))

   sharding_map['extra_infos'] = collections.OrderedDict([
       ('num_stories', num_stories),
       # Double all the time stats by 2 to account for reference build.
       ('predicted_min_shard_time', min_shard_time * 2),
       ('predicted_min_shard_index', min_shard_index),
       ('predicted_max_shard_time', max_shard_time * 2),
       ('predicted_max_shard_index', max_shard_index),
       ])

   if debug:
     sharding_map['extra_infos'].update(debug_map)
   else:
     sharding_map['extra_infos'].update(predicted_shard_timings)
   return sharding_map


 def _add_benchmarks_to_shard(sharding_map, shard_index, stories_in_shard,
     all_stories):
   benchmarks = collections.OrderedDict()
   for story in stories_in_shard:
     (b, story) = story.split('/', 1)
     if b not in benchmarks:
       benchmarks[b] = []
     benchmarks[b].append(story)

   # Format the benchmark's stories by indices
   benchmarks_in_shard = collections.OrderedDict()
   for b in benchmarks:
     benchmarks_in_shard[b] = {}
     first_story = all_stories[b].index(benchmarks[b][0])
     last_story = all_stories[b].index(benchmarks[b][-1]) + 1
     if first_story != 0:
       benchmarks_in_shard[b]['begin'] = first_story
     if last_story != len(all_stories[b]):
       benchmarks_in_shard[b]['end'] = last_story
   sharding_map[str(shard_index)] = {'benchmarks': benchmarks_in_shard}


 def _gather_timing_data(benchmarks_to_shard, timing_data, repeat):
   story_timing_dict = {}
   benchmarks_data_by_name = {}
   for b in benchmarks_to_shard:
     story_list = b['stories']
     benchmarks_data_by_name[b['name']] = b
     # Initialize the duration of all stories to be shard to 1 * repeat.
     # The reasons are:
     # 1) Even if the stories are skipped, they still have non neligible
     #    overhead.
     # 2) For a case of sharding a set of benchmarks with no existing data about
     #    timing, initializing the stories time within a single repeat to 1 leads
     #    to a roughly equal distribution of stories on the shards, whereas
     #    initializing them to zero will make the algorithm put all the stories
     #    into the first shard.
     for story in story_list:
       story_timing_dict[b['name'] + '/' + story] = b['repeat']

   for run in timing_data:
     benchmark = run['name'].split('/', 1)[0]
     if run['name'] in story_timing_dict:
       if run['duration']:
         if repeat:
           story_timing_dict[run['name']] = (float(run['duration'])
               * benchmarks_data_by_name[benchmark]['repeat'])
         else:
           story_timing_dict[run['name']] = float(run['duration'])
   story_timing_list = []
   for entry in benchmarks_to_shard:
     benchmark_name = entry['name']
     for story_name in entry['stories']:
       test_name = '%s/%s' % (benchmark_name, story_name)
       story_timing_list.append((test_name, story_timing_dict[test_name]))
   return story_timing_list


 def _generate_empty_sharding_map(num_shards):
   sharding_map = collections.OrderedDict()
   for i in range(0, num_shards):
     sharding_map[str(i)] = {'benchmarks': collections.OrderedDict()}
   return sharding_map


 def test_sharding_map(
     sharding_map, benchmarks_to_shard, test_timing_data):
   story_timing_list = _gather_timing_data(
       benchmarks_to_shard, test_timing_data, False)

   story_timing_dict = dict(story_timing_list)

   results = collections.OrderedDict()
   all_stories = {}
   for b in benchmarks_to_shard:
     all_stories[b['name']] = b['stories']

   sharding_map.pop('extra_infos', None)
   for shard in sharding_map:
     results[shard] = collections.OrderedDict()
     shard_total_time = 0
     for benchmark_name in sharding_map[shard]['benchmarks']:
       benchmark = sharding_map[shard]['benchmarks'][benchmark_name]
       begin = 0
       end = len(all_stories[benchmark_name])
       if 'begin' in benchmark:
         begin = benchmark['begin']
       if 'end' in benchmark:
         end = benchmark['end']
       benchmark_timing = 0
       for story in all_stories[benchmark_name][begin : end]:
         story_timing = story_timing_dict[benchmark_name + '/' + story]
         results[shard][benchmark_name + '/' + story] = str(story_timing)
         benchmark_timing += story_timing
       shard_total_time += benchmark_timing
     results[shard]['full_time'] = shard_total_time
   return results
	# Copyright 2018 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.
	import collections
	import sys

	import core.path_util

	core.path_util.AddTelemetryToPath()



	def generate_sharding_map(
	benchmarks_to_shard, timing_data, num_shards, debug):
	"""Generate sharding map.

	Args:
	benchmarks_to_shard is a list all benchmarks to be sharded. Its
	structure is as follows:
	[{
	"name": "benchmark_1",
	"stories": [ "storyA", "storyB",...],
	"repeat": <number of pageset_repeat>
	},
	{
	"name": "benchmark_2",
	"stories": [ "storyA", "storyB",...],
	"repeat": <number of pageset_repeat>
	},
	...
	]

	The "stories" field contains a list of ordered story names. Notes that
	this should match the actual order of how the benchmark stories are
	executed for the sharding algorithm to be effective.

	"""
	# Sort the list of benchmarks to be sharded by benchmark's name to make the
	# execution of this algorithm deterministic.
	benchmarks_to_shard.sort(key=lambda entry: entry['name'])

	story_timing_list = _gather_timing_data(
	benchmarks_to_shard, timing_data, True)

	all_stories = {}
	for b in benchmarks_to_shard:
	all_stories[b['name']] = b['stories']

	total_time = sum(p[1] for p in story_timing_list)
	expected_time_per_shard = total_time/num_shards

	total_time_scheduled = 0
	sharding_map = collections.OrderedDict()
	debug_map = collections.OrderedDict()
	min_shard_time = sys.maxint
	min_shard_index = None
	max_shard_time = 0
	max_shard_index = None
	num_stories = len(story_timing_list)
	predicted_shard_timings = []

	# The algorithm below removes all the stories from \|story_timing_list\| one by
	# one and add them to the current shard until the shard's total time is
	# approximately equals to \|expected_time_per_shard\|. After that point,
	# it moves to the next shard.
	# For efficient removal of \|story_timing_list\|'s elements & to keep the
	# ordering of benchmark alphabetically sorted in the shards' assignment, we
	# reverse the \|story_timing_list\|.
	story_timing_list.reverse()
	num_stories = len(story_timing_list)
	final_shard_index = num_shards - 1
	for i in range(num_shards):
	shard_name = 'shard #%i' % i
	sharding_map[str(i)] = {'benchmarks': collections.OrderedDict()}
	debug_map[shard_name] = collections.OrderedDict()
	time_per_shard = 0
	stories_in_shard = []
	expected_total_time = expected_time_per_shard * (i + 1)
	last_diff = abs(total_time_scheduled - expected_total_time)
	# Keep adding story to the current shard until either:
	# * The absolute difference between the total time of shards so far and
	# expected total time is minimal.
	# * The shard is final shard, and there is no more stories to add.
	#
	# Note: we do check for the final shard in case due to rounding error,
	# the last_diff can be minimal even if we don't add all the stories to the
	# final shard.
	while story_timing_list:
	candidate_story, candidate_story_duration = story_timing_list[-1]
	new_diff = abs(total_time_scheduled + candidate_story_duration -
	expected_total_time)
	if new_diff < last_diff or i == final_shard_index:
	story_timing_list.pop()
	total_time_scheduled += candidate_story_duration
	time_per_shard += candidate_story_duration
	stories_in_shard.append(candidate_story)
	debug_map[shard_name][candidate_story] = candidate_story_duration
	last_diff = abs(total_time_scheduled - expected_total_time)
	_add_benchmarks_to_shard(sharding_map, i, stories_in_shard, all_stories)
	else:
	break
	# Double time_per_shard to account for reference benchmark run.
	debug_map[shard_name]['expected_total_time'] = time_per_shard * 2
	if time_per_shard > max_shard_time:
	max_shard_time = time_per_shard
	max_shard_index = i
	if time_per_shard < min_shard_time:
	min_shard_time = time_per_shard
	min_shard_index = i

	predicted_shard_timings.append((shard_name, time_per_shard * 2))

	sharding_map['extra_infos'] = collections.OrderedDict([
	('num_stories', num_stories),
	# Double all the time stats by 2 to account for reference build.
	('predicted_min_shard_time', min_shard_time * 2),
	('predicted_min_shard_index', min_shard_index),
	('predicted_max_shard_time', max_shard_time * 2),
	('predicted_max_shard_index', max_shard_index),
	])

	if debug:
	sharding_map['extra_infos'].update(debug_map)
	else:
	sharding_map['extra_infos'].update(predicted_shard_timings)
	return sharding_map


	def _add_benchmarks_to_shard(sharding_map, shard_index, stories_in_shard,
	all_stories):
	benchmarks = collections.OrderedDict()
	for story in stories_in_shard:
	(b, story) = story.split('/', 1)
	if b not in benchmarks:
	benchmarks[b] = []
	benchmarks[b].append(story)

	# Format the benchmark's stories by indices
	benchmarks_in_shard = collections.OrderedDict()
	for b in benchmarks:
	benchmarks_in_shard[b] = {}
	first_story = all_stories[b].index(benchmarks[b][0])
	last_story = all_stories[b].index(benchmarks[b][-1]) + 1
	if first_story != 0:
	benchmarks_in_shard[b]['begin'] = first_story
	if last_story != len(all_stories[b]):
	benchmarks_in_shard[b]['end'] = last_story
	sharding_map[str(shard_index)] = {'benchmarks': benchmarks_in_shard}


	def _gather_timing_data(benchmarks_to_shard, timing_data, repeat):
	story_timing_dict = {}
	benchmarks_data_by_name = {}
	for b in benchmarks_to_shard:
	story_list = b['stories']
	benchmarks_data_by_name[b['name']] = b
	# Initialize the duration of all stories to be shard to 1 * repeat.
	# The reasons are:
	# 1) Even if the stories are skipped, they still have non neligible
	# overhead.
	# 2) For a case of sharding a set of benchmarks with no existing data about
	# timing, initializing the stories time within a single repeat to 1 leads
	# to a roughly equal distribution of stories on the shards, whereas
	# initializing them to zero will make the algorithm put all the stories
	# into the first shard.
	for story in story_list:
	story_timing_dict[b['name'] + '/' + story] = b['repeat']

	for run in timing_data:
	benchmark = run['name'].split('/', 1)[0]
	if run['name'] in story_timing_dict:
	if run['duration']:
	if repeat:
	story_timing_dict[run['name']] = (float(run['duration'])
	* benchmarks_data_by_name[benchmark]['repeat'])
	else:
	story_timing_dict[run['name']] = float(run['duration'])
	story_timing_list = []
	for entry in benchmarks_to_shard:
	benchmark_name = entry['name']
	for story_name in entry['stories']:
	test_name = '%s/%s' % (benchmark_name, story_name)
	story_timing_list.append((test_name, story_timing_dict[test_name]))
	return story_timing_list


	def _generate_empty_sharding_map(num_shards):
	sharding_map = collections.OrderedDict()
	for i in range(0, num_shards):
	sharding_map[str(i)] = {'benchmarks': collections.OrderedDict()}
	return sharding_map


	def test_sharding_map(
	sharding_map, benchmarks_to_shard, test_timing_data):
	story_timing_list = _gather_timing_data(
	benchmarks_to_shard, test_timing_data, False)

	story_timing_dict = dict(story_timing_list)

	results = collections.OrderedDict()
	all_stories = {}
	for b in benchmarks_to_shard:
	all_stories[b['name']] = b['stories']

	sharding_map.pop('extra_infos', None)
	for shard in sharding_map:
	results[shard] = collections.OrderedDict()
	shard_total_time = 0
	for benchmark_name in sharding_map[shard]['benchmarks']:
	benchmark = sharding_map[shard]['benchmarks'][benchmark_name]
	begin = 0
	end = len(all_stories[benchmark_name])
	if 'begin' in benchmark:
	begin = benchmark['begin']
	if 'end' in benchmark:
	end = benchmark['end']
	benchmark_timing = 0
	for story in all_stories[benchmark_name][begin : end]:
	story_timing = story_timing_dict[benchmark_name + '/' + story]
	results[shard][benchmark_name + '/' + story] = str(story_timing)
	benchmark_timing += story_timing
	shard_total_time += benchmark_timing
	results[shard]['full_time'] = shard_total_time
	return results