| #!/usr/bin/env python |
| # Copyright 2018 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| """Custom swarming triggering script. |
| |
| This script does custom swarming triggering logic, to allow one bot to |
| conceptually span multiple Swarming configurations, while lumping all trigger |
| calls under one logical step. |
| |
| The reason this script is needed is to allow seamless upgrades of the GPU, OS |
| version, or graphics driver. Most Chromium tests, GPU tests in particular, are |
| triggered with precise values for all of these Swarming dimensions. This ensures |
| that if a machine is added to the Swarming pool with a slightly different |
| configuration, tests don't fail for unexpected reasons. |
| |
| During an upgrade of the fleet, it's not feasible to take half of the machines |
| offline. Some experience was gained with this during a recent upgrade of the |
| GPUs in Chromium's main Windows and Linux NVIDIA bots. In the middle of the |
| upgrade, only 50% of the capacity was available, and CQ jobs started to time |
| out. Once the hurdle had been passed in the middle of the upgrade, capacity was |
| sufficient, but it's crucial that this process remain seamless. |
| |
| This script receives multiple machine configurations on the command line in the |
| form of quoted strings. These strings are JSON dictionaries that represent |
| entries in the "dimensions" array of the "swarming" dictionary in the |
| src/testing/buildbot JSON files. The script queries the Swarming pool for the |
| number of machines of each configuration, and distributes work (shards) among |
| them using the following algorithm: |
| |
| 1. If either configuration has machines available (online, not busy at the time |
| of the query) then distribute shards to them first. |
| |
| 2. Compute the relative fractions of all of the live (online, not quarantined, |
| not dead) machines of all configurations. |
| |
| 3. Distribute the remaining shards probabilistically among these configurations. |
| |
| The use of random numbers attempts to avoid the pathology where one |
| configuration only has a couple of machines, and work is never distributed to it |
| once all machines are busy. |
| |
| This script must have roughly the same command line interface as swarming.py |
| trigger. It modifies it in the following ways: |
| * Intercepts the dump-json argument, and creates its own by combining the |
| results from each trigger call. |
| * Scans through the multiple-trigger-configs dictionaries. For any key found, |
| deletes that dimension from the originally triggered task's dimensions. This |
| is what allows the Swarming dimensions to be replaced. |
| * On a per-shard basis, adds the Swarming dimensions chosen from the |
| multiple-trigger-configs list to the dimensions for the shard. |
| |
| This script is normally called from the swarming recipe module in tools/build. |
| |
| """ |
| |
| import argparse |
| import copy |
| import json |
| import os |
| import random |
| import subprocess |
| import sys |
| import tempfile |
| import urllib |
| |
| import base_test_triggerer |
| |
| |
| class MultiDimensionTestTriggerer(base_test_triggerer.BaseTestTriggerer): |
| def __init__(self): |
| super(MultiDimensionTestTriggerer, self).__init__() |
| |
| def choose_random_int(self, max_num): |
| return random.randint(1, max_num) |
| |
| def pick_bot_configuration(self, verbose): |
| # These are the rules used: |
| # 1. If any configuration has bots available, pick the configuration with |
| # the most bots available. |
| # 2. If no configuration has bots available, pick a random configuration |
| # based on the total number of bots in each configuration. |
| # |
| # This method updates bot_statuses_ in case (1), and in both cases, returns |
| # the index into bot_configs_ that should be used. |
| if any(status['available'] > 0 for status in self._bot_statuses): |
| # Case 1. |
| max_index = 0 |
| max_val = self._bot_statuses[0]['available'] |
| for i in xrange(1, len(self._bot_statuses)): |
| avail = self._bot_statuses[i]['available'] |
| if avail > max_val: |
| max_index = i |
| max_val = avail |
| self._bot_statuses[max_index]['available'] -= 1 |
| assert self._bot_statuses[max_index]['available'] >= 0 |
| if verbose: |
| print 'Chose bot config %d because bots were available' % (max_index) |
| return max_index |
| # Case 2. |
| # We want to choose a bot uniformly at random from all of the bots specified |
| # in the bot configs. To do this, we conceptually group the bots into |
| # buckets, pick a random number between 1 and the total number of bots, and |
| # figure out which bucket of bots it landed in. |
| r = self.choose_random_int(self._total_bots) |
| for i, status in enumerate(self._bot_statuses): |
| if r <= status['total']: |
| if verbose: |
| print 'Chose bot config %d stochastically' % (i) |
| return i |
| r -= status['total'] |
| raise Exception('Should not reach here') |
| |
| def select_config_indices(self, args, verbose): |
| selected_indices = [] |
| for _ in xrange(args.shards): |
| selected_indices.append(self.pick_bot_configuration(verbose)) |
| return selected_indices |
| |
| def prune_test_specific_configs(self, args, verbose): |
| self.query_swarming_for_bot_configs(verbose) |
| # This script doesn't know how long individual test shards take to |
| # run, nor how many Swarming jobs are waiting to run on a |
| # particular configuration. It can end up scheduling jobs on |
| # configurations that have very few machines, and backing them up |
| # to the point where the tasks start expiring. To try to prevent |
| # this, don't schedule jobs at all on configurations that have |
| # less than 10% of the total capacity. crbug.com/886985 |
| MIN_CONFIG_CAPACITY_PERCENTAGE = 0.1 |
| filtered_bot_configs = [] |
| filtered_bot_statuses = [] |
| for i in xrange(len(self._bot_configs)): |
| config = self._bot_configs[i] |
| status = self._bot_statuses[i] |
| if status['total'] >= MIN_CONFIG_CAPACITY_PERCENTAGE * self._total_bots: |
| filtered_bot_configs.append(config) |
| filtered_bot_statuses.append(status) |
| else: |
| if verbose: |
| print 'Filtered config because it had too few bots: %s' % str(status) |
| if len(filtered_bot_configs) == 0: |
| raise Exception('The bot configurations are too fragmented; no single ' + |
| 'configuration has even 10% of the total capacity. ' + |
| 'Distribution will not work well. Failing.') |
| self._bot_configs = filtered_bot_configs |
| self._bot_statuses = filtered_bot_statuses |
| self._total_bots = sum(x['total'] for x in self._bot_statuses) |
| if verbose: |
| print 'Total bots after filtering: %d' % (self._total_bots) |
| |
| def main(): |
| # setup args for common contract of base class |
| parser = base_test_triggerer.BaseTestTriggerer.setup_parser_contract( |
| argparse.ArgumentParser(description=__doc__)) |
| args, remaining = parser.parse_known_args() |
| |
| triggerer = MultiDimensionTestTriggerer() |
| return triggerer.trigger_tasks(args, remaining) |
| |
| |
| if __name__ == '__main__': |
| sys.exit(main()) |