blob: 3412cf802c145b5600713228f86a1535b19f68e0 [file] [log] [blame]
#!/usr/bin/env vpython3
# Copyright 2024 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Script to help find bad GPU test machines that need fixing."""
import argparse
import logging
from typing import Dict
from bad_machine_finder import bigquery
from bad_machine_finder import buganizer
from bad_machine_finder import detection
from bad_machine_finder import swarming
from bad_machine_finder import tasks
from bad_machine_finder import test_specs
# Preset groups of mixins. The keys are the accepted values for --mixin-group
# and each value is the list of mixin names that gets checked when that group
# is selected.
MIXIN_GROUPS = {
    'gpu': [
        # ChromeOS amd64-generic omitted since it is run on GCE instances.
        'chromium_pixel_2_q',
        'gpu_nvidia_shield_tv_stable',
        'gpu_pixel_4_stable',
        'gpu_pixel_6_experimental',
        'gpu_pixel_6_stable',
        'gpu_samsung_a13_stable',
        'gpu_samsung_a23_stable',
        'gpu_samsung_s23_stable',
        'gpu_samsung_s24_stable',
        'linux_amd_rx_5500_xt',
        'linux_amd_rx_7600_stable',
        'linux_intel_uhd_630_experimental',
        'linux_intel_uhd_630_stable',
        'linux_intel_uhd_770_stable',
        'linux_nvidia_gtx_1660_experimental',
        'linux_nvidia_gtx_1660_stable',
        'linux_nvidia_rtx_4070_super_stable',
        'mac_arm64_apple_m1_gpu_experimental',
        'mac_arm64_apple_m1_gpu_stable',
        'mac_arm64_apple_m2_retina_gpu_experimental',
        'mac_arm64_apple_m2_retina_gpu_stable',
        'mac_arm64_apple_m3_retina_gpu_stable',
        'mac_mini_intel_gpu_experimental',
        'mac_mini_intel_gpu_stable',
        'mac_pro_amd_gpu',
        'mac_retina_amd_gpu_experimental',
        'mac_retina_amd_gpu_stable',
        'win10_amd_rx_5500_xt_stable',
        'win10_intel_uhd_630_experimental',
        'win10_intel_uhd_630_stable',
        'win10_intel_uhd_770_stable',
        'win10_nvidia_gtx_1660_experimental',
        'win10_nvidia_gtx_1660_stable',
        'win11_amd_rx_7600_stable',
        'win11_nvidia_rtx_4070_super_experimental',
        'win11_nvidia_rtx_4070_super_stable',
        'win11_qualcomm_adreno_690_stable',
        'win11_qualcomm_snapdragon_x_elite_stable',
    ],
}
def ParseArgs() -> argparse.Namespace:
  """Parses and validates the script's command line arguments.

  Once parsed, the arguments are checked by _VerifyArgs and then handed to
  _SetLoggingVerbosity, which sets the root logger's level.

  Returns:
    The parsed arguments.
  """
  parser = argparse.ArgumentParser(
      description='Find machines that are likely contributing to test failures')
  _AddGeneralArguments(parser)
  _AddDetectionModifierArguments(parser)
  _AddMixinArguments(parser)

  parsed_args = parser.parse_args()
  _VerifyArgs(parser, parsed_args)
  _SetLoggingVerbosity(parsed_args)
  return parsed_args


def _AddGeneralArguments(parser: argparse.ArgumentParser) -> None:
  """Registers the general, ungrouped options on |parser|."""
  parser.add_argument(
      '--sample-period',
      type=int,
      default=7,
      help='The number of days to sample data from')
  parser.add_argument(
      '--billing-project',
      default='chrome-unexpected-pass-data',
      help='The billing project to use for queries')
  parser.add_argument(
      '-v',
      '--verbose',
      action='count',
      dest='verbose_count',
      default=0,
      help=('Increase logging verbosity, can be passed '
            'multiple times.'))
  parser.add_argument(
      '-q',
      '--quiet',
      action='store_true',
      default=False,
      help='Disable logging for non-errors.')
  parser.add_argument(
      '--minimum-detection-method-count',
      type=int,
      default=2,
      help=('The minimum number of detection methods that need '
            'to flag a machine as bad in order for it to be '
            'reported.'))
  # Does not work locally due to auth issues reported in crbug.com/361488152.
  parser.add_argument(
      '--bug-id',
      type=int,
      default=0,
      help=('A Buganizer bug ID. If specified, the bug will be '
            'updated with the script results. DOES NOT '
            'CURRENTLY WORK LOCALLY.'))
  parser.add_argument(
      '--report-grace-period',
      type=int,
      default=7,
      help=('The number of days to wait before reporting the '
            'same bot to the bug again'))


def _AddDetectionModifierArguments(parser: argparse.ArgumentParser) -> None:
  """Registers the options that tune individual detection methods."""
  modifier_group = parser.add_argument_group(
      title='Detection Method Modifiers',
      description=('Arguments that modify the behavior of individual detection '
                   'methods'))
  modifier_group.add_argument(
      '--stddev-multiplier',
      type=float,
      default=3,
      help=('Used with the stddev outlier detection method. Sets how many '
            "standard deviations a bot's failure rate has to be over the "
            'fleet-wide mean for it to be considered bad.'))
  modifier_group.add_argument(
      '--random-chance-probability-threshold',
      type=float,
      default=0.0001,
      help=('Used with the random chance detection method. Sets how unlikely '
            'it has to be that a bot randomly got at least as many failures as '
            'it did in order for it to be considered bad.'))
  modifier_group.add_argument(
      '--iqr-multiplier',
      type=float,
      default=3,
      help=('How many interquartile ranges a failure rate must be above the '
            'third quartile for it to be considered an outlier.'))
  modifier_group.add_argument(
      '--minimum-failed-tasks',
      type=int,
      default=5,
      help=('Used with the stddev outlier and iqr detection methods. Bots '
            'that have fewer than this number of failed tasks within the '
            'sample period will not be reported. This helps avoid false '
            'reports due to getting a small number of flakes in a small number '
            'of total tasks.'))


def _AddMixinArguments(parser: argparse.ArgumentParser) -> None:
  """Registers the required, mutually exclusive mixin selection options."""
  mixin_source = parser.add_mutually_exclusive_group(required=True)
  mixin_source.add_argument(
      '--mixin',
      action='append',
      dest='mixins',
      help=('The name of the mixin to get data for. Can '
            'be specified multiple times.'))
  mixin_source.add_argument(
      '--mixin-group',
      choices=sorted(MIXIN_GROUPS),
      help='A preset group of mixins to check.')
def _VerifyArgs(parser: argparse.ArgumentParser,
args: argparse.Namespace) -> None:
if args.sample_period <= 0:
parser.error('--sample-period must be greater than 0')
if args.minimum_detection_method_count <= 0:
parser.error('--minimum-detection-method-count must be greater than 0')
if args.bug_id < 0:
parser.error('--bug-id must be non-negative')
if args.report_grace_period < 0:
parser.error('--report-grace-period must be non-negative')
if args.minimum_failed_tasks < 0:
parser.error('--minimum-failed-tasks must be non-negative')
def _SetLoggingVerbosity(args: argparse.Namespace) -> None:
if args.quiet:
args.verbose_count = -1
if args.verbose_count == -1:
level = logging.ERROR
elif args.verbose_count == 0:
level = logging.WARNING
elif args.verbose_count == 1:
level = logging.INFO
else:
level = logging.DEBUG
logging.getLogger().setLevel(level)
def _GetDimensionsByMixin(
    args: argparse.Namespace) -> Dict[str, test_specs.DimensionSet]:
  """Looks up the dimensions for every mixin selected on the command line.

  Args:
    args: The parsed arguments. If |args.mixin_group| is set, the mixins come
        from the matching MIXIN_GROUPS entry; otherwise |args.mixins| is used.

  Returns:
    A dict mapping each selected mixin name to the value returned by
    test_specs.GetMixinDimensions() for that name.
  """
  selected_mixins = (MIXIN_GROUPS[args.mixin_group]
                     if args.mixin_group else args.mixins)
  return {
      name: test_specs.GetMixinDimensions(name)
      for name in selected_mixins
  }
def _AnalyzeMixin(mixin_stats: tasks.MixinStats, mixin_name: str,
                  args: argparse.Namespace) -> detection.BadMachineList:
  """Runs every detection method on one mixin and merges the results.

  Args:
    mixin_stats: The task stats for the mixin being analyzed.
    mixin_name: The name of the mixin being analyzed.
    args: The parsed arguments, which supply the detection method modifiers.

  Returns:
    A detection.BadMachineList containing the merged output of the stddev
    outlier, random chance, and interquartile range detection methods.
  """
  merged = detection.BadMachineList()

  stddev_result = detection.DetectViaStdDevOutlier(
      mixin_stats, args.stddev_multiplier, args.minimum_failed_tasks)
  merged.Merge(stddev_result)

  random_chance_result = detection.DetectViaRandomChance(
      mixin_stats, args.random_chance_probability_threshold)
  merged.Merge(random_chance_result)

  iqr_result = detection.DetectViaInterquartileRange(
      mixin_stats, mixin_name, args.iqr_multiplier, args.minimum_failed_tasks)
  merged.Merge(iqr_result)

  return merged
def main() -> None:
  """Finds bad machines for the requested mixins and reports them.

  Task stats for the sample period are fetched for each selected mixin and
  analyzed; machines flagged by too few detection methods are dropped. The
  result is printed as Markdown, and the Buganizer bug is updated as well when
  --bug-id is given.
  """
  args = ParseArgs()
  querier = bigquery.Querier(args.billing_project)
  task_stats = swarming.GetTaskStatsForMixins(querier,
                                              _GetDimensionsByMixin(args),
                                              args.sample_period)

  grouped_bad_machines = detection.MixinGroupedBadMachines()
  for mixin_name, mixin_stats in task_stats.items():
    flagged = _AnalyzeMixin(mixin_stats, mixin_name, args)
    flagged.RemoveLowConfidenceMachines(args.minimum_detection_method_count)
    # Mixins with nothing left to report are not added to the grouped data.
    if flagged.bad_machines:
      grouped_bad_machines.AddMixinData(mixin_name, flagged)

  markdown = grouped_bad_machines.GenerateMarkdown()
  print(markdown or 'No bad machines detected')

  if args.bug_id:
    buganizer.UpdateBug(args.bug_id, grouped_bad_machines,
                        args.report_grace_period)


if __name__ == '__main__':
  main()