content/test/gpu/find_bad_machines.py - chromium/src.git - Git at Google

 #!/usr/bin/env vpython3
 # Copyright 2024 The Chromium Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 """Script to help find bad GPU test machines that need fixing."""

 import argparse
 import logging
 from typing import Dict

 from bad_machine_finder import bigquery
 from bad_machine_finder import buganizer
 from bad_machine_finder import detection
 from bad_machine_finder import swarming
 from bad_machine_finder import tasks
 from bad_machine_finder import test_specs

 # Preset collections of mixin names, keyed by group name. The keys are offered
 # as the valid choices for --mixin-group, and the list stored under the chosen
 # key is what _GetDimensionsByMixin() looks dimensions up for.
 MIXIN_GROUPS = {
     'gpu': [
         # ChromeOS amd64-generic omitted since it is run on GCE instances.
         'chromium_pixel_2_q',
         'gpu_nvidia_shield_tv_stable',
         'gpu_pixel_4_stable',
         'gpu_pixel_6_experimental',
         'gpu_pixel_6_stable',
         'gpu_samsung_a13_stable',
         'gpu_samsung_a23_stable',
         'gpu_samsung_s23_stable',
         'gpu_samsung_s24_stable',
         'linux_amd_rx_5500_xt',
         'linux_amd_rx_7600_stable',
         'linux_intel_uhd_630_experimental',
         'linux_intel_uhd_630_stable',
         'linux_intel_uhd_770_stable',
         'linux_nvidia_gtx_1660_experimental',
         'linux_nvidia_gtx_1660_stable',
         'linux_nvidia_rtx_4070_super_stable',
         'mac_arm64_apple_m1_gpu_experimental',
         'mac_arm64_apple_m1_gpu_stable',
         'mac_arm64_apple_m2_retina_gpu_experimental',
         'mac_arm64_apple_m2_retina_gpu_stable',
         'mac_arm64_apple_m3_retina_gpu_stable',
         'mac_mini_intel_gpu_experimental',
         'mac_mini_intel_gpu_stable',
         'mac_pro_amd_gpu',
         'mac_retina_amd_gpu_experimental',
         'mac_retina_amd_gpu_stable',
         'win10_amd_rx_5500_xt_stable',
         'win10_intel_uhd_630_experimental',
         'win10_intel_uhd_630_stable',
         'win10_intel_uhd_770_stable',
         'win10_nvidia_gtx_1660_experimental',
         'win10_nvidia_gtx_1660_stable',
         'win11_amd_rx_7600_stable',
         'win11_nvidia_rtx_4070_super_experimental',
         'win11_nvidia_rtx_4070_super_stable',
         'win11_qualcomm_adreno_690_stable',
         'win11_qualcomm_snapdragon_x_elite_stable',
     ],
 }


 def ParseArgs() -> argparse.Namespace:
   """Builds the command line parser, parses the arguments and validates them.

   parse_args() is called without an explicit argument list, so the values come
   from the process's command line. After parsing, the numeric arguments are
   checked by _VerifyArgs() and the logging level is applied by
   _SetLoggingVerbosity().

   Returns:
     The parsed arguments as an argparse.Namespace.
   """
   arg_parser = argparse.ArgumentParser(
       description='Find machines that are likely contributing to test failures')

   # General options.
   arg_parser.add_argument(
       '--sample-period',
       type=int,
       default=7,
       help='The number of days to sample data from')
   arg_parser.add_argument(
       '--billing-project',
       default='chrome-unexpected-pass-data',
       help='The billing project to use for queries')
   arg_parser.add_argument(
       '-v',
       '--verbose',
       dest='verbose_count',
       action='count',
       default=0,
       help=('Increase logging verbosity, can be passed '
             'multiple times.'))
   arg_parser.add_argument(
       '-q',
       '--quiet',
       action='store_true',
       default=False,
       help='Disable logging for non-errors.')
   arg_parser.add_argument(
       '--minimum-detection-method-count',
       type=int,
       default=2,
       help=('The minimum number of detection methods that need '
             'to flag a machine as bad in order for it to be '
             'reported.'))
   # Does not work locally due to auth issues reported in crbug.com/361488152.
   arg_parser.add_argument(
       '--bug-id',
       type=int,
       default=0,
       help=('A Buganizer bug ID. If specified, the bug will be '
             'updated with the script results. DOES NOT '
             'CURRENTLY WORK LOCALLY.'))
   arg_parser.add_argument(
       '--report-grace-period',
       type=int,
       default=7,
       help=('The number of days to wait before reporting the '
             'same bot to the bug again'))

   # Knobs for the individual detection methods, shown as their own section in
   # the --help output.
   modifiers = arg_parser.add_argument_group(
       title='Detection Method Modifiers',
       description=('Arguments that modify the behavior of individual detection '
                    'methods'))
   modifiers.add_argument(
       '--stddev-multiplier',
       type=float,
       default=3,
       help=('Used with the stddev outlier detection method. Sets how many '
             "standard deviations a bot's failure rate has to be over the "
             'fleet-wide mean for it to be considered bad.'))
   modifiers.add_argument(
       '--random-chance-probability-threshold',
       type=float,
       default=0.0001,
       help=('Used with the random chance detection method. Sets how unlikely '
             'it has to be that a bot randomly got at least as many failures as '
             'it did in order for it to be considered bad.'))
   modifiers.add_argument(
       '--iqr-multiplier',
       type=float,
       default=3,
       help=('How many interquartile ranges a failure rate must be above the '
             'third quartile for it to be considered an outlier.'))
   modifiers.add_argument(
       '--minimum-failed-tasks',
       type=int,
       default=5,
       help=('Used with the stddev outlier and iqr detection methods. Bots '
             'that have fewer than this number of failed tasks within the '
             'sample period will not be reported. This helps avoid false '
             'reports due to getting a small number of flakes in a small number '
             'of total tasks.'))

   # Exactly one way of choosing mixins must be given: explicit names or a
   # preset group.
   mixin_source = arg_parser.add_mutually_exclusive_group(required=True)
   mixin_source.add_argument(
       '--mixin',
       action='append',
       dest='mixins',
       help=('The name of the mixin to get data for. Can '
             'be specified multiple times.'))
   mixin_source.add_argument(
       '--mixin-group',
       choices=sorted(MIXIN_GROUPS),
       help='A preset group of mixins to check.')

   parsed_args = arg_parser.parse_args()
   _VerifyArgs(arg_parser, parsed_args)
   _SetLoggingVerbosity(parsed_args)
   return parsed_args


 def _VerifyArgs(parser: argparse.ArgumentParser,
                 args: argparse.Namespace) -> None:
   """Rejects numeric argument values that are outside their valid range.

   Args:
     parser: The parser that produced |args|. Its error() method is called with
         a message naming the offending flag when a value is invalid.
     args: The parsed arguments to check.
   """
   if args.sample_period <= 0:
     parser.error('--sample-period must be greater than 0')
   if args.minimum_detection_method_count <= 0:
     parser.error('--minimum-detection-method-count must be greater than 0')
   if args.bug_id < 0:
     parser.error('--bug-id must be non-negative')
   if args.report_grace_period < 0:
     parser.error('--report-grace-period must be non-negative')
   if args.minimum_failed_tasks < 0:
     parser.error('--minimum-failed-tasks must be non-negative')
   # The detection modifiers are floats. The checks are phrased as "not within
   # the valid range" so that NaN, which fails every comparison, is rejected
   # as well.
   if not args.stddev_multiplier >= 0:
     parser.error('--stddev-multiplier must be non-negative')
   if not 0 <= args.random_chance_probability_threshold <= 1:
     parser.error(
         '--random-chance-probability-threshold must be between 0 and 1')
   if not args.iqr_multiplier >= 0:
     parser.error('--iqr-multiplier must be non-negative')


 def _SetLoggingVerbosity(args: argparse.Namespace) -> None:
   """Sets the root logger's level based on the verbosity arguments.

   Args:
     args: The parsed arguments. |quiet| and |verbose_count| are read, and
         |verbose_count| is overwritten with -1 when |quiet| is set.
   """
   # --quiet is folded into the count as -1, so it wins over any -v flags.
   if args.quiet:
     args.verbose_count = -1

   level_by_count = {
       -1: logging.ERROR,
       0: logging.WARNING,
       1: logging.INFO,
   }
   # Any count not listed above maps to the most verbose level.
   chosen_level = level_by_count.get(args.verbose_count, logging.DEBUG)
   root_logger = logging.getLogger()
   root_logger.setLevel(chosen_level)


 def _GetDimensionsByMixin(
     args: argparse.Namespace) -> Dict[str, test_specs.DimensionSet]:
   """Maps each requested mixin name to its dimensions.

   Args:
     args: The parsed arguments. If |mixin_group| is set, the mixin names come
         from the matching MIXIN_GROUPS entry; otherwise |mixins| is used.

   Returns:
     A dict from mixin name to the value returned by
     test_specs.GetMixinDimensions() for that name.
   """
   selected_mixins = (MIXIN_GROUPS[args.mixin_group]
                      if args.mixin_group else args.mixins)

   dimensions = {}
   for name in selected_mixins:
     dimensions[name] = test_specs.GetMixinDimensions(name)
   return dimensions


 def _AnalyzeMixin(mixin_stats: tasks.MixinStats, mixin_name: str,
                   args: argparse.Namespace) -> detection.BadMachineList:
   """Runs each detection method on one mixin's stats and merges the results.

   Args:
     mixin_stats: The task stats for the mixin being analyzed.
     mixin_name: The name of the mixin; only passed to the interquartile range
         detection method.
     args: The parsed arguments supplying the detection method modifiers.

   Returns:
     A detection.BadMachineList with the result of every detection method
     merged into it.
   """
   combined = detection.BadMachineList()

   stddev_result = detection.DetectViaStdDevOutlier(
       mixin_stats, args.stddev_multiplier, args.minimum_failed_tasks)
   combined.Merge(stddev_result)

   random_chance_result = detection.DetectViaRandomChance(
       mixin_stats, args.random_chance_probability_threshold)
   combined.Merge(random_chance_result)

   iqr_result = detection.DetectViaInterquartileRange(
       mixin_stats, mixin_name, args.iqr_multiplier, args.minimum_failed_tasks)
   combined.Merge(iqr_result)

   return combined


 def main() -> None:
   """Finds likely-bad machines, prints a report and optionally updates a bug.

   Task stats are fetched for every requested mixin, each mixin is analyzed,
   and machines flagged by fewer than --minimum-detection-method-count methods
   are dropped. The generated markdown (or a fixed message when it is empty) is
   printed, and the bug is updated when a non-zero --bug-id was given.
   """
   args = ParseArgs()
   dimensions_by_mixin = _GetDimensionsByMixin(args)
   querier = bigquery.Querier(args.billing_project)
   stats_by_mixin = swarming.GetTaskStatsForMixins(querier, dimensions_by_mixin,
                                                   args.sample_period)

   grouped_bad_machines = detection.MixinGroupedBadMachines()
   for mixin_name, mixin_stats in stats_by_mixin.items():
     candidates = _AnalyzeMixin(mixin_stats, mixin_name, args)
     candidates.RemoveLowConfidenceMachines(args.minimum_detection_method_count)
     # Mixins with nothing left after filtering are not added to the report.
     if candidates.bad_machines:
       grouped_bad_machines.AddMixinData(mixin_name, candidates)

   report = grouped_bad_machines.GenerateMarkdown()
   print(report or 'No bad machines detected')

   # The default bug ID of 0 is falsy, so this only runs when one was given.
   if args.bug_id:
     buganizer.UpdateBug(args.bug_id, grouped_bad_machines,
                         args.report_grace_period)

 # Run the tool only when this file is executed as a script, not on import.
 if __name__ == '__main__':
   main()
	#!/usr/bin/env vpython3
	# Copyright 2024 The Chromium Authors
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.
	"""Script to help find bad GPU test machines that need fixing."""

	import argparse
	import logging
	from typing import Dict

	from bad_machine_finder import bigquery
	from bad_machine_finder import buganizer
	from bad_machine_finder import detection
	from bad_machine_finder import swarming
	from bad_machine_finder import tasks
	from bad_machine_finder import test_specs

	# Preset collections of mixin names, keyed by group name. The keys are offered
	# as the valid choices for --mixin-group, and the list stored under the chosen
	# key is what _GetDimensionsByMixin() looks dimensions up for.
	MIXIN_GROUPS = {
	'gpu': [
	# ChromeOS amd64-generic omitted since it is run on GCE instances.
	'chromium_pixel_2_q',
	'gpu_nvidia_shield_tv_stable',
	'gpu_pixel_4_stable',
	'gpu_pixel_6_experimental',
	'gpu_pixel_6_stable',
	'gpu_samsung_a13_stable',
	'gpu_samsung_a23_stable',
	'gpu_samsung_s23_stable',
	'gpu_samsung_s24_stable',
	'linux_amd_rx_5500_xt',
	'linux_amd_rx_7600_stable',
	'linux_intel_uhd_630_experimental',
	'linux_intel_uhd_630_stable',
	'linux_intel_uhd_770_stable',
	'linux_nvidia_gtx_1660_experimental',
	'linux_nvidia_gtx_1660_stable',
	'linux_nvidia_rtx_4070_super_stable',
	'mac_arm64_apple_m1_gpu_experimental',
	'mac_arm64_apple_m1_gpu_stable',
	'mac_arm64_apple_m2_retina_gpu_experimental',
	'mac_arm64_apple_m2_retina_gpu_stable',
	'mac_arm64_apple_m3_retina_gpu_stable',
	'mac_mini_intel_gpu_experimental',
	'mac_mini_intel_gpu_stable',
	'mac_pro_amd_gpu',
	'mac_retina_amd_gpu_experimental',
	'mac_retina_amd_gpu_stable',
	'win10_amd_rx_5500_xt_stable',
	'win10_intel_uhd_630_experimental',
	'win10_intel_uhd_630_stable',
	'win10_intel_uhd_770_stable',
	'win10_nvidia_gtx_1660_experimental',
	'win10_nvidia_gtx_1660_stable',
	'win11_amd_rx_7600_stable',
	'win11_nvidia_rtx_4070_super_experimental',
	'win11_nvidia_rtx_4070_super_stable',
	'win11_qualcomm_adreno_690_stable',
	'win11_qualcomm_snapdragon_x_elite_stable',
	],
	}


	def ParseArgs() -> argparse.Namespace:
	  """Builds the command line parser, parses the arguments and validates them.

	  parse_args() is called without an explicit argument list, so the values come
	  from the process's command line. After parsing, the numeric arguments are
	  checked by _VerifyArgs() and the logging level is applied by
	  _SetLoggingVerbosity().

	  Returns:
	    The parsed arguments as an argparse.Namespace.
	  """
	  parser = argparse.ArgumentParser(
	      description='Find machines that are likely contributing to test failures')
	  parser.add_argument('--sample-period',
	                      type=int,
	                      default=7,
	                      help='The number of days to sample data from')
	  parser.add_argument('--billing-project',
	                      default='chrome-unexpected-pass-data',
	                      help='The billing project to use for queries')
	  parser.add_argument('-v',
	                      '--verbose',
	                      dest='verbose_count',
	                      action='count',
	                      default=0,
	                      help=('Increase logging verbosity, can be passed '
	                            'multiple times.'))
	  parser.add_argument('-q',
	                      '--quiet',
	                      action='store_true',
	                      default=False,
	                      help='Disable logging for non-errors.')
	  parser.add_argument('--minimum-detection-method-count',
	                      type=int,
	                      default=2,
	                      help=('The minimum number of detection methods that need '
	                            'to flag a machine as bad in order for it to be '
	                            'reported.'))
	  # Does not work locally due to auth issues reported in crbug.com/361488152.
	  parser.add_argument('--bug-id',
	                      type=int,
	                      default=0,
	                      help=('A Buganizer bug ID. If specified, the bug will be '
	                            'updated with the script results. DOES NOT '
	                            'CURRENTLY WORK LOCALLY.'))
	  parser.add_argument('--report-grace-period',
	                      type=int,
	                      default=7,
	                      help=('The number of days to wait before reporting the '
	                            'same bot to the bug again'))

	  # Knobs for the individual detection methods, shown as their own section in
	  # the --help output.
	  detection_modifiers = parser.add_argument_group(
	      title='Detection Method Modifiers',
	      description=('Arguments that modify the behavior of individual detection '
	                   'methods'))
	  detection_modifiers.add_argument(
	      '--stddev-multiplier',
	      type=float,
	      default=3,
	      help=('Used with the stddev outlier detection method. Sets how many '
	            "standard deviations a bot's failure rate has to be over the "
	            'fleet-wide mean for it to be considered bad.'))
	  detection_modifiers.add_argument(
	      '--random-chance-probability-threshold',
	      type=float,
	      default=0.0001,
	      help=('Used with the random chance detection method. Sets how unlikely '
	            'it has to be that a bot randomly got at least as many failures as '
	            'it did in order for it to be considered bad.'))
	  detection_modifiers.add_argument(
	      '--iqr-multiplier',
	      type=float,
	      default=3,
	      help=('How many interquartile ranges a failure rate must be above the '
	            'third quartile for it to be considered an outlier.'))
	  detection_modifiers.add_argument(
	      '--minimum-failed-tasks',
	      type=int,
	      default=5,
	      help=('Used with the stddev outlier and iqr detection methods. Bots '
	            'that have fewer than this number of failed tasks within the '
	            'sample period will not be reported. This helps avoid false '
	            'reports due to getting a small number of flakes in a small number '
	            'of total tasks.'))

	  # Exactly one way of choosing mixins must be given: explicit names or a
	  # preset group.
	  mixin_group = parser.add_mutually_exclusive_group(required=True)
	  mixin_group.add_argument('--mixin',
	                           action='append',
	                           dest='mixins',
	                           help=('The name of the mixin to get data for. Can '
	                                 'be specified multiple times.'))
	  mixin_group.add_argument('--mixin-group',
	                           choices=sorted(MIXIN_GROUPS),
	                           help='A preset group of mixins to check.')

	  args = parser.parse_args()

	  _VerifyArgs(parser, args)
	  _SetLoggingVerbosity(args)

	  return args


	def _VerifyArgs(parser: argparse.ArgumentParser,
	                args: argparse.Namespace) -> None:
	  """Rejects numeric argument values that are outside their valid range.

	  Args:
	    parser: The parser that produced |args|. Its error() method is called with
	        a message naming the offending flag when a value is invalid.
	    args: The parsed arguments to check.
	  """
	  if args.sample_period <= 0:
	    parser.error('--sample-period must be greater than 0')
	  if args.minimum_detection_method_count <= 0:
	    parser.error('--minimum-detection-method-count must be greater than 0')
	  if args.bug_id < 0:
	    parser.error('--bug-id must be non-negative')
	  if args.report_grace_period < 0:
	    parser.error('--report-grace-period must be non-negative')
	  if args.minimum_failed_tasks < 0:
	    parser.error('--minimum-failed-tasks must be non-negative')
	  # The detection modifiers are floats. The checks are phrased as "not within
	  # the valid range" so that NaN, which fails every comparison, is rejected
	  # as well.
	  if not args.stddev_multiplier >= 0:
	    parser.error('--stddev-multiplier must be non-negative')
	  if not 0 <= args.random_chance_probability_threshold <= 1:
	    parser.error(
	        '--random-chance-probability-threshold must be between 0 and 1')
	  if not args.iqr_multiplier >= 0:
	    parser.error('--iqr-multiplier must be non-negative')


	def _SetLoggingVerbosity(args: argparse.Namespace) -> None:
	  """Sets the root logger's level based on the verbosity arguments.

	  Args:
	    args: The parsed arguments. |quiet| and |verbose_count| are read, and
	        |verbose_count| is overwritten with -1 when |quiet| is set.
	  """
	  # --quiet is folded into the count as -1, so it wins over any -v flags.
	  if args.quiet:
	    args.verbose_count = -1
	  if args.verbose_count == -1:
	    level = logging.ERROR
	  elif args.verbose_count == 0:
	    level = logging.WARNING
	  elif args.verbose_count == 1:
	    level = logging.INFO
	  else:
	    level = logging.DEBUG
	  logging.getLogger().setLevel(level)


	def _GetDimensionsByMixin(
	    args: argparse.Namespace) -> Dict[str, test_specs.DimensionSet]:
	  """Maps each requested mixin name to its dimensions.

	  Args:
	    args: The parsed arguments. If |mixin_group| is set, the mixin names come
	        from the matching MIXIN_GROUPS entry; otherwise |mixins| is used.

	  Returns:
	    A dict from mixin name to the value returned by
	    test_specs.GetMixinDimensions() for that name.
	  """
	  if args.mixin_group:
	    mixins = MIXIN_GROUPS[args.mixin_group]
	  else:
	    mixins = args.mixins

	  dimensions_by_mixin = {
	      mixin_name: test_specs.GetMixinDimensions(mixin_name)
	      for mixin_name in mixins
	  }

	  return dimensions_by_mixin


	def _AnalyzeMixin(mixin_stats: tasks.MixinStats, mixin_name: str,
	                  args: argparse.Namespace) -> detection.BadMachineList:
	  """Runs each detection method on one mixin's stats and merges the results.

	  Args:
	    mixin_stats: The task stats for the mixin being analyzed.
	    mixin_name: The name of the mixin; only passed to the interquartile range
	        detection method.
	    args: The parsed arguments supplying the detection method modifiers.

	  Returns:
	    A detection.BadMachineList with the result of every detection method
	    merged into it.
	  """
	  bad_machine_list = detection.BadMachineList()
	  bad_machine_list.Merge(
	      detection.DetectViaStdDevOutlier(mixin_stats, args.stddev_multiplier,
	                                       args.minimum_failed_tasks))
	  bad_machine_list.Merge(
	      detection.DetectViaRandomChance(mixin_stats,
	                                      args.random_chance_probability_threshold))
	  bad_machine_list.Merge(
	      detection.DetectViaInterquartileRange(mixin_stats, mixin_name,
	                                            args.iqr_multiplier,
	                                            args.minimum_failed_tasks))
	  return bad_machine_list


	def main() -> None:
	  """Finds likely-bad machines, prints a report and optionally updates a bug.

	  Task stats are fetched for every requested mixin, each mixin is analyzed,
	  and machines flagged by fewer than --minimum-detection-method-count methods
	  are dropped. The generated markdown (or a fixed message when it is empty) is
	  printed, and the bug is updated when a non-zero --bug-id was given.
	  """
	  args = ParseArgs()
	  dimensions_by_mixin = _GetDimensionsByMixin(args)
	  querier = bigquery.Querier(args.billing_project)
	  task_stats = swarming.GetTaskStatsForMixins(querier, dimensions_by_mixin,
	                                              args.sample_period)

	  mixin_grouped_bad_machines = detection.MixinGroupedBadMachines()

	  for mixin_name, mixin_stats in task_stats.items():
	    bad_machine_list = _AnalyzeMixin(mixin_stats, mixin_name, args)
	    bad_machine_list.RemoveLowConfidenceMachines(
	        args.minimum_detection_method_count)
	    # Mixins with nothing left after filtering are not added to the report.
	    if not bad_machine_list.bad_machines:
	      continue
	    mixin_grouped_bad_machines.AddMixinData(mixin_name, bad_machine_list)

	  markdown = mixin_grouped_bad_machines.GenerateMarkdown()
	  if not markdown:
	    print('No bad machines detected')
	  else:
	    print(markdown)

	  # The default bug ID of 0 is falsy, so this only runs when one was given.
	  if args.bug_id:
	    buganizer.UpdateBug(args.bug_id, mixin_grouped_bad_machines,
	                        args.report_grace_period)


	# Run the tool only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	  main()