exonerator/innocent_cls_precq_flake.py - chromiumos/infra/cl_exonerator - Git at Google

 # Copyright 2018 The Chromium OS Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Returns a sequence of new innocent CLs which failed pre-cq due to flake.

 Queries CIDB for "natural" sanity Pre-CQ build failures - namely, CL
 patchsets which failed a pre-CQ config and then later passed, implying infra
 or test flake.
 """

 from __future__ import print_function
 from __future__ import absolute_import
 from __future__ import division

 import pandas

 from chromite.lib import clactions


 GerritPatchTuple = clactions.GerritPatchTuple

 # TODO(phobbs) Calculate this prior in a systematic way. These are made-up
 # numbers for now just to get pre-CQ flake exonerator working half-decently.
 # We can derive better numbers using either empirical bayes, by measuring the
 # historical rates of flake for weeks (say within the last year) that are above
 # some threshold of runs, or by fitting a hierarchical model using a
 # probabilistic programming library such as Edward (http://edwardlib.org) using
 # an uninformative prior.
 _FLAKE_PRIOR_CONCENTRATION_0 = 49.0
 _FLAKE_PRIOR_CONCENTRATION_1 = 1

 # Ignore pre-CQ configs with less than 2.5% flake.
 # TODO(phobbs) remove this when CL-Exonerator is properly rate-limited by
 # available pre-CQ capacity.
 # We will want to retry more CLs at night-time and on the weekend when pre-CQ
 # resources are basically unlimited.
 _FLAKE_THRESHOLD_ON_PEAK = 0.025
 # Forgive more CLs on off-peak (weekends and night time) when pre-CQ resources
 # are essentially unlimited.
 _FLAKE_THRESHOLD_OFF_PEAK = 0.01


 # Maximum of 20 minutes in difference of start time for two pre-CQ builds to be
 # considered part of the same "pre-CQ run". This is used for detecting when
 # only a single config failed in the pre-CQ run.
 _MAX_START_TIME_DIFFERENCE = '00:20:00'


 def InnocentPreCQsFromFlake(conn, on_peak):
   """Returns a sequence flakey pre-CQ failures to forgive.

   Args:
     conn: A CIDBConnection to use.
     on_peak: Whether it's business hours (peak load) or off-peak.

   Yields:
     Tuples of (change, pre-cq build id) to forgive.
   """
   configs_by_flake = _PreCQConfigsByFlakePercent(conn, on_peak)
   for config in configs_by_flake:
     for cl_and_build in _AffectedPreCQBuilds(conn, config):
       # TODO(crbug.com/820379) yield build ids from here.
       yield cl_and_build


 def _AffectedPreCQBuilds(conn, config):
   """Returns pre-CQ failures which could be explained by config being flakey.

   Only returns pre-CQ failures which are the *latest build for that
   config/change*. This prevents us from wasting time going to gerrit.

   Args:
     conn: A CIDBConnection to use.
     config: The build config which is flakey.

   Yields:
     Tuples of (change, pre-cq build id) to forgive.
   """
   # Find all failed builds for the config which don't have another build for
   # that change with a later start_date.
   # TODO(phobbs): should this be moved to cidb.py ?
   # See https://stackoverflow.com/a/7745635/219138 for a discussion of how to
   # perform "greatest-n-per-group" in SQL. I chose the LEFT OUTER JOIN solution.
   # This query joins two sub-queries because you can't perform more than one
   # LEFT OUTER JOIN in MySQL.
   # Each subquery is very fast because it uses the build_config_index and has a
   # limited start_time window. The subsequent LEFT OUTER JOIN is fast because
   # (a) there isn't much data and (b) change_number is indexed.
   # For example, executing on moblab-generic-vm-pre-cq takes 0.07 sec.
   query = """
 SELECT c1.build_id, c1.start_time, c1.build_config,
        c1.change_number, c1.patch_number, c1.change_source

 FROM (
     SELECT c1.build_id, b1.start_time, b1.build_config,
            c1.change_number, c1.patch_number, c1.change_source
     FROM buildTable as b1
         JOIN clActionTable as c1
     WHERE
         c1.build_id = b1.id
         AND c1.action = 'pre_cq_failed'
         AND b1.build_config = '{config}'
         AND b1.start_time > DATE_SUB(NOW(), INTERVAL 7 DAY)
         AND b1.status = 'fail'
 ) as c1

 LEFT OUTER JOIN (
     SELECT c2.build_id, c2.change_number, c2.patch_number, c2.change_source, b2.start_time
     FROM buildTable as b2
         JOIN clActionTable as c2
     WHERE
         c2.build_id = b2.id
         AND b2.build_config = '{config}'
         AND b2.start_time > DATE_SUB(NOW(), INTERVAL 7 DAY)
 ) as c2

 ON
     c1.change_number = c2.change_number
     /*
      * Note: we do <= here because we don't want to exonerate failures for which
      * there is a newer run - even if the newer run is in a later patchset.
      */
     AND c1.patch_number <= c2.patch_number
     AND c1.change_source = c2.change_source
     AND c2.start_time > c1.start_time
 WHERE
     c2.build_id IS NULL
   """.format(config=config)

   # Filter out only builds that failed alone. If other configs failed at the
   # same time, it wasn't flake.
   # TODO(phobbs) this technically has a race condition, because it's assuming
   # "query1" will give the same answer as the query above. Can we do this in a
   # safer way, or do both in one query?
   # TODO(phobbs) this function is pretty complicated. We need MySQL integration
   # tests to make sure this doesn't break.
   failed_other_configs = """
 SELECT q1.change_number, q1.patch_number, q1.change_source
 FROM
     ({query1}) as q1
     JOIN clActionTable c
     JOIN buildTable b
 ON
     q1.change_number = c.change_number
     AND q1.patch_number = c.patch_number
     AND q1.change_source = c.change_source
     AND c.build_id = b.id
 WHERE
     b.build_config != '{config}'
     AND b.build_config != 'pre-cq-launcher'
     AND b.status = 'fail'
     AND TIMEDIFF(q1.start_time, b.start_time) <= TIME('{max_start_time_diff}')
   """.format(
       query1=query,
       config=config,
       max_start_time_diff=_MAX_START_TIME_DIFFERENCE)

   # This should technically be in chromite.lib.cidb, but this will be moved to
   # a CIDB API anyway.
   # pylint: disable=protected-access
   rows = conn._Execute(query).fetchall()
   failed_other_configs = set(map(tuple,
       conn._Execute(failed_other_configs).fetchall()))

   rows_filtered = []
   for row in rows:
     _, _, _, change_number, patch_number, change_source = row
     k = (change_number, patch_number, change_source)
     if k not in failed_other_configs:
       rows_filtered.append(row)

   # TODO(crbug.com/820379) yield build ids from here.
   return [
       (GerritPatchTuple(number, patch, source == 'internal'), build_id)
       for build_id, _start_time, _build_config,
           number, patch, source in rows_filtered]


 def _PreCQConfigsByFlakePercent(conn, on_peak):
   """Returns build configs in order of their posterior flake percentage.

   We use a Beta(49.0, 1.0) prior for each collection of (config, time window)
   counts. This is just a way to represent our expectation that there is
   generally a low (2%) level of flake, and that we need more than just a few
   data points to consider a config highly flakey.

   The prior (49.0, 1.0) is based on me (phobbs@) eyeballing a weeks worth of
   data. I am confident we could choose a better prior if we wanted to do this
   systematically using empirical Bayes or a hierarchical model.

   See http://varianceexplained.org/r/empirical_bayes_baseball/ for a good
   blog post on the Beta-Bernoulli model for estimating rates.

   Args:
     conn: A CIDBConnection to use.
     on_peak: Whether we're currently throttled (during business hours) or
         whether it's off-peak hours (weekend, night-time, holidays)

   Returns:
     a numpy.array of configs in descending order of flake.
   """
   counts = conn.GetPreCQFlakeCounts()
   # columns=[...] is necessary in case counts is empty.
   counts_df = pandas.DataFrame(counts, columns=[
       'build_config', 'flake_count', 'build_count'])

   denominator_fuzz = _FLAKE_PRIOR_CONCENTRATION_0 + _FLAKE_PRIOR_CONCENTRATION_1
   counts_df['posterior_ratio'] = (
     (counts_df['flake_count'] + _FLAKE_PRIOR_CONCENTRATION_1)
     / (counts_df['build_count'] + denominator_fuzz))
   # TODO(phobbs) remove this threshold when pre-CQ exonerator is properly rate
   # limited.
   threshold = _FLAKE_THRESHOLD_ON_PEAK if on_peak else _FLAKE_THRESHOLD_OFF_PEAK
   counts_df = counts_df[counts_df['posterior_ratio'] > threshold]
   counts_df = counts_df.sort_values(by='posterior_ratio', ascending=False)
   return counts_df['build_config'].values
	# Copyright 2018 The Chromium OS Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""Returns a sequence of new innocent CLs which failed pre-cq due to flake.

	Queries CIDB for "natural" sanity Pre-CQ build failures - namely, CL
	patchsets which failed a pre-CQ config and then later passed, implying infra
	or test flake.
	"""

	from __future__ import print_function
	from __future__ import absolute_import
	from __future__ import division

	import pandas

	from chromite.lib import clactions


	GerritPatchTuple = clactions.GerritPatchTuple

	# TODO(phobbs) Calculate this prior in a systematic way. These are made-up
	# numbers for now just to get pre-CQ flake exonerator working half-decently.
	# We can derive better numbers using either empirical bayes, by measuring the
	# historical rates of flake for weeks (say within the last year) that are above
	# some threshold of runs, or by fitting a hierarchical model using a
	# probabilistic programming library such as Edward (http://edwardlib.org) using
	# an uninformative prior.
	_FLAKE_PRIOR_CONCENTRATION_0 = 49.0
	_FLAKE_PRIOR_CONCENTRATION_1 = 1

	# Ignore pre-CQ configs with less than 2.5% flake.
	# TODO(phobbs) remove this when CL-Exonerator is properly rate-limited by
	# available pre-CQ capacity.
	# We will want to retry more CLs at night-time and on the weekend when pre-CQ
	# resources are basically unlimited.
	_FLAKE_THRESHOLD_ON_PEAK = 0.025
	# Forgive more CLs on off-peak (weekends and night time) when pre-CQ resources
	# are essentially unlimited.
	_FLAKE_THRESHOLD_OFF_PEAK = 0.01


	# Maximum of 20 minutes in difference of start time for two pre-CQ builds to be
	# considered part of the same "pre-CQ run". This is used for detecting when
	# only a single config failed in the pre-CQ run.
	_MAX_START_TIME_DIFFERENCE = '00:20:00'


	def InnocentPreCQsFromFlake(conn, on_peak):
	"""Returns a sequence flakey pre-CQ failures to forgive.

	Args:
	conn: A CIDBConnection to use.
	on_peak: Whether it's business hours (peak load) or off-peak.

	Yields:
	Tuples of (change, pre-cq build id) to forgive.
	"""
	configs_by_flake = _PreCQConfigsByFlakePercent(conn, on_peak)
	for config in configs_by_flake:
	for cl_and_build in _AffectedPreCQBuilds(conn, config):
	# TODO(crbug.com/820379) yield build ids from here.
	yield cl_and_build


	def _AffectedPreCQBuilds(conn, config):
	"""Returns pre-CQ failures which could be explained by config being flakey.

	Only returns pre-CQ failures which are the *latest build for that
	config/change*. This prevents us from wasting time going to gerrit.

	Args:
	conn: A CIDBConnection to use.
	config: The build config which is flakey.

	Yields:
	Tuples of (change, pre-cq build id) to forgive.
	"""
	# Find all failed builds for the config which don't have another build for
	# that change with a later start_date.
	# TODO(phobbs): should this be moved to cidb.py ?
	# See https://stackoverflow.com/a/7745635/219138 for a discussion of how to
	# perform "greatest-n-per-group" in SQL. I chose the LEFT OUTER JOIN solution.
	# This query joins two sub-queries because you can't perform more than one
	# LEFT OUTER JOIN in MySQL.
	# Each subquery is very fast because it uses the build_config_index and has a
	# limited start_time window. The subsequent LEFT OUTER JOIN is fast because
	# (a) there isn't much data and (b) change_number is indexed.
	# For example, executing on moblab-generic-vm-pre-cq takes 0.07 sec.
	query = """
	SELECT c1.build_id, c1.start_time, c1.build_config,
	c1.change_number, c1.patch_number, c1.change_source

	FROM (
	SELECT c1.build_id, b1.start_time, b1.build_config,
	c1.change_number, c1.patch_number, c1.change_source
	FROM buildTable as b1
	JOIN clActionTable as c1
	WHERE
	c1.build_id = b1.id
	AND c1.action = 'pre_cq_failed'
	AND b1.build_config = '{config}'
	AND b1.start_time > DATE_SUB(NOW(), INTERVAL 7 DAY)
	AND b1.status = 'fail'
	) as c1

	LEFT OUTER JOIN (
	SELECT c2.build_id, c2.change_number, c2.patch_number, c2.change_source, b2.start_time
	FROM buildTable as b2
	JOIN clActionTable as c2
	WHERE
	c2.build_id = b2.id
	AND b2.build_config = '{config}'
	AND b2.start_time > DATE_SUB(NOW(), INTERVAL 7 DAY)
	) as c2

	ON
	c1.change_number = c2.change_number
	/*
	* Note: we do <= here because we don't want to exonerate failures for which
	* there is a newer run - even if the newer run is in a later patchset.
	*/
	AND c1.patch_number <= c2.patch_number
	AND c1.change_source = c2.change_source
	AND c2.start_time > c1.start_time
	WHERE
	c2.build_id IS NULL
	""".format(config=config)

	# Filter out only builds that failed alone. If other configs failed at the
	# same time, it wasn't flake.
	# TODO(phobbs) this technically has a race condition, because it's assuming
	# "query1" will give the same answer as the query above. Can we do this in a
	# safer way, or do both in one query?
	# TODO(phobbs) this function is pretty complicated. We need MySQL integration
	# tests to make sure this doesn't break.
	failed_other_configs = """
	SELECT q1.change_number, q1.patch_number, q1.change_source
	FROM
	({query1}) as q1
	JOIN clActionTable c
	JOIN buildTable b
	ON
	q1.change_number = c.change_number
	AND q1.patch_number = c.patch_number
	AND q1.change_source = c.change_source
	AND c.build_id = b.id
	WHERE
	b.build_config != '{config}'
	AND b.build_config != 'pre-cq-launcher'
	AND b.status = 'fail'
	AND TIMEDIFF(q1.start_time, b.start_time) <= TIME('{max_start_time_diff}')
	""".format(
	query1=query,
	config=config,
	max_start_time_diff=_MAX_START_TIME_DIFFERENCE)

	# This should technically be in chromite.lib.cidb, but this will be moved to
	# a CIDB API anyway.
	# pylint: disable=protected-access
	rows = conn._Execute(query).fetchall()
	failed_other_configs = set(map(tuple,
	conn._Execute(failed_other_configs).fetchall()))

	rows_filtered = []
	for row in rows:
	_, _, _, change_number, patch_number, change_source = row
	k = (change_number, patch_number, change_source)
	if k not in failed_other_configs:
	rows_filtered.append(row)

	# TODO(crbug.com/820379) yield build ids from here.
	return [
	(GerritPatchTuple(number, patch, source == 'internal'), build_id)
	for build_id, _start_time, _build_config,
	number, patch, source in rows_filtered]


	def _PreCQConfigsByFlakePercent(conn, on_peak):
	"""Returns build configs in order of their posterior flake percentage.

	We use a Beta(49.0, 1.0) prior for each collection of (config, time window)
	counts. This is just a way to represent our expectation that there is
	generally a low (2%) level of flake, and that we need more than just a few
	data points to consider a config highly flakey.

	The prior (49.0, 1.0) is based on me (phobbs@) eyeballing a weeks worth of
	data. I am confident we could choose a better prior if we wanted to do this
	systematically using empirical Bayes or a hierarchical model.

	See http://varianceexplained.org/r/empirical_bayes_baseball/ for a good
	blog post on the Beta-Bernoulli model for estimating rates.

	Args:
	conn: A CIDBConnection to use.
	on_peak: Whether we're currently throttled (during business hours) or
	whether it's off-peak hours (weekend, night-time, holidays)

	Returns:
	a numpy.array of configs in descending order of flake.
	"""
	counts = conn.GetPreCQFlakeCounts()
	# columns=[...] is necessary in case counts is empty.
	counts_df = pandas.DataFrame(counts, columns=[
	'build_config', 'flake_count', 'build_count'])

	denominator_fuzz = _FLAKE_PRIOR_CONCENTRATION_0 + _FLAKE_PRIOR_CONCENTRATION_1
	counts_df['posterior_ratio'] = (
	(counts_df['flake_count'] + _FLAKE_PRIOR_CONCENTRATION_1)
	/ (counts_df['build_count'] + denominator_fuzz))
	# TODO(phobbs) remove this threshold when pre-CQ exonerator is properly rate
	# limited.
	threshold = _FLAKE_THRESHOLD_ON_PEAK if on_peak else _FLAKE_THRESHOLD_OFF_PEAK
	counts_df = counts_df[counts_df['posterior_ratio'] > threshold]
	counts_df = counts_df.sort_values(by='posterior_ratio', ascending=False)
	return counts_df['build_config'].values