# Copyright 2018 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Returns a sequence of new innocent CLs which failed pre-cq due to flake.
Queries CIDB for "natural" sanity Pre-CQ build failures - namely, CL
patchsets which failed a pre-CQ config and then later passed, implying infra
or test flake.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pandas

from chromite.lib import clactions

GerritPatchTuple = clactions.GerritPatchTuple

# TODO(phobbs) Calculate this prior in a systematic way. These are made-up
# numbers for now just to get pre-CQ flake exonerator working half-decently.
# We can derive better numbers using either empirical bayes, by measuring the
# historical rates of flake for weeks (say within the last year) that are above
# some threshold of runs, or by fitting a hierarchical model using a
# probabilistic programming library such as Edward (http://edwardlib.org) using
# an uninformative prior.
_FLAKE_PRIOR_CONCENTRATION_0 = 49.0
_FLAKE_PRIOR_CONCENTRATION_1 = 1.0
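
# As a worked example of what this prior implies under Beta-Bernoulli
# updating (see _PreCQConfigsByFlakePercent below): the posterior mean flake
# rate for a config is (flake_count + 1.0) / (build_count + 50.0), so a
# config with no observed builds starts at 1/50 = 2%, and a config with 5
# flakes in 100 builds sits at 6/150 = 4%.
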
# Ignore pre-CQ configs with less than 2.5% flake.
# TODO(phobbs) remove this when CL-Exonerator is properly rate-limited by
# available pre-CQ capacity.
# We will want to retry more CLs at night-time and on the weekend when pre-CQ
# resources are basically unlimited.
_FLAKE_THRESHOLD_ON_PEAK = 0.025
# Forgive more CLs on off-peak (weekends and night time) when pre-CQ resources
# are essentially unlimited.
_FLAKE_THRESHOLD_OFF_PEAK = 0.01

# Two pre-CQ builds whose start times differ by at most 20 minutes are
# considered part of the same "pre-CQ run". This is used for detecting when
# only a single config failed in the pre-CQ run.
_MAX_START_TIME_DIFFERENCE = '00:20:00'
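# For example (an illustration of the intent, not of TIMEDIFF edge cases):
# pre-CQ builds starting at 12:00:00 and 12:15:00 would be grouped into the
# same run, while builds starting at 12:00:00 and 12:25:00 would not.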


def InnocentPreCQsFromFlake(conn, on_peak):
  """Returns a sequence of flaky pre-CQ failures to forgive.

  Args:
    conn: A CIDBConnection to use.
    on_peak: Whether it's business hours (peak load) or off-peak.

  Yields:
    Tuples of (change, pre-cq build id) to forgive.
  """
configs_by_flake = _PreCQConfigsByFlakePercent(conn, on_peak)
for config in configs_by_flake:
for cl_and_build in _AffectedPreCQBuilds(conn, config):
# TODO(crbug.com/820379) yield build ids from here.
yield cl_and_build
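

# A minimal usage sketch (hypothetical driver code; assumes the caller has
# already opened a CIDBConnection as `conn`):
#
#   for change, build_id in InnocentPreCQsFromFlake(conn, on_peak=True):
#     print('Forgiving change %s (pre-CQ build %s)' % (change, build_id))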


def _AffectedPreCQBuilds(conn, config):
  """Returns pre-CQ failures which could be explained by config being flaky.

  Only returns pre-CQ failures which are the *latest build for that
  config/change*. This prevents us from wasting time going to Gerrit.

  Args:
    conn: A CIDBConnection to use.
    config: The build config which is flaky.

  Returns:
    A list of tuples of (change, pre-cq build id) to forgive.
  """
# Find all failed builds for the config which don't have another build for
# that change with a later start_date.
  # TODO(phobbs): should this be moved to cidb.py?
# See https://stackoverflow.com/a/7745635/219138 for a discussion of how to
# perform "greatest-n-per-group" in SQL. I chose the LEFT OUTER JOIN solution.
# This query joins two sub-queries because you can't perform more than one
# LEFT OUTER JOIN in MySQL.
# Each subquery is very fast because it uses the build_config_index and has a
# limited start_time window. The subsequent LEFT OUTER JOIN is fast because
# (a) there isn't much data and (b) change_number is indexed.
# For example, executing on moblab-generic-vm-pre-cq takes 0.07 sec.
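  # As a toy illustration of the pattern (hypothetical table `runs` with
  # columns id, key, start_time): keep each row that has no later-starting
  # row for the same key by self-joining and requiring the join to miss:
  #
  #   SELECT r1.* FROM runs r1
  #   LEFT OUTER JOIN runs r2
  #     ON r1.key = r2.key AND r2.start_time > r1.start_time
  #   WHERE r2.id IS NULL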
query = """
SELECT c1.build_id, c1.start_time, c1.build_config,
c1.change_number, c1.patch_number, c1.change_source
FROM (
SELECT c1.build_id, b1.start_time, b1.build_config,
c1.change_number, c1.patch_number, c1.change_source
FROM buildTable as b1
JOIN clActionTable as c1
WHERE
c1.build_id = b1.id
AND c1.action = 'pre_cq_failed'
AND b1.build_config = '{config}'
AND b1.start_time > DATE_SUB(NOW(), INTERVAL 7 DAY)
AND b1.status = 'fail'
) as c1
LEFT OUTER JOIN (
SELECT c2.build_id, c2.change_number, c2.patch_number, c2.change_source, b2.start_time
FROM buildTable as b2
JOIN clActionTable as c2
WHERE
c2.build_id = b2.id
AND b2.build_config = '{config}'
AND b2.start_time > DATE_SUB(NOW(), INTERVAL 7 DAY)
) as c2
ON
c1.change_number = c2.change_number
/*
* Note: we do <= here because we don't want to exonerate failures for which
* there is a newer run - even if the newer run is in a later patchset.
*/
AND c1.patch_number <= c2.patch_number
AND c1.change_source = c2.change_source
AND c2.start_time > c1.start_time
WHERE
c2.build_id IS NULL
""".format(config=config)
  # Keep only builds that failed alone. If other configs failed at the same
  # time, it wasn't flake.
# TODO(phobbs) this technically has a race condition, because it's assuming
# "query1" will give the same answer as the query above. Can we do this in a
# safer way, or do both in one query?
# TODO(phobbs) this function is pretty complicated. We need MySQL integration
# tests to make sure this doesn't break.
failed_other_configs = """
SELECT q1.change_number, q1.patch_number, q1.change_source
FROM
({query1}) as q1
JOIN clActionTable c
JOIN buildTable b
ON
q1.change_number = c.change_number
AND q1.patch_number = c.patch_number
AND q1.change_source = c.change_source
AND c.build_id = b.id
WHERE
b.build_config != '{config}'
AND b.build_config != 'pre-cq-launcher'
AND b.status = 'fail'
AND TIMEDIFF(q1.start_time, b.start_time) <= TIME('{max_start_time_diff}')
""".format(
query1=query,
config=config,
max_start_time_diff=_MAX_START_TIME_DIFFERENCE)
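  # Each row of this query is a (change_number, patch_number, change_source)
  # triple identifying a CL patchset whose pre-CQ run also had failures in
  # other configs; those patchsets are excluded from forgiveness below.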
# This should technically be in chromite.lib.cidb, but this will be moved to
# a CIDB API anyway.
# pylint: disable=protected-access
rows = conn._Execute(query).fetchall()
  failed_other_configs = set(
      map(tuple, conn._Execute(failed_other_configs_query).fetchall()))
rows_filtered = []
for row in rows:
_, _, _, change_number, patch_number, change_source = row
k = (change_number, patch_number, change_source)
if k not in failed_other_configs:
rows_filtered.append(row)
# TODO(crbug.com/820379) yield build ids from here.
return [
(GerritPatchTuple(number, patch, source == 'internal'), build_id)
for build_id, _start_time, _build_config,
number, patch, source in rows_filtered]


def _PreCQConfigsByFlakePercent(conn, on_peak):
  """Returns build configs in descending order of posterior flake percentage.

  We use a Beta(1.0, 49.0) prior for each collection of (config, time window)
  counts. This is just a way to represent our expectation that there is
  generally a low (2%) level of flake, and that we need more than just a few
  data points to consider a config highly flaky.

  The prior is based on me (phobbs@) eyeballing a week's worth of data. I am
  confident we could choose a better prior if we wanted to do this
  systematically using empirical Bayes or a hierarchical model.

  See http://varianceexplained.org/r/empirical_bayes_baseball/ for a good
  blog post on the Beta-Bernoulli model for estimating rates.

  Args:
    conn: A CIDBConnection to use.
    on_peak: Whether we're currently throttled (during business hours) or
      whether it's off-peak hours (weekend, night-time, holidays).

  Returns:
    A numpy.array of configs in descending order of flake.
  """
counts = conn.GetPreCQFlakeCounts()
# columns=[...] is necessary in case counts is empty.
counts_df = pandas.DataFrame(counts, columns=[
'build_config', 'flake_count', 'build_count'])
denominator_fuzz = _FLAKE_PRIOR_CONCENTRATION_0 + _FLAKE_PRIOR_CONCENTRATION_1
counts_df['posterior_ratio'] = (
(counts_df['flake_count'] + _FLAKE_PRIOR_CONCENTRATION_1)
/ (counts_df['build_count'] + denominator_fuzz))
# TODO(phobbs) remove this threshold when pre-CQ exonerator is properly rate
# limited.
threshold = _FLAKE_THRESHOLD_ON_PEAK if on_peak else _FLAKE_THRESHOLD_OFF_PEAK
counts_df = counts_df[counts_df['posterior_ratio'] > threshold]
counts_df = counts_df.sort_values(by='posterior_ratio', ascending=False)
return counts_df['build_config'].values
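

# For example (made-up numbers; 'binhost-pre-cq' is a hypothetical config
# name): if moblab-generic-vm-pre-cq had posterior_ratio 0.06 and
# binhost-pre-cq had 0.03, both exceed the on-peak threshold of 0.025, and
# _PreCQConfigsByFlakePercent returns
# array(['moblab-generic-vm-pre-cq', 'binhost-pre-cq']).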