| # Copyright 2018 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Returns a sequence of new innocent CLs which failed pre-cq due to flake. |
| |
| Queries CIDB for "natural" sanity Pre-CQ build failures - namely, CL |
| patchsets which failed a pre-CQ config and then later passed, implying infra |
| or test flake. |
| """ |
| |
| from __future__ import print_function |
| from __future__ import absolute_import |
| from __future__ import division |
| |
| import pandas |
| |
| from chromite.lib import clactions |
| |
| |
# Convenience alias: a GerritPatchTuple identifies one patchset of one CL
# (change number, patch number, and whether it's an internal change).
GerritPatchTuple = clactions.GerritPatchTuple

# Pseudo-counts for the Beta prior used in _PreCQConfigsByFlakePercent:
# CONCENTRATION_1 is the prior "flaky build" count, CONCENTRATION_0 the prior
# "clean build" count, implying a prior flake rate of 1/50 = 2%.
# TODO(phobbs) Calculate this prior in a systematic way. These are made-up
# numbers for now just to get pre-CQ flake exonerator working half-decently.
# We can derive better numbers using either empirical bayes, by measuring the
# historical rates of flake for weeks (say within the last year) that are above
# some threshold of runs, or by fitting a hierarchical model using a
# probabilistic programming library such as Edward (http://edwardlib.org) using
# an uninformative prior.
_FLAKE_PRIOR_CONCENTRATION_0 = 49.0
_FLAKE_PRIOR_CONCENTRATION_1 = 1

# Ignore pre-CQ configs with less than 2.5% flake.
# TODO(phobbs) remove this when CL-Exonerator is properly rate-limited by
# available pre-CQ capacity.
# We will want to retry more CLs at night-time and on the weekend when pre-CQ
# resources are basically unlimited.
_FLAKE_THRESHOLD_ON_PEAK = 0.025
# Forgive more CLs on off-peak (weekends and night time) when pre-CQ resources
# are essentially unlimited.
_FLAKE_THRESHOLD_OFF_PEAK = 0.01


# Maximum of 20 minutes in difference of start time for two pre-CQ builds to be
# considered part of the same "pre-CQ run". This is used for detecting when
# only a single config failed in the pre-CQ run.
_MAX_START_TIME_DIFFERENCE = '00:20:00'
| |
| |
def InnocentPreCQsFromFlake(conn, on_peak):
  """Returns a sequence flakey pre-CQ failures to forgive.

  Walks the pre-CQ configs in descending order of estimated flakiness and
  emits every (change, build) pair whose failure is plausibly explained by
  that config's flake.

  Args:
    conn: A CIDBConnection to use.
    on_peak: Whether it's business hours (peak load) or off-peak.

  Yields:
    Tuples of (change, pre-cq build id) to forgive.
  """
  flaky_configs = _PreCQConfigsByFlakePercent(conn, on_peak)
  for flaky_config in flaky_configs:
    # TODO(crbug.com/820379) yield build ids from here.
    for change_and_build in _AffectedPreCQBuilds(conn, flaky_config):
      yield change_and_build
| |
| |
def _AffectedPreCQBuilds(conn, config):
  """Returns pre-CQ failures which could be explained by config being flakey.

  Only returns pre-CQ failures which are the *latest build for that
  config/change*. This prevents us from wasting time going to gerrit.

  Failures that coincided (within _MAX_START_TIME_DIFFERENCE) with a failure
  of a *different* config for the same patchset are excluded, since a
  multi-config failure points at the CL rather than at flake.

  Args:
    conn: A CIDBConnection to use.
    config: The build config which is flakey. NOTE(review): interpolated into
      the SQL via str.format, not as a bound parameter — safe only because
      config names come from CIDB itself, never from untrusted input.

  Returns:
    A list of (GerritPatchTuple, pre-cq build id) tuples to forgive.
    (Despite earlier docs saying "Yields", this function returns a list.)
  """
  # Find all failed builds for the config which don't have another build for
  # that change with a later start_date.
  # TODO(phobbs): should this be moved to cidb.py ?
  # See https://stackoverflow.com/a/7745635/219138 for a discussion of how to
  # perform "greatest-n-per-group" in SQL. I chose the LEFT OUTER JOIN solution.
  # This query joins two sub-queries because you can't perform more than one
  # LEFT OUTER JOIN in MySQL.
  # Each subquery is very fast because it uses the build_config_index and has a
  # limited start_time window. The subsequent LEFT OUTER JOIN is fast because
  # (a) there isn't much data and (b) change_number is indexed.
  # For example, executing on moblab-generic-vm-pre-cq takes 0.07 sec.
  query = """
    SELECT c1.build_id, c1.start_time, c1.build_config,
           c1.change_number, c1.patch_number, c1.change_source

    FROM (
      SELECT c1.build_id, b1.start_time, b1.build_config,
             c1.change_number, c1.patch_number, c1.change_source
      FROM buildTable as b1
      JOIN clActionTable as c1
      WHERE
        c1.build_id = b1.id
        AND c1.action = 'pre_cq_failed'
        AND b1.build_config = '{config}'
        AND b1.start_time > DATE_SUB(NOW(), INTERVAL 7 DAY)
        AND b1.status = 'fail'
    ) as c1

    LEFT OUTER JOIN (
      SELECT c2.build_id, c2.change_number, c2.patch_number, c2.change_source, b2.start_time
      FROM buildTable as b2
      JOIN clActionTable as c2
      WHERE
        c2.build_id = b2.id
        AND b2.build_config = '{config}'
        AND b2.start_time > DATE_SUB(NOW(), INTERVAL 7 DAY)
    ) as c2

    ON
      c1.change_number = c2.change_number
      /*
       * Note: we do <= here because we don't want to exonerate failures for which
       * there is a newer run - even if the newer run is in a later patchset.
       */
      AND c1.patch_number <= c2.patch_number
      AND c1.change_source = c2.change_source
      AND c2.start_time > c1.start_time
    WHERE
      c2.build_id IS NULL
  """.format(config=config)

  # Filter out only builds that failed alone. If other configs failed at the
  # same time, it wasn't flake. ('pre-cq-launcher' is excluded from "other
  # configs" below since it is the coordinator, not a tested config.)
  # TODO(phobbs) this technically has a race condition, because it's assuming
  # "query1" will give the same answer as the query above. Can we do this in a
  # safer way, or do both in one query?
  # TODO(phobbs) this function is pretty complicated. We need MySQL integration
  # tests to make sure this doesn't break.
  failed_other_configs = """
    SELECT q1.change_number, q1.patch_number, q1.change_source
    FROM
      ({query1}) as q1
    JOIN clActionTable c
    JOIN buildTable b
    ON
      q1.change_number = c.change_number
      AND q1.patch_number = c.patch_number
      AND q1.change_source = c.change_source
      AND c.build_id = b.id
    WHERE
      b.build_config != '{config}'
      AND b.build_config != 'pre-cq-launcher'
      AND b.status = 'fail'
      AND TIMEDIFF(q1.start_time, b.start_time) <= TIME('{max_start_time_diff}')
  """.format(
      query1=query,
      config=config,
      max_start_time_diff=_MAX_START_TIME_DIFFERENCE)

  # This should technically be in chromite.lib.cidb, but this will be moved to
  # a CIDB API anyway.
  # pylint: disable=protected-access
  rows = conn._Execute(query).fetchall()
  # NOTE(review): `failed_other_configs` is rebound here from the SQL text to
  # its result set — a set of (change_number, patch_number, change_source)
  # keys whose patchsets also failed some other config.
  failed_other_configs = set(map(tuple,
                                 conn._Execute(failed_other_configs).fetchall()))

  # Keep only the failures that are NOT accompanied by another config's
  # failure for the same patchset.
  rows_filtered = []
  for row in rows:
    _, _, _, change_number, patch_number, change_source = row
    k = (change_number, patch_number, change_source)
    if k not in failed_other_configs:
      rows_filtered.append(row)

  # TODO(crbug.com/820379) yield build ids from here.
  return [
      (GerritPatchTuple(number, patch, source == 'internal'), build_id)
      for build_id, _start_time, _build_config,
      number, patch, source in rows_filtered]
| |
| |
def _PreCQConfigsByFlakePercent(conn, on_peak):
  """Returns build configs in order of their posterior flake percentage.

  We use a Beta(49.0, 1.0) prior for each collection of (config, time window)
  counts. This is just a way to represent our expectation that there is
  generally a low (2%) level of flake, and that we need more than just a few
  data points to consider a config highly flakey.

  The prior (49.0, 1.0) is based on me (phobbs@) eyeballing a weeks worth of
  data. I am confident we could choose a better prior if we wanted to do this
  systematically using empirical Bayes or a hierarchical model.

  See http://varianceexplained.org/r/empirical_bayes_baseball/ for a good
  blog post on the Beta-Bernoulli model for estimating rates.

  Args:
    conn: A CIDBConnection to use.
    on_peak: Whether we're currently throttled (during business hours) or
      whether it's off-peak hours (weekend, night-time, holidays)

  Returns:
    a numpy.array of configs in descending order of flake.
  """
  # columns=[...] is necessary in case the count query comes back empty.
  flake_df = pandas.DataFrame(
      conn.GetPreCQFlakeCounts(),
      columns=['build_config', 'flake_count', 'build_count'])

  # Posterior mean of the flake rate under the Beta prior: add the prior
  # pseudo-counts to both the observed flakes and the observed builds.
  prior_total = _FLAKE_PRIOR_CONCENTRATION_0 + _FLAKE_PRIOR_CONCENTRATION_1
  smoothed_flakes = flake_df['flake_count'] + _FLAKE_PRIOR_CONCENTRATION_1
  smoothed_builds = flake_df['build_count'] + prior_total
  flake_df['posterior_ratio'] = smoothed_flakes / smoothed_builds

  # TODO(phobbs) remove this threshold when pre-CQ exonerator is properly rate
  # limited.
  if on_peak:
    threshold = _FLAKE_THRESHOLD_ON_PEAK
  else:
    threshold = _FLAKE_THRESHOLD_OFF_PEAK

  flaky_enough = flake_df[flake_df['posterior_ratio'] > threshold]
  flaky_enough = flaky_enough.sort_values(by='posterior_ratio', ascending=False)
  return flaky_enough['build_config'].values