# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import math
import os

import bisect_utils
import math_utils
import source_control
import ttest

from bisect_state import RevisionState


class BisectResults(object):
  """Contains results of the completed bisect.

  Properties:
    error: Error message if the bisect failed.

  If the error is None, the following properties are present:
    warnings: List of warnings from the bisect run.
    state: BisectState object from which these results were generated.
    first_working_revision: First good revision in the revision state list
        (chronologically, the last known good revision).
    last_broken_revision: Last bad revision in the revision state list
        (chronologically, the first known bad revision).

  If both of the above revisions are not None, the following properties are
  also present:
    culprit_revisions: A list of revisions which contain the change that
        introduced the regression.
    regression_size: For performance bisects, the relative change of the mean
        metric value, as a percentage. For other bisects this field always
        contains 'zero-to-nonzero'.
    regression_std_err: For performance bisects, the pooled standard error for
        the groups of good and bad runs, as a percentage. Not used for other
        bisects.
    confidence: For performance bisects, the confidence that the good and bad
        runs form statistically distinct groups. Not used for non-performance
        bisects.
  """

  def __init__(self, bisect_state=None, depot_registry=None, opts=None,
               runtime_warnings=None, error=None, abort_reason=None):
    """Computes final bisect results after a bisect run is complete.

    This constructor should be called in one of the following ways:
      BisectResults(state, depot_registry, opts, runtime_warnings)
      BisectResults(error=error)

    The first form creates an object representing successful bisect results,
    while the second form creates an error result.

    Args:
      bisect_state: BisectState object representing the latest bisect state.
      depot_registry: DepotDirectoryRegistry object with information on each
          repository in the bisect_state.
      opts: Options passed to the bisect run.
      runtime_warnings: A list of warnings from the bisect run.
      error: Error message. When error is not None, other arguments are
          ignored.
    """
    # Set these attributes so that the bisect printer does not break when the
    # regression cannot be reproduced (i.e. no broken revision was found).
    self.regression_size = 0
    self.regression_std_err = 0
    self.confidence = 0
    self.culprit_revisions = []

    self.error = error
    self.abort_reason = abort_reason
    if error is not None or abort_reason is not None:
      return

    assert (bisect_state is not None and depot_registry is not None and
            opts is not None and runtime_warnings is not None), (
        'Incorrect use of the BisectResults constructor. '
        'When error is None, all other arguments are required.')

    self.state = bisect_state

    rev_states = bisect_state.GetRevisionStates()
    first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
    self.first_working_revision = first_working_rev
    self.last_broken_revision = last_broken_rev

    self.warnings = runtime_warnings

    self.retest_results_tot = None
    self.retest_results_reverted = None

    if first_working_rev is not None and last_broken_rev is not None:
      statistics = self._ComputeRegressionStatistics(
          rev_states, first_working_rev, last_broken_rev)

      self.regression_size = statistics['regression_size']
      self.regression_std_err = statistics['regression_std_err']
      self.confidence = statistics['confidence']

      self.culprit_revisions = self._FindCulpritRevisions(
          rev_states, depot_registry, first_working_rev, last_broken_rev)

      self.warnings += self._GetResultBasedWarnings(
          self.culprit_revisions, opts, self.confidence)

  def AddRetestResults(self, results_tot, results_reverted):
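    """Records results of re-testing ToT with and without the culprit CL.

    If either result set is missing, only a warning is recorded. Otherwise the
    results are stored, and a warning is added when the confidence that the
    reverted and ToT results differ is not high.

    Args:
      results_tot: Re-test results at tip-of-tree (with the culprit present).
      results_reverted: Re-test results with the culprit CL reverted.
    """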
    if not results_tot or not results_reverted:
      self.warnings.append(
          'Failed to re-test reverted culprit CL against ToT.')
      return

    confidence = BisectResults.ConfidenceScore(
        results_reverted[0]['values'],
        results_tot[0]['values'])

    self.retest_results_tot = RevisionState('ToT', 'n/a', 0)
    self.retest_results_tot.value = results_tot[0]

    self.retest_results_reverted = RevisionState('Reverted', 'n/a', 0)
    self.retest_results_reverted.value = results_reverted[0]

    if confidence <= bisect_utils.HIGH_CONFIDENCE:
      self.warnings.append(
          'Confidence of re-test with reverted CL is not high.'
          ' Check that the regression hasn\'t already recovered.'
          ' There\'s still a chance this is a regression, as performance of'
          ' local builds may not match official builds.')

  @staticmethod
  def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
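    """Returns warnings based on the quality of the bisect results.

    Args:
      culprit_revisions: List of culprit revision tuples.
      opts: Options passed to the bisect run.
      confidence: Confidence score for the bisect culprit, from 0 to 100.

    Returns:
      A list of warning strings (possibly empty).
    """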
    warnings = []
    if len(culprit_revisions) > 1:
      warnings.append('Due to build errors, regression range could '
                      'not be narrowed down to a single commit.')
    if opts.repeat_test_count == 1:
      warnings.append('Tests were only set to run once. This may '
                      'be insufficient to get meaningful results.')
    if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
      warnings.append('Confidence is not high. Try bisecting again '
                      'with increased repeat_count, larger range, or '
                      'on another metric.')
    if not confidence:
      warnings.append('Confidence score is 0%. Try bisecting again on '
                      'another platform or another metric.')
    return warnings

  @staticmethod
  def ConfidenceScore(sample1, sample2, accept_single_bad_or_good=False):
    """Calculates a confidence score.

    This score is based on a statistical hypothesis test: the null hypothesis
    is that the two groups of results have no difference, i.e. there is no
    performance regression. The alternative hypothesis is that there is some
    difference between the groups that is unlikely to have occurred by chance.

    The score returned by this function represents our confidence in the
    alternative hypothesis.

    Note that if there is only one item in either sample, only one revision
    was classified good or bad, so there is not much evidence on which to base
    a decision.

    Args:
      sample1: A flat list of "good" result numbers.
      sample2: A flat list of "bad" result numbers.
      accept_single_bad_or_good: If True, compute a value even if there is
          only one bad or good revision.

    Returns:
      A float between 0 and 100; 0 if the samples aren't large enough.
    """
    if ((len(sample1) <= 1 or len(sample2) <= 1) and
        not accept_single_bad_or_good):
      return 0.0
    if not sample1 or not sample2:
      return 0.0
    _, _, p_value = ttest.WelchsTTest(sample1, sample2)
    return 100.0 * (1.0 - p_value)
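
  # Illustrative sketch of how the score behaves (numbers are made up):
  #
  #   BisectResults.ConfidenceScore([10.0, 10.1, 10.2], [20.0, 20.1, 19.9])
  #   # => close to 100, since Welch's t-test gives a p-value near 0 for
  #   #    clearly separated samples.
  #   BisectResults.ConfidenceScore([10.0, 12.0, 11.0], [10.5, 11.5, 12.5])
  #   # => much lower, since overlapping samples give a large p-value.
  #   BisectResults.ConfidenceScore([10.0], [20.0])
  #   # => 0.0, unless accept_single_bad_or_good=True.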

  @staticmethod
  def FindBreakingRevRange(revision_states):
    """Finds the last known good and first known bad revisions.

    Note that since revision_states is expected to be in reverse chronological
    order, the last known good revision is the first revision in the list that
    has the passed property set to 1, hence the name `first_working_revision`.
    The converse applies to `last_broken_revision`.

    Args:
      revision_states: A list of RevisionState instances.

    Returns:
      A tuple (first_working_revision, last_broken_revision) containing the
      two revision states at the boundary, i.e. the last known good and the
      first known bad revision.
    """
    first_working_revision = None
    last_broken_revision = None

    for revision_state in revision_states:
      if revision_state.passed == 1 and not first_working_revision:
        first_working_revision = revision_state

      if not revision_state.passed:
        last_broken_revision = revision_state

    return first_working_revision, last_broken_revision
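
  # Sketch of the expected ordering (hypothetical revision states, newest
  # first, where 1 means passed and 0 means failed):
  #
  #   [r5: 0, r4: 0, r3: 1, r2: 1, r1: 1]
  #   -> first_working_revision = r3 (newest passing state)
  #   -> last_broken_revision = r4 (oldest failing state)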

  @staticmethod
  def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
                            last_broken_rev):
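    """Queries commit information for each revision in the culprit range.

    Changes into each depot's directory to query revision info, then restores
    the original working directory.

    Args:
      revision_states: A list of RevisionState instances, newest first.
      depot_registry: DepotDirectoryRegistry with directories for each depot.
      first_working_rev: The first working revision state.
      last_broken_rev: The last broken revision state.

    Returns:
      A list of (revision, commit_info, depot) tuples, one for each revision
      from last_broken_rev up to but not including first_working_rev.
    """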
    cwd = os.getcwd()

    culprit_revisions = []
    for i in xrange(last_broken_rev.index, first_working_rev.index):
      depot_registry.ChangeToDepotDir(revision_states[i].depot)
      info = source_control.QueryRevisionInfo(revision_states[i].revision)
      culprit_revisions.append((revision_states[i].revision, info,
                                revision_states[i].depot))

    os.chdir(cwd)
    return culprit_revisions

  @classmethod
  def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
                                   last_broken_rev):
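    """Computes regression size, standard error and confidence statistics.

    Args:
      rev_states: A list of RevisionState instances, newest first.
      first_working_rev: The first working revision state.
      last_broken_rev: The last broken revision state.

    Returns:
      A dict with the keys 'regression_size', 'regression_std_err',
      'confidence' and 'bad_greater_than_good'.
    """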
    # TODO(sergiyb): We assume that the value dict has a 'values' key, which
    # may not be the case for failure bisects, where there is only a single
    # value.
    broken_means = [state.value['values']
                    for state in rev_states[:last_broken_rev.index+1]
                    if state.value]

    working_means = [state.value['values']
                     for state in rev_states[first_working_rev.index:]
                     if state.value]

    # Flatten the lists of per-revision values so that the mean of all good
    # and all bad values can be calculated.
    working_mean = sum(working_means, [])
    broken_mean = sum(broken_means, [])

    # Calculate the approximate size of the regression.
    mean_of_bad_runs = math_utils.Mean(broken_mean)
    mean_of_good_runs = math_utils.Mean(working_mean)

    regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                      mean_of_bad_runs)
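    # For example, assuming math_utils.RelativeChange(before, after) returns
    # |after - before| / before: a good-run mean of 10.0 and a bad-run mean of
    # 12.0 give a regression_size of 20.0 (percent). A good-run mean of 0
    # makes the relative change undefined (NaN), which is handled below.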
    if math.isnan(regression_size):
      regression_size = 'zero-to-nonzero'

    regression_std_err = math.fabs(math_utils.PooledStandardError(
        [working_mean, broken_mean]) /
        max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

    # Give a "confidence" in the bisect culprit by seeing whether the results
    # of the culprit revision and the revision before that appear to be
    # statistically significantly different.
    confidence = cls.ConfidenceScore(
        sum([first_working_rev.value['values']], []),
        sum([last_broken_rev.value['values']], []))

    bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs

    return {'regression_size': regression_size,
            'regression_std_err': regression_std_err,
            'confidence': confidence,
            'bad_greater_than_good': bad_greater_than_good}