bisect_kit/strategy.py - chromiumos/platform/bisect-kit - Git at Google

 # Copyright 2017 The ChromiumOS Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """The binary search algorithm."""

 import logging
 import math
 import re
 import sys

 from bisect_kit import errors
 from bisect_kit import math_util
 from bisect_kit import strategy_states
 from scipy.stats import boschloo_exact


 logger = logging.getLogger(__name__)

 # If a given version got equal to or more than SKIP_FOREVER times of 'skip'
 # evaluation result, ignores the said version forever.
 SKIP_FOREVER = 3

 # Constant that denotes version is known not noisy.
 NOT_NOISY = (None, None)

 # Constant that denotes whether the statistical method is used in initial range
 ENABLE_AUTO_ENDPOINT_VERIFICATION = False


 def trend_score(prev_from, prev_to, now_from, now_to):
     """Calculates "trend score".

     Assume we have observed benchmark number changed from `prev_from` to
     `prev_to` from a dashboard, say, from 100 to 110. Now, we reproduced the same
     test locally and obtained our own benchmark values, from `now_from` to
     `now_to`. Because our local test may not be able to reproduce the exact
     identical changes (100 to 102) for some reasons, but it may have similar
     trend, say, 95 to 97, which is considered as a good reproduction.

     Here, we define the "similar trend" for two cases.
       1. we can reproduce the delta, but the value shifted, ex. previous values
          are 100->102 and our new values are 95->97 (delta=+2).
       2. we can reproduce the proportion change, but the value scaled, ex.
          previous values are 10->15 and our local values are 8->12 (+50%).

     The property of trend score:
       - score=0 if our local values stay the same
       - score=1 if we have similar change (delta or proportion); it's possible
         that score>1.
       - between 0 and 1 if we have the same trend but no so significant
       - negative if opposite direction, ex. previous values are increasing but our
         values are decreasing.

     Some cases proportion is inapplicable (ex. zero to non-zero, negative to
     positive, etc.), then only delta is used in calculation.

     Args:
       prev_from: previous value changed from
       prev_to: previous value changed to
       now_from: our new value changed from
       now_to: our new value changed to

     Returns:
       the score value
     """
     assert prev_to != prev_from
     abs_score = (now_to - now_from) / (prev_to - prev_from)
     if now_from == 0 or prev_from == 0 or prev_to * prev_from < 0:
         return abs_score

     ratio_score = ((now_to - now_from) / now_from) / (
         (prev_to - prev_from) / prev_from
     )
     return max(abs_score, ratio_score)


 class NoisyBinarySearch:
     """Binary search algorithm supporting noisy signal.

     Examples:
       bsearch = NoisyBinarySearch(rev_info, observation='old=1/10,new=9/10')
       while not bsearch.is_done():
         bsearch.update(idx, get_result(rev_info[idx]))
       print(bsearch.get_best_guess())
     """

     # States
     #   INITED:
     #     Before both ends are verified.
     #     Recomputing initial values is also done in this phase.
     #     Not yet to calculate probabilities for each candidate.
     #   STARTED:
     #     Started to bisect.
     # There are no error states because exceptions are raised for fatal errors.
     # There are no "done" states because the confidence may become lower after
     # feeding more data.
     INITED = 'inited'
     STARTED = 'started'

     # During the initial range verification phase, there are four init-states:
     #   INIT_DO_ONE:
     #     Run several times at both ends, and the init-state will
     #     change to INIT_CHECK_ONE.
     #     The time is determined by rounds, up to three rounds.
     #   INIT_CHECK_ONE:
     #     Use statistical method to check for regression.
     #     According to the statistical test result, the init-state may
     #     change to INIT_DO_ONE or INIT_DO_TWO.
     #   INIT_DO_TWO:
     #     Do the final round of test. Run 10 times at both ends.
     #   INIT_CHECK_TWO:
     #     Do the final round of statistical test to check.
     #     According to the statistical test result, determine whether
     #     there is a regression.
     # INIT_STATUS_MAP describes transition of the init-states and the alpha value
     # for statistical tests.
     INIT_DO_ONE = 'init_do_one'
     INIT_CHECK_ONE = 'init_check_one'
     INIT_DO_TWO = 'init_do_two'
     INIT_CHECK_TWO = 'init_check_two'

     INIT_STATUS_MAP = {
         INIT_DO_ONE: {
             'next_state': INIT_CHECK_ONE,
             'action': 'counting',
         },
         INIT_CHECK_ONE: {
             'next_state': INIT_DO_TWO,
             'last_state': INIT_DO_ONE,
             'alpha': 0.01,
             'action': 'checking',
         },
         INIT_DO_TWO: {
             'next_state': INIT_CHECK_TWO,
             'action': 'counting',
         },
         INIT_CHECK_TWO: {
             'next_state': None,
             'alpha': 0.05,
             'action': 'checking',
         },
     }

     # INIT_ROUNDS:
     #   INIT_ROUNDS[i] represents the cumulative number of times it has run on
     #   both ends in the i-th round.
     # INIT_P_BOUND:
     #   For the first three rounds(e.g. round i), if the p-value calculated by
     #   the statistical method is greater than or equal to INIT_P_BOUND[i], the
     #   regression doesn't exist.
     #   For the last round, if the p-value calculated by the statistical method
     #   is greater than or equal to alpha, the regression doesn't exist.
     # INIT_P_BOUND is used to eliminate tests which apparently don't have
     #   regression.
     INIT_ROUND = [5, 10, 15, 25]
     INIT_P_BOUND = [0.65, 0.55, 0.5]

     def __init__(
         self,
         rev_info,
         old_idx=0,
         new_idx=None,
         old_value=None,
         new_value=None,
         term_map=None,
         recompute_init_values=False,
         oracle=None,
         observation=None,
         confidence=0.999,
         verify_confidence=0.999,
         endpoint_verification=None,
         saved_states=None,
     ):
         """Initializes NoisyBinarySearch.

         The probability model could be specified in three ways:
           - If oracle=(old_p, new_p) is specified. This means probability
             distribution is known.
               P(new behavior | old version) = old_p
               P(new behavior | new version) = new_p

           - If observation is specified, which means although the probability
             distribution is unknown, some previous test results are provided. The
             algorithm will re-estimate probability overtime once more test results
             feed.

             For example, prior_observation="old=2/3,new=9/10" means
                 old version failed 2 times out of 3 times, and
                 new version failed 9 times out of 10 times,
             where "new behavior" is "failed" in this case.

             If one of them is known non-noisy, it could be omitted. For example,
             "new=9/10" (old omitted) means old versions always have old behavior.

           - If both oracle and prior_observation are not specified, it means the
             signal is not noisy and this class will fallback to classic binary
             search.

         If recompute_init_values is True, the state transition is:
           1. `old_value` and `new_value` have values
              but `threshold` is None
              => `prob` is None
           2. _next_idx_for_initial() return None, so
              `old_avg`, `new_avg`, and `threshold` could be calculated
              => reclassify 'old' and 'new'

         Args:
           rev_info: List of RevInfo.
           old_idx: Index of rev, which has old behavior.
           new_idx: Index of rev, which has new behavior.
           old_value: For value bisection, the value of old version.
           new_value: For value bisection, the value of new version.
           term_map: Alternative term for states
           recompute_init_values: For value bisection, recompute values of old and
             new versions.
           oracle: The oracle model of probability distribution.
           observation: Observed test results.
           confidence: Required confidence for bisection result.
           verify_confidence: Bisect range verification confidence.
           endpoint_verification: Flag indicating whether the statistical
             verification method is enabled.
           saved_states: internal states from previous execution, if any.
         """
         assert oracle is None or observation is None
         self._state = self.INITED
         # guards self._state. See self.state()
         self.dirty = True
         if new_idx is None:
             new_idx = len(rev_info) - 1
         assert rev_info is not None
         self.rev_info = rev_info
         self.old_idx = old_idx
         self.new_idx = new_idx
         self.old_value = old_value
         self.new_value = new_value
         self.term_map = term_map or {}
         self.recompute_init_values = recompute_init_values
         self.oracle = oracle
         if oracle:
             self.old_p, self.new_p = self.oracle
             self.prior_observation = None
         elif observation:
             self.prior_observation = self._parse_observation(observation)
             self.observation = None
             (left_new, left_trial), (
                 right_new,
                 right_trial,
             ) = self.prior_observation
             self.old_p, self.new_p = self._observation_to_prob(
                 left_new, left_trial, right_new, right_trial
             )
         else:
             self.prior_observation = None
             self.old_p, self.new_p = 0, 1
         self.confidence = confidence
         self.verify_confidence = verify_confidence

         self.prob = None

         # The probability of a revision to be the one introduce new behavior.
         # Initialize weight to 1.0 for revs in between old_idx and new_idx; and
         # to 0.0 otherwise.
         # TODO(kcwu): use heuristic to guess initial weight.
         self.init_weight = [
             1.0 if self.old_idx < i <= self.new_idx else 0.0
             for i in range(len(self.rev_info))
         ]
         self.skip_weight = None
         self.init_prob = None

         self.threshold = None
         if old_value is not None:
             if self.recompute_init_values:
                 self.old_avg = None
                 self.new_avg = None
             else:
                 self.old_avg = old_value
                 self.new_avg = new_value
                 self.threshold = (old_value + new_value) / 2

         if endpoint_verification is None:
             self.endpoint_verification = ENABLE_AUTO_ENDPOINT_VERIFICATION
         else:
             self.endpoint_verification = endpoint_verification

         # init_verify_state includes: INIT_DO_ONE, INIT_CHECK_ONE,
         #                             INIT_DO_TWO, INIT_CHECK_TWO
         # init_range_verified denotes whether the initial range has been verified
         # with statistical method.
         # count records the results in both ends.
         self.init_rounds = 0
         self._init_verify_state = self.INIT_DO_ONE
         self.init_range_verified = False
         self.count_revold_old = 0
         self.count_revold_new = 0
         self.count_revnew_old = 0
         self.count_revnew_new = 0

         # passed_noise_rate denotes whether the noise rate calculated during the
         # initial range verification stage has been passed to noisy binary search.
         self.passed_noise_rate = False

         # Reads in saved states from previuos execution, if any.
         self._import_states(saved_states)

     def export_states(self) -> strategy_states.States:
         return strategy_states.States(
             state=self.state,
             init_verify_state=self.init_verify_state,
             init_range_verified=self.init_range_verified,
             init_rounds=self.init_rounds,
             count_revold_old=self.count_revold_old,
             count_revold_new=self.count_revold_new,
             count_revnew_old=self.count_revnew_old,
             count_revnew_new=self.count_revnew_new,
             passed_noise_rate=self.passed_noise_rate,
         )

     def _import_states(self, states: strategy_states.States):
         logger.debug('_import_states: %s', states)
         if not states:
             return

         if states.state is not None:
             self._state = states.state
             if self._state == self.STARTED:
                 self._init_for_state_started()
             self.dirty = False
         if states.init_verify_state is not None:
             self._init_verify_state = states.init_verify_state
         if states.init_range_verified is not None:
             self.init_range_verified = states.init_range_verified
         if states.init_rounds is not None:
             self.init_rounds = states.init_rounds
         if states.count_revold_old is not None:
             self.count_revold_old = states.count_revold_old
         if states.count_revold_new is not None:
             self.count_revold_new = states.count_revold_new
         if states.count_revnew_old is not None:
             self.count_revnew_old = states.count_revnew_old
         if states.count_revnew_new is not None:
             self.count_revnew_new = states.count_revnew_new
         if states.passed_noise_rate is not None:
             self.passed_noise_rate = states.passed_noise_rate

     def is_noisy(self):
         """Is noisy case?"""
         if self.oracle and self.oracle != (0, 1):
             return True
         return self.prior_observation is not None

     @property
     def init_verify_state(self):
         current_state = self._init_verify_state
         if self.INIT_STATUS_MAP[current_state]['action'] == 'checking':
             return self._init_verify_state

         current_round = self.INIT_ROUND[self.init_rounds]
         if (
             self.count_revnew_old + self.count_revnew_new >= current_round
             and self.count_revold_old + self.count_revold_new >= current_round
         ):
             self._init_verify_state = self.INIT_STATUS_MAP[current_state][
                 'next_state'
             ]
         return self._init_verify_state

     @property
     def state(self):
         if self._state == self.INITED:
             if not self.dirty:
                 return self._state
             # It may raise exception. So if we don't want to get exception in
             # read-only operations like get_range() across execution, save
             # internal states to make self.dirty False after initialization.
             try:
                 idx = self._next_idx_for_initial()
             finally:
                 self.dirty = False
             if idx is not None:
                 return self._state
             self._state = self.STARTED
             self._init_for_state_started()

         assert self._state == self.STARTED, 'unexpected state: %s' % self._state
         return self._state

     def _init_for_state_started(self):
         """build internal states when self._state reached STARTED.

         Raises:
           errors.WrongAssumption: failed to calculate self.prob.
         """
         assert self._state == self.STARTED
         if self.is_value_bisection() and self.recompute_init_values:
             if self.old_avg is None:
                 self.old_avg = math_util.average(
                     self.rev_info[self.old_idx].averages()
                 )
             if self.new_avg is None:
                 self.new_avg = math_util.average(
                     self.rev_info[self.new_idx].averages()
                 )
             if self.threshold is None:
                 self.threshold = (self.old_avg + self.new_avg) / 2
             self.rev_info[self.old_idx].reclassify(
                 self.old_avg, self.threshold, self.new_avg
             )
             self.rev_info[self.new_idx].reclassify(
                 self.old_avg, self.threshold, self.new_avg
             )

         if self.prob is None:
             self._rebuild()
             self._estimate_model()
             assert self.prob

     def add_sample(self, idx, status=None, **kwargs):
         sample = {"status": status, **kwargs}
         self.rev_info[idx].add_sample(**sample)
         self.dirty = True

         status = sample['status']

         if self.endpoint_verification and not self.is_value_bisection():
             self.renew_initial_count(status, idx)

         self.check_verification_range()
         if status == 'skip':
             self.check_proceed_with_skip(
                 idx, sample.get('reason', 'something wrong')
             )
         if self.state == self.INITED:
             pass
         elif self.state == self.STARTED:
             if status == 'value':
                 status = self.classify_result_from_values(sample['values'])
             assert self.prob
             for _ in range(sample.get('times', 1)):
                 self._update(idx, status)
         else:
             assert 0, 'unexpected state: ' + self.state

     @staticmethod
     def _parse_observation(observation):
         """Parses prior from argument.

         Args:
           observation: A string denoting previous evaluation results. See comments
               of __init__ for detail format.

         Returns:
           ((old false-positive, old trials), (new true-positive, new trials))
         """
         # default not noisy
         old_f = NOT_NOISY
         new_t = NOT_NOISY
         if not observation:
             return old_f, new_t

         for term in observation.split(','):
             m = re.match(r'^(old|new)=(\d+)/(\d+)$', term)
             assert m
             new_behavior, trial = int(m.group(2)), int(m.group(3))
             assert 0 <= new_behavior <= trial

             if m.group(1) == 'new':
                 new_t = new_behavior, trial
             elif m.group(1) == 'old':
                 old_f = new_behavior, trial

         return old_f, new_t

     def _gather_noise_count(self):
         """Gathers noise count from current run results.

         After few rounds of binary search, the candidates could be split into three
         groups.
           - left: high confidence to have OLD behavior.
           - middle: not enough confidence, need further inspection.
           - right: high confidence to have NEW behavior.

         Returns:
           left_new: how many NEW in left region.
           left_trial: how many trials in left region.
           right_new: how many NEW in right region.
           right_trial: how many trials in right region.
         """
         if self.observation:
             (left_new, left_trial), (right_new, right_trial) = self.observation
             return left_new, left_trial, right_new, right_trial

         (left_new, left_trial), (
             right_new,
             right_trial,
         ) = self.prior_observation
         if (left_new, left_trial) != NOT_NOISY:
             info = self.rev_info[self.old_idx]
             left_trial += info['old'] + info['new']
             left_new += info['new']
         if (right_new, right_trial) != NOT_NOISY:
             info = self.rev_info[self.new_idx]
             right_trial += info['old'] + info['new']
             right_new += info['new']

         return left_new, left_trial, right_new, right_trial

     @staticmethod
     def _expected_probability_from_observation(trial, num):
         """Expected probability from given observation.

         Given `trial` times of Bernoulli experiment, the results are 1 for `num`
         times. Calculates the expected probability.

         Returns:
           prob: the expected probability.
         """
         # P(observation)
         #   = \int_0^1 C(trial, num) * p^num * (1-p)^(trial-num) dp
         #   = 1 / (trial+1)
         # P(p | observation)
         #   = P(observation | p) * P(p) / P(observation)
         #   = C(trial, num) * p^num * (1-p)^(trial-num) * P(p) * (trial+1)
         # E[P(p | observation)]
         #   = \int_0^1 P(p | observation) * p dp
         #   = C(trial, num) * \int_0^1 p^num * (1-p)^(trial-num)*p dp
         #   = (num+1)/(trial+2)
         return float(num + 1) / (trial + 2)

     @staticmethod
     def _observation_to_prob(left_new, left_trial, right_new, right_trial):
         old_p, new_p = 0, 1
         if (left_new, left_trial) != NOT_NOISY:
             old_p = NoisyBinarySearch._expected_probability_from_observation(
                 left_trial, left_new
             )
         if (right_new, right_trial) != NOT_NOISY:
             new_p = NoisyBinarySearch._expected_probability_from_observation(
                 right_trial, right_new
             )
         return old_p, new_p

     def _estimate_model_from_observation(self):
         """Estimates model from observation.

         We don't know the real probability distribution. Assume they are Bernoulli
         process and calculates the expected probability from observations.

         Because the model will be slightly different after _rebuild(), we will
         repeat until it is stable.
         """
         assert self.prior_observation
         # TODO: endpoint verification for perf metrics has not been implemented yet.
         # When endpoint_verification is enabled, 'observation' will be calculated
         # based on the results of statistical methods, not user input.
         if self.endpoint_verification and not self.is_value_bisection():
             (left_new, left_trial), (right_new, right_trial) = ((0, 0), (0, 0))
         else:
             (left_new, left_trial), (
                 right_new,
                 right_trial,
             ) = self.prior_observation

         seen = set()
         while True:
             old_idx, new_idx = self.get_range()
             if (old_idx, new_idx) in seen:
                 break
             seen.add((old_idx, new_idx))
             if (left_new, left_trial) != NOT_NOISY:
                 for info in self.rev_info[: old_idx + 1]:
                     left_trial += info['old'] + info['new']
                     left_new += info['new']
             if (right_new, right_trial) != NOT_NOISY:
                 for info in self.rev_info[new_idx:]:
                     right_trial += info['old'] + info['new']
                     right_new += info['new']

             self.observation = (left_new, left_trial), (right_new, right_trial)
             self.old_p, self.new_p = self._observation_to_prob(
                 left_new, left_trial, right_new, right_trial
             )

             if (
                 not self.passed_noise_rate
                 and self.endpoint_verification
                 and not self.is_value_bisection()
             ):
                 logger.info(
                     'Failure rate is recalculated: old=%d/%d (%f%%), new=%d/%d (%f%%)',
                     self.observation[0][0],
                     self.observation[0][1],
                     self.old_p * 100,
                     self.observation[1][0],
                     self.observation[1][1],
                     self.new_p * 100,
                 )
                 self.passed_noise_rate = True

             if (self.old_p, self.new_p) in seen:
                 break
             seen.add((self.old_p, self.new_p))
             self._rebuild()

     def _estimate_model(self):
         """Estimates probability of old false-positive and new true-positive.

         In other words, the rates of new behavior is observed at old revisions
         and new revisions respectively.
         """
         if self.oracle:
             self.old_p, self.new_p = self.oracle
         elif self.prior_observation:
             self._estimate_model_from_observation()
             logger.debug(
                 'estimated model: old_p=%s, new_p=%s', self.old_p, self.new_p
             )
         else:
             self.old_p, self.new_p = 0, 1

         if 0 < self.old_p < 1.0 - self.confidence:
             logger.warning(
                 'Confidence value (%r) is too low to detect '
                 'old candidates with new behavior (p=%r)',
                 self.confidence,
                 self.old_p,
             )
         if self.confidence < self.new_p < 1.0:
             logger.warning(
                 'Confidence value (%r) is too low to detect '
                 'new candidates with old behavior (p=%r)',
                 self.confidence,
                 1.0 - self.new_p,
             )

     def get_noise_observation(self) -> str | None:
         """Gets current noise observation.

         This includes the counts of prior observations from __init__.

         Returns:
           A string indicates eval result noise observation so far. See comments of
           __init__ for detail format.
         """
         assert not self.oracle
         if not self.prior_observation:
             return None
         (
             left_new,
             left_trial,
             right_new,
             right_trial,
         ) = self._gather_noise_count()
         observation = []
         if (left_new, left_trial) != NOT_NOISY:
             observation.append('old=%d/%d' % (left_new, left_trial))
         if (right_new, right_trial) != NOT_NOISY:
             observation.append('new=%d/%d' % (right_new, right_trial))
         return ','.join(observation)

     @staticmethod
     def get_weight_by_skip(rev_info):
         """Gets weighting adjustment according to 'skip' test result.

         There are two cases of 'skip':
           1. The tested version is bad (say, compile error). In such case, its
              neighbors are likely 'skip' as well.
           2. Some thing wrong temporarily (say, network error). Needs retry.

         Ideal solution to handle 'skip' properly during binary search:
           - Avoid (less likely) to waste time to eval the 'skip' versions and
             their neighbors.
           - If other candidates are ruled out, we still need to retry them.
           - If a version keeps 'skip' few times for every retry, it is not
             temporarily case 2, we can ignore them completely.

         It's easy to implement above idea. The secret is that we can assign prior,
         P(x_i is first), of the Bayesian formula.
           - We can assign smaller probabilities if we prefer not to try them.
           - We can assign zero probabilities if we don't want to try them.

         Args:
           rev_info: List of RevInfo.

         Returns:
           list of weighting; 1.0 for not adjusted; otherwise, lower values.
         """
         # heuristic, chosen arbitrary for now
         impact_range = int(math.sqrt(len(rev_info)))
         impact_factor = 0.9

         weighting = [1.0] * len(rev_info)
         for i, info in enumerate(rev_info):
             # If ever non-skip, we don't care their 'skip' record.
             if info['new'] or info['old']:
                 continue
             if not info['skip']:
                 continue

             # If retried enough times but still 'skip', don't try them forever.
             if info['skip'] >= SKIP_FOREVER:
                 weighting[i] = 0
             else:
                 # Allow retry, but the more 'skip', the lower weighting.
                 weighting[i] *= 0.1 ** info['skip']

         # Adjust weighting of neighbors.
         # Following formula is chosen arbitrarily. The main idea is "impact" is
         # correlated to the distance until reaching non-skip version.
         # p.s. e**(-pi*x**2) is borrowed from normal distribution.
         for i, info in enumerate(rev_info):
             # If ever non-skip, we don't care their 'skip' record.
             if info['new'] or info['old']:
                 continue
             if not info['skip']:
                 continue

             # left to right
             for j in range(i + 1, min(i + impact_range, len(rev_info))):
                 if rev_info[j]['new'] or rev_info[j]['old']:
                     break
                 x = float(i - j) / impact_range
                 impact = math.e ** (-math.pi * x**2) * impact_factor
                 weighting[j] = min(weighting[j], 1.0 - impact)

             # right to left
             for j in reversed(list(range(max(0, i - impact_range + 1), i))):
                 if rev_info[j]['new'] or rev_info[j]['old']:
                     break
                 x = float(i - j) / impact_range
                 impact = math.e ** (-math.pi * x**2) * impact_factor
                 weighting[j] = min(weighting[j], 1.0 - impact)

         return weighting

     @staticmethod
     def _calculate_probs(old_p, new_p, rev_info, init_prob):
         """Calculates P(x_i is first new | results) for all candidates.

         What we want:
           prob[i] = P(x_i is first new | results)
                   = P(x_i is first new) * P(results | x_i is first new) / P(results)

         Args:
           old_p: P(observe=new | x=old)
           new_p: P(observe=new | x=new)
           rev_info: List of RevInfo.
           init_prob: unnormalized P(x_i is first new)

         Returns:
           prob, where prob[i] = P(x_i is first new | results)

         Raises:
           errors.WrongAssumption if failed to calculate prob.
         """
         assert 0 <= old_p < new_p

         # First, let prob[i] = P(x_i is first new). Because prob[i] will be
         # normalized later, it's okay to use unnormalized init_prob here directly.
         prob = list(init_prob)

         total_old = sum(info['old'] for info in rev_info)
         total_new = sum(info['new'] for info in rev_info)

         # Estimate the lower bound of prob[i].
         # If the value is less than minimum float value, use ExtendedFloat for
         # calculation to prevent underflow. See comments of ExtendedFloat for more
         # detail.
         min_prob = math_util.least([old_p, 1 - old_p, new_p, 1 - new_p]) ** (
             total_old + total_new
         ) * math_util.least(prob)
         if min_prob < sys.float_info.min:
             prob = [math_util.ExtendedFloat(p) for p in prob]
             old_p = math_util.ExtendedFloat(old_p)
             new_p = math_util.ExtendedFloat(new_p)

         # These four variables, {left,right}_{old,new}, are corresponding to x_i.
         # That is, initialized to corresponding to x_0 and incrementally updated at
         # end of each loop iteration.
         # left_old means how many observed old behavior left to x_i. So on and so
         # forth.
         left_old = left_new = 0
         right_old = total_old
         right_new = total_new
         for i, p in enumerate(prob):
             # prob[i] *= P(results | x_i is first new)
             if p != 0:
                 if left_new != 0:
                     p *= old_p**left_new
                 if left_old != 0:
                     p *= (1 - old_p) ** left_old
                 if right_new != 0:
                     p *= new_p**right_new
                 if right_old != 0:
                     p *= (1 - new_p) ** right_old
                 prob[i] = p

             # Update count for next iteration.
             left_old += rev_info[i]['old']
             left_new += rev_info[i]['new']
             right_old -= rev_info[i]['old']
             right_new -= rev_info[i]['new']

         # P(results) = \sum_i P(results | x_i is first new) * P(x_i is first new)
         #            = \sum_i prob[i]
         sum_prob = sum(prob)

         if old_p == 0 and new_p == 1:
             if sum_prob == 0:
                 raise errors.WrongAssumption(
                     'Test results conflict with assumption, try --noisy'
                 )
         else:
             assert sum_prob > 0

         # prob[i] /= P(results)
         prob = [x / sum_prob for x in prob]

         # Convert back to float.
         prob = [float(p) for p in prob]
         return prob

     def _rebuild(self):
         """Rebuilds probability.

         Raises:
           errors.WrongAssumption: failed to calculate self.prob.
         """
         assert 0 <= self.old_idx < self.new_idx < len(self.rev_info)
         # First, P(x_i is first new) = Normalize(init_prob[i])
         #                            = Normalize(init_weight[i] * skip_weight[i])
         self.skip_weight = self.get_weight_by_skip(self.rev_info)
         self.init_prob = [
             p * w for p, w in zip(self.init_weight, self.skip_weight)
         ]
         self.prob = self._calculate_probs(
             self.old_p, self.new_p, self.rev_info, self.init_prob
         )

     def _is_changing_skip_weight(self, idx, result):
         if result == 'skip':
             # It is skipped, so skip_weight must be changed.
             return True

         # Not the first non-skip result, skip_weight is unaffected.
         if self.rev_info[idx][result] > 1:
             return False

         assert self.rev_info[idx][result] == 1
         # For the first non-skip result, skip_weight is affected if itself or its
         # neighbor's skip_weight != 1.
         for i in [idx - 1, idx, idx + 1]:
             if 0 <= i < len(self.rev_info) and self.skip_weight[i] != 1:
                 return True

         return False

     def is_value_bisection(self):
         return self.old_value is not None and self.new_value is not None

     def classify_result_from_values(self, values):
         assert (
             values
         ), 'eval script terminated normally but does not output values?'
         if self.state == self.INITED and self.recompute_init_values:
             return 'value'
         assert self.threshold is not None
         assert values
         avg = math_util.average(values)
         if self.old_avg < self.new_avg:
             return 'old' if avg < self.threshold else 'new'
         return 'new' if avg < self.threshold else 'old'

     def _update(self, idx, result):
         """Incremental update by one more test result.

         Args:
           idx: index of the new test.
           result: result of the new test. 'old', 'new', or 'skip'.

         Raises:
           errors.WrongAssumption: failed to calculate self.prob.
         """
         assert self.prob
         assert self.rev_info

         if self._is_changing_skip_weight(idx, result):
             # Incremental update by skip_weight.
             new_skip_weight = self.get_weight_by_skip(self.rev_info)
             for i, (cur_skip_w, new_skip_w) in enumerate(
                 zip(self.skip_weight, new_skip_weight)
             ):
                 if new_skip_w != cur_skip_w:
                     # A non-skip result after several skips, the weight will jump from
                     # zero to non-zero and unable to update incrementally.
                     if cur_skip_w == 0:
                         self._rebuild()
                         return
                     self.prob[i] *= new_skip_w / cur_skip_w
             self.skip_weight = new_skip_weight

         # If not skip, prob[i] needs to multiply probability of new observation.
         #   prob[i] *= P(last result | x_i is first new).
         # Let's consider the value of multiplier,
         #   if result=new && i <= idx: P(true-positive)
         #   if result=new && i > idx: P(false-positive)
         #   if result=old && i <= idx: P(true-negative) = 1 - P(true-positive)
         #   if result=old && i > idx: P(false-negative) = 1 - P(false-positive)
         if result == 'new':
             for i in range(idx + 1):
                 self.prob[i] *= self.new_p
             for i in range(idx + 1, len(self.prob)):
                 self.prob[i] *= self.old_p
         if result == 'old':
             for i in range(idx + 1):
                 self.prob[i] *= 1 - self.new_p
             for i in range(idx + 1, len(self.prob)):
                 self.prob[i] *= 1 - self.old_p

         sum_prob = sum(self.prob)
         assert sum_prob > 0
         self.prob = [x / sum_prob for x in self.prob]

         if self.prior_observation:
             self._estimate_model()

     def get_range(self, confidence=None):
         """Gets narrowed-down bisection range so far.

         Using a heuristic algorithm to find a semi-open range (left, right] that,
         sum(prob[left+1:right+1]) >= confidence. Some optimizations are applied so
         the range may not be the narrowest.

         If `confidence` value is bigger, the returned range is bigger.

         Args:
           confidence: Confidence value. Default is the confidence provided in ctor.

         Returns:
           (left, right):
             left: Index likely before the point of probability change.
             right: Index likely after the point of probability change.

         Raises:
           errors.WrongAssumption: the bisection range is invalid
         """
         if self.state == self.INITED:
             return self.old_idx, self.new_idx
         assert self.prob

         if confidence is None:
             confidence = self.confidence

         def order_by_prob(x):
             index, prob = x
             return -prob, index

         # Non-zero probabilities sorted by prob (big to small), then index (small
         # to big)
         sorted_prob = sorted(
             ((i, p) for (i, p) in enumerate(self.prob) if p > 0),
             key=order_by_prob,
         )
         cumulative = 0
         cut = None
         candidates = set()
         # This loop finds a range whose sum of probabilities are bigger than the
         # value of confidence. But on the edge of threshold, it will include more
         # candidates until the ratio of probability delta from the cut point is
         # larger than a predefined gap, which is empirical.
         GAP = 0.5
         # p is monotonically decreasing.
         for i, p in sorted_prob:
             cumulative += p
             if cut is None:
                 if cumulative >= confidence:
                     cut = p
             else:
                 if p < cut * GAP:
                     break
             candidates.add(i)
         assert cut is not None or confidence == 1

         left = max(min(candidates) - 1, self.old_idx)
         # Sometimes left has old probability due to 'skip', not 'old'. Seeks for
         # version with 'old'.
         while left > self.old_idx and self.rev_info[left]['old'] == 0:
             left -= 1

         right = max(candidates)
         assert right <= self.new_idx
         return left, right

     def check_verification_range(self):
         """Checks if bisection range is reasonable.

         If the eval results conflict with the bisection range, abort the
         bisection if the probability is too low.

         Raises:
           errors.VerificationFailed: Bisection range is not verified and false
               events occur too many times.
         """
         if self.state == self.INITED and self.recompute_init_values:
             return

         if (
             self.state == self.INITED
             and self.endpoint_verification
             and not self.is_value_bisection()
         ):
             return

         if self.rev_info[self.old_idx]['old'] == 0:
             # P(try N times and old behavior occurs 0 times)
             p = self.old_p ** self.rev_info[self.old_idx]['new']
             if p < 1.0 - self.verify_confidence:
                 raise errors.VerifyOldBehaviorFailed(
                     self.rev_info[self.old_idx].rev,
                     self.rev_info[self.old_idx]['new'],
                     term_old=self.term_map.get('old', 'old'),
                     term_new=self.term_map.get('new', 'new'),
                 )

         if self.rev_info[self.new_idx]['new'] == 0:
             # P(try N times and new behavior occurs 0 times)
             p = (1 - self.new_p) ** self.rev_info[self.new_idx]['old']
             if p < 1.0 - self.verify_confidence:
                 raise errors.VerifyNewBehaviorFailed(
                     self.rev_info[self.new_idx].rev,
                     self.rev_info[self.new_idx]['old'],
                     term_old=self.term_map.get('old', 'old'),
                     term_new=self.term_map.get('new', 'new'),
                 )

     def check_reproduced(self):
         """Checks if the bisection is reproduced.

         The bsearch contains two phases "INITED" and "STARTED".
         In "INITED" state, the "old" and "new" revisions are checked to see if
         the regression is reproducible. After verified, it enters "STARTED"
         phase.

         Returns:
           True if the regression is reproduced.
         """
         return self.state == self.STARTED

     def check_proceed_with_skip(self, idx, reason):
         """Checks if we should proceed after skips.

         We just got one more 'skip' result. Should we tolerate it or stop bisecting?

         Args:
           idx: index of the new skip.
           reason: the reason why a candidate reported 'skip' status

         Raises:
           errors.TooManyTemporaryErrors: skip too many times, probably serious issue and
               unable to proceed
         """
         # we just got skip
         assert self.rev_info[idx]['skip'] > 0

         non_skip_count = self.rev_info[idx]['old'] + self.rev_info[idx]['new']
         if (
             idx in (self.old_idx, self.new_idx)
             and non_skip_count == 0
             and self.rev_info[idx]['skip'] >= SKIP_FOREVER
         ) or (
             self.rev_info[idx]['skip']
             >= (non_skip_count + 1) * (SKIP_FOREVER + 2)
         ):
             raise errors.TooManyTemporaryErrors(
                 'unable to evaluate rev=%r (skip) %d times: %s'
                 % (self.rev_info[idx].rev, self.rev_info[idx]['skip'], reason)
             )

     def show_summary(self, more=False):
         """Shows top ranked candidates.

         Args:
           more: If true, shows versions with any test result as well.
         """

         def show_candidate(idx):
             if self.state == self.STARTED and self.is_noisy():
                 logger.info(
                     '[%d] %s %.4f%%; %s',
                     idx,
                     self.rev_info[idx].rev,
                     self.prob[idx] * 100,
                     self.rev_info[idx].summary(),
                 )
             else:
                 logger.info(
                     '[%d] %s; %s',
                     idx,
                     self.rev_info[idx].rev,
                     self.rev_info[idx].summary(),
                 )

         if self.state == self.INITED:
             logger.info('Verifying initial range:')
             for i in (self.old_idx, self.new_idx):
                 show_candidate(i)
             return

         prob_rank = sorted(
             ((p, i) for i, p in enumerate(self.prob) if p > 0), reverse=True
         )
         left, right = self.get_range(1.0 - self.confidence)
         interesting = set([left, right])

         if self.is_noisy():
             # show top candidates
             interesting.update(i for p, i in prob_rank[:10])
         else:
             # show all skip within range
             for i in range(left + 1, right):
                 if self.rev_info[i]['skip'] and self.prob[i] > 0:
                     interesting.add(i)

         if more:
             # show all revs with test results
             for i, info in enumerate(self.rev_info):
                 if sum(info.result_counter.values()) > 0:
                     interesting.add(i)

         size = right - left
         if size > 1 and not self.is_noisy():
             steps = math.ceil(math.log(size) / math.log(2))
             logger.info(
                 'Range: (%s, %s], %d revs left (roughly %.0f steps)',
                 self.rev_info[left].rev,
                 self.rev_info[right].rev,
                 size,
                 steps,
             )
         else:
             logger.info(
                 'Range: (%s, %s], %d revs left',
                 self.rev_info[left].rev,
                 self.rev_info[right].rev,
                 size,
             )

         for i in sorted(interesting):
             show_candidate(i)

     def remaining_steps(self):
         # TODO(kcwu): estimate remaining steps for noisy case
         if self.is_noisy():
             return None

         left, right = self.get_range()
         return math.ceil(math.log(right - left) / math.log(2))

     def is_done(self):
         """If we have an answer with high confidence."""
         if self.state == self.INITED:
             return False
         assert self.prob
         # TODO(kcwu): revise to consider the real probability of true-positive is
         #             far away to estimation.
         return max(self.prob) > self.confidence

     def get_best_guess(self):
         """Gets current best guess.

         If called before binary search done, it just returns one of candidates with
         highest probability.

         Returns:
           Index of the element with max probability.
         """
         if self.state == self.INITED:
             return None
         assert self.prob
         return self.prob.index(max(self.prob))

     @staticmethod
     def _next_idx_by_utility(old_p, new_p, prob, cost_table=None):
         """Determines next index to binary search according to utility."""
         assert prob
         if cost_table is None:
             cost_table = [(1, 1)] * len(prob)
         orig_stats = math_util.EntropyStats([float(p) for p in prob])
         orig_entropy = orig_stats.entropy()
         new_stats = orig_stats.multiply(old_p)
         old_stats = orig_stats.multiply(1.0 - old_p)

         # Enumerate |i| as next to query, find best |i| to maximize utility.
         best = float('-inf')
         best_i = None
         # |s| = cumulative probability of [p_0,..., p_i]
         s = 0.0
         for i, p in enumerate(prob):
             # If rev[i] is observed with old behavior, the result entropy is
             #   entropy( [p_0, ... p_i] * (1-new_p) + (p_i, ... p_n) * (1-old_p) )
             old_stats.replace(p * (1.0 - old_p), p * (1.0 - new_p))

             # If rev[i] is observed with new behavior, the result entropy is
             #   entropy( [p_0, ... p_i] * new_p + (p_i, ... p_n) * old_p )
             new_stats.replace(p * old_p, p * new_p)

             # Let s = sum(p[:i+1]).
             s += p
             # Expected probability to observe new behavior.
             r = s * new_p + (1.0 - s) * old_p

             # The expected entropy after getting the next result at rev[i].
             entropy = r * new_stats.entropy() + (1.0 - r) * old_stats.entropy()
             information_gain = orig_entropy - entropy
             cost = r * cost_table[i][1] + (1.0 - r) * cost_table[i][0]
             utility = information_gain / cost
             if utility > best:
                 best = utility
                 best_i = i

         return best_i

     def statistics_test(self, round_idx):
         """Helper function for verify_for_init_state

         Returns:
           p value calculated with Boschloo's Exact Test.
           See https://en.wikipedia.org/wiki/Boschloo%27s_test for reference.
         """
         assert (
             self.count_revnew_old + self.count_revnew_new
             >= self.INIT_ROUND[round_idx]
         )
         assert (
             self.count_revold_old + self.count_revold_new
             >= self.INIT_ROUND[round_idx]
         )

         test_table = [
             [self.count_revold_old, self.count_revnew_old],
             [self.count_revold_new, self.count_revnew_new],
         ]
         res = boschloo_exact(test_table, alternative='greater')
         return res.pvalue

     def _verify_for_init_state(self):
         """Returns index to probe for initial samples with statistical method"""
         current_state = self.init_verify_state

         # DO_ONE & DO_TWO
         if self.INIT_STATUS_MAP[current_state]['action'] == 'counting':
             if (
                 self.count_revnew_old + self.count_revnew_new
                 < self.INIT_ROUND[self.init_rounds]
             ):
                 return self.new_idx

             if (
                 self.count_revold_old + self.count_revold_new
                 < self.INIT_ROUND[self.init_rounds]
             ):
                 return self.old_idx
             raise AssertionError(
                 'Should not get the initial state: %s' % current_state
             )

         # CHECK_ONE & CHECK_TWO
         if self.INIT_STATUS_MAP[current_state]['action'] == 'checking':
             pvalue = self.statistics_test(self.init_rounds)
             alpha = self.INIT_STATUS_MAP[current_state]['alpha']

             if pvalue < alpha:
                 self.init_range_verified = True
                 return None

             if (
                 self.INIT_STATUS_MAP[current_state]['next_state'] is not None
                 and pvalue < self.INIT_P_BOUND[self.init_rounds]
             ):
                 self.init_rounds += 1
                 if self.init_rounds == len(self.INIT_P_BOUND):
                     self._init_verify_state = self.INIT_STATUS_MAP[
                         current_state
                     ]['next_state']
                 else:
                     self._init_verify_state = self.INIT_STATUS_MAP[
                         current_state
                     ]['last_state']
                 return self.new_idx

             left_new = self.count_revold_new
             left_trial = self.count_revold_old + self.count_revold_new
             right_new = self.count_revnew_new
             right_trial = self.count_revnew_old + self.count_revnew_new
             old_p, new_p = self._observation_to_prob(
                 left_new, left_trial, right_new, right_trial
             )
             raise errors.VerifyInitialRangeFailed(
                 self.init_rounds, old_p, new_p
             )

         # get the wrong init_verify_state
         raise AssertionError(
             "current state should be one of init_verify_state's states, but got %s"
             % current_state
         )

     def _next_idx_for_initial(self):
         """Returns index to probe for initial samples

         1. Enough sample points for old and new version. By "enough", it means
            (weak heuristic),
           - At leat two samples.
           - If the two samples are different, we need more.
           - If eval is fast, we should get more samples.

         2. Compare the value distribution between old and new version. Acquire
         more samples until they are roughly distinguishable.
           - Within reasonable time.
         """
         assert self._state == self.INITED
         if not self.recompute_init_values:
             # If the initial range has already been verified with statistical method,
             # the state will change to STARTED after resuming the test.
             if self.init_range_verified:
                 return None

             # Test the initial range of functional tests with statistical method,
             # and get the next index.
             if self.endpoint_verification and not self.is_value_bisection():
                 idx = self._verify_for_init_state()
                 return idx

             # Test the initial range without statistical method, and get the next
             # index.
             if self.rev_info[self.new_idx]['new'] == 0:
                 return self.new_idx
             if self.rev_info[self.old_idx]['old'] == 0:
                 return self.old_idx
             return None

         def is_one_side_enough(info):
             count = info['value']
             if count > 30:
                 return True
             if count < 2:
                 return False
             averages = info.averages()
             if count == 2 and averages[0] != averages[1]:
                 return False

             # Spend 5 minutes for one side sounds reasonable.
             return info.eval_time > 300

         def check_trend_score(old_avg, new_avg):
             """Checks whether the recomputed values exhibit the expected trend.

             Args:
               old_avg: average of the recomputed values at the "old" revision.
               new_avg: average of the recomputed values at the "new" revision.

             Raises:
               errors.WrongAssumption if the recomputed values are unlikely to
               match the trend given by old_value and new_value.
             """
             score = trend_score(
                 self.old_value, self.new_value, old_avg, new_avg
             )
             if score <= 0:
                 raise errors.WrongAssumption(
                     'unable to reproduce the same trend (%f->%f): %f->%f'
                     % (
                         self.old_value,
                         self.new_value,
                         old_avg,
                         new_avg,
                     )
                 )
             # Reject if we cannot reproduce the trend even half way.
             # TODO(kcwu): make this threshold configurable.
             if score < 0.5:
                 expect_delta = abs(self.new_value - self.old_value)
                 actual_delta = abs(new_avg - old_avg)
                 if self.old_value == 0 or old_avg == 0:
                     raise errors.WrongAssumption(
                         'unable to reproduce the value gap: '
                         'expect delta=%f; actual delta=%f'
                         % (expect_delta, actual_delta)
                     )
                 raise errors.WrongAssumption(
                     'unable to reproduce the value gap: '
                     'expect delta=%f (%f%%); actual delta=%f (%f%%)'
                     % (
                         expect_delta,
                         abs(expect_delta / self.old_value) * 100,
                         actual_delta,
                         abs(actual_delta / old_avg) * 100,
                     )
                 )

         if not is_one_side_enough(self.rev_info[self.new_idx]):
             logger.debug('%s not enough', self.rev_info[self.new_idx].rev)
             return self.new_idx

         if not is_one_side_enough(self.rev_info[self.old_idx]):
             logger.debug('%s not enough', self.rev_info[self.old_idx].rev)
             return self.old_idx

         old_avg = math_util.average(self.rev_info[self.old_idx].averages())
         new_avg = math_util.average(self.rev_info[self.new_idx].averages())
         user_diff = self.new_value - self.old_value
         assert user_diff != 0

         # If the samples have the same (at least 1/2) trend as expected, we can
         # stop collect more samples.
         if trend_score(self.old_value, self.new_value, old_avg, new_avg) > 0.5:
             return None
         logger.debug('not distinguishable')

         # If the best case values do not help, it probably has no hope.
         if user_diff > 0:
             new_max = max(self.rev_info[self.new_idx].averages())
             old_min = min(self.rev_info[self.old_idx].averages())
             if (
                 trend_score(self.old_value, self.new_value, old_min, new_max)
                 < 0.5
             ):
                 check_trend_score(old_avg, new_avg)
                 return None
         else:
             new_min = min(self.rev_info[self.new_idx].averages())
             old_max = max(self.rev_info[self.old_idx].averages())
             if (
                 trend_score(self.old_value, self.new_value, old_max, new_min)
                 < 0.5
             ):
                 check_trend_score(old_avg, new_avg)
                 return None

         # If not obviously distinguishable, spend half hour to collect initial
         # samples.
         if self.rev_info[self.old_idx].eval_time < 1800:
             return self.old_idx
         if self.rev_info[self.new_idx].eval_time < 1800:
             return self.new_idx
         check_trend_score(old_avg, new_avg)
         return None

     def get_prob(self):
         if self.state == self.INITED:
             return None
         return self.prob

     def next_idx(self, cost_table=None):
         """Determines next index to binary search.

         Args:
           cost_table: list of cost estimation of candidates, optional. Each
                       estimation is a tuple (estimated_old, estimated_new) where
             estimated_old: estimated cost if this candidate has old behavior
             estimated_new: estimated cost if this candidate has new behavior

         Returns:
           Index to try for binary search. The returned index is optimized for
           minimal total cost.
         """
         if self.state == self.INITED:
             idx = self._next_idx_for_initial()
             assert idx is not None
             return idx

         assert self.state == self.STARTED
         assert self.prob
         return self._next_idx_by_utility(
             self.old_p, self.new_p, self.prob, cost_table
         )

     def renew_initial_count(self, status, idx):
         if idx == self.old_idx:
             if status == 'old':
                 self.count_revold_old += 1
             if status == 'new':
                 self.count_revold_new += 1

         if idx == self.new_idx:
             if status == 'old':
                 self.count_revnew_old += 1
             if status == 'new':
                 self.count_revnew_new += 1