blob: 7e258209394894b340761fbe29ec34066ec3c296 [file] [log] [blame]
// Copyright 2023 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package bayesian
import (
"math"
"go.chromium.org/luci/analysis/internal/changepoints/inputbuffer"
)
const (
// Confidence interval tail is the default tail value for confidence analysis.
// 0.005 tail gives 99% confidence interval.
ConfidenceIntervalTail = 0.005
)
// changePointPositionConfidenceInterval returns the (100% - (2 * tail))
// two-tailed confidence interval for a change point that occurs in the
// given slice of history.
// E.g. tail = 0.005 gives the 99% confidence interval.
// For details, please see go/bayesian-changepoint-estimation.
// In practice, after detecting a change point, we will run this function for
// the combination of the left and the right segments of the change points.
// For example, if there are 3 segments:
// [3, 9], [10, 15], [16, 21].
// To analyze the confidence interval around change point at position 16, we
// will run this function with the slice of history between 10 and 21.
func (a ChangepointPredictor) changePointPositionConfidenceInterval(history []inputbuffer.PositionVerdict, tail float64) (min int, max int) {
length := len(history)
if length == 0 {
panic("test history is empty")
}
// Stores the total for the entire history.
var total counts
for _, v := range history {
total = total.addVerdict(v)
}
// changePointLogLikelihoods stores the log-likelihood of the observing the
// given test verdict sequence.
// i.e. log(P(Y | C = k)),
// where Y is the observations, and "C = k" means there is a change point at
// position k.
// We only store the loglikelihood of positions that mark the start of a new
// commit, positions that are not the start of a new commit will be skipped.
// Store changePointIndices so we can map back to the original slice later.
changePointIndices := []int{}
changePointLogLikelihoods := []float64{}
// Create SequenceLikelihood to calculate the likelihood.
firstTrySL := NewSequenceLikelihood(a.HasUnexpectedPrior)
retrySL := NewSequenceLikelihood(a.UnexpectedAfterRetryPrior)
// left counts the statistics to the left of a position.
var left counts
// Advance past the first commit position.
i, pending := nextPosition(history, 0)
left = left.add(pending)
for i < length {
// right counts the statistics from [i:].
right := total.subtract(left)
leftLikelihood := firstTrySL.LogLikelihood(left.HasUnexpected, left.Runs) + retrySL.LogLikelihood(left.UnexpectedAfterRetry, left.Retried)
rightLikelihood := firstTrySL.LogLikelihood(right.HasUnexpected, right.Runs) + retrySL.LogLikelihood(right.UnexpectedAfterRetry, right.Retried)
// conditionalLikelihood is log(P(Y | C = i)).
conditionalLikelihood := leftLikelihood + rightLikelihood
changePointIndices = append(changePointIndices, i)
changePointLogLikelihoods = append(changePointLogLikelihoods, conditionalLikelihood)
// Advance to the next commit position.
nextIndex, pending := nextPosition(history, i)
left = left.add(pending)
i = nextIndex
}
// totalLogLikelihood calculates log(P(Y)) - log (1/D).
// where D is the length of changePointLogLikelihoods.
// P(Y) = SUM OVER k of P(C = k AND Y)
// = SUM OVER k of ((P(Y | C = k)*P(C = k))
// = SUM OVER k of ((P(Y | C = k)* 1/D)
// = 1/D * SUM OVER k of (P(Y | C = k)
// because we assume a uniform distribution for P(C = k).
// So
// log(P(Y)) = log(1/D * SUM OVER k of (P(Y | C = k))
// = log(1/D) + log(SUM OVER k of (P(Y | C = k)))
// = log(1/D) + totalLogLikelihood
totalLogLikelihood := AddLogLikelihoods(changePointLogLikelihoods)
// Stores the likelihood of P(C <= t | Y).
cumulativeLikelihood := 0.0
min = 0
max = len(changePointLogLikelihoods) - 1
for k := 0; k < len(changePointLogLikelihoods); k++ {
// Calculates P(C = k | Y).
// P (C = k | Y) = P(C = k AND Y) / P(Y)
// = P(Y | C = k) * P(C = k) / P(Y)
// = P(Y | C = k) * (1 / D) / P(Y)
// = exp (log (P(Y | C = k) * (1 / D) / P(Y)))
// = exp (log(P(Y | C = k) + log (1/D) - log (P(Y))
// = exp (changePointLogLikelihoods[k] + log (1/D) - log(P(Y))
// = exp (changePointLogLikelihoods[k] + log (1/D) - (log(1/D) + totalLogLikelihood))
// = exp (changePointLogLikelihoods[k] - totalLogLikelihood)
changePointLikelihood := math.Exp(changePointLogLikelihoods[k] - totalLogLikelihood)
cumulativeLikelihood += changePointLikelihood
if cumulativeLikelihood < tail {
min = k
}
if cumulativeLikelihood > (1.0-tail) && max == (len(changePointLogLikelihoods)-1) {
max = k
}
}
return changePointIndices[min], changePointIndices[max]
}
// ChangePoints runs change point detection and confidence
// interval analysis for history.
// history is sorted by commit position ascendingly (oldest commit first).
func (a ChangepointPredictor) ChangePoints(history []inputbuffer.PositionVerdict, tail float64) []inputbuffer.ChangePoint {
changePointIndices := a.identifyChangePoints(history)
// For simplicity, we add a fake index to the end.
ids := make([]int, len(changePointIndices)+1)
copy(ids, changePointIndices)
ids[len(ids)-1] = len(history)
result := []inputbuffer.ChangePoint{}
startIndex := 0
for i := 1; i < len(ids); i++ {
vds := history[startIndex:ids[i]]
// min and max needs to be offset by startIndex.
min, max := a.changePointPositionConfidenceInterval(vds, ConfidenceIntervalTail)
result = append(result, inputbuffer.ChangePoint{
NominalIndex: ids[i-1],
LowerBound99ThIndex: min + startIndex,
UpperBound99ThIndex: max + startIndex,
})
startIndex = ids[i-1]
}
return result
}