blob: 11f62c164ed40a31499b7bedf66abc7e979dfbbe [file] [log] [blame]
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""A simplified change-point detection algorithm.
Historically, the performance dashboard has used the GASP service for
detection. Brett Schein wrote a simplified version of this algorithm
for the dashboard in Matlab, and this was ported to Python by Dave Tu.
The general goal is to find any increase or decrease which is likely to
represent a real change in the underlying data source.
See: http://en.wikipedia.org/wiki/Step_detection
In 2019, we also integrate a successive bisection with combined Mann-Whitney
U-test and Kolmogorov-Smirnov tests to identify potential change points. This is
not exactly the E-divisive algorithm, but is close enough.
See: https://arxiv.org/abs/1306.4933
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import logging
from dashboard import find_step
from dashboard import ttest
from dashboard.common import math_utils
from dashboard.common import clustering_change_detector
from dashboard.common import defaults
class ChangePoint(
collections.namedtuple(
'ChangePoint',
(
# The x-value of the first point after a step.
'x_value',
# Median of the segments before and after the change point.
'median_before',
'median_after',
# Number of points before and after the change point.
'size_before',
'size_after',
# X-values of the first and last point in the series window used.
'window_start',
'window_end',
# Relative change from before to after.
'relative_change',
# Standard deviation of points before.
'std_dev_before',
# Results of the Welch's t-test for values before and after.
't_statistic',
'degrees_of_freedom',
'p_value',
# Extended possible change point range
'extended_start',
'extended_end',
))):
"""A ChangePoint represents a change in a series -- a potential alert."""
_slots = None
def AsDict(self):
"""Returns a dictionary mapping attributes to values."""
return self._asdict()
def FindChangePoints(series,
max_window_size=defaults.MAX_WINDOW_SIZE,
min_segment_size=defaults.MIN_SEGMENT_SIZE,
min_absolute_change=defaults.MIN_ABSOLUTE_CHANGE,
min_relative_change=defaults.MIN_RELATIVE_CHANGE,
min_steppiness=defaults.MIN_STEPPINESS,
multiple_of_std_dev=defaults.MULTIPLE_OF_STD_DEV,
rand=None):
"""Finds change points in the given series.
Only the last |max_window_size| points are examined, regardless of how many
points are passed in. The reason why it might make sense to limit the number
of points to look at is that if there are multiple change-points in the window
that's looked at, then this function will be less likely to find any of them.
This uses a clustering change detector (an approximation of E-divisive) in the
`clustering_change_detector` module.
Args:
series: A list of (x, y) pairs.
max_window_size: Number of points to analyze.
min_segment_size: Min size of segments before or after change point.
min_absolute_change: Absolute change threshold.
min_relative_change: Relative change threshold.
min_steppiness: Threshold for how similar to a step a change point must be.
multiple_of_std_dev: Threshold for change as multiple of std. deviation.
Returns:
A list of zero to many ChangePoint object(s).
"""
if len(series) < 2:
return [] # Not enough points to possibly contain a valid split point.
series = series[-max_window_size:]
_, y_values = list(zip(*series))
candidate_points = []
try:
candidate_points = clustering_change_detector.ClusterAndFindSplit(
y_values, rand=rand)
except clustering_change_detector.InsufficientData:
pass
def RevAndIdx(idx):
return ('rev:%s' % (series[idx][0],), 'idx:%s' % (idx,))
logging.info('E-Divisive candidate change-points: %s',
[RevAndIdx(idx) for idx, _ in candidate_points])
change_points = []
for point in reversed(sorted(candidate_points)):
passed_filter, reject_reason = _PassesThresholds(
y_values,
point[0],
min_segment_size=min_segment_size,
min_absolute_change=min_absolute_change,
min_relative_change=min_relative_change,
min_steppiness=min_steppiness,
multiple_of_std_dev=multiple_of_std_dev)
if passed_filter:
change_points.append(point)
else:
logging.info('Rejected %s as potential index (%s); reason = %s',
point, RevAndIdx(point[0]), reject_reason)
logging.info('E-Divisive potential change-points: %s',
[RevAndIdx(idx) for idx, _ in change_points])
return [MakeChangePoint(series, point) for point in change_points]
def MakeChangePoint(series, point):
"""Makes a ChangePoint object for the given series at the given point.
Args:
series: A list of (x, y) pairs.
split_index: Index of the first point after the split.
Returns:
A ChangePoint object.
"""
split_index, (lower, upper) = point
assert 0 <= split_index < len(series)
x_values, y_values = list(zip(*series))
left, right = y_values[:split_index], y_values[split_index:]
left_median, right_median = math_utils.Median(left), math_utils.Median(right)
ttest_results = ttest.WelchsTTest(left, right)
return ChangePoint(
x_value=x_values[split_index],
median_before=left_median,
median_after=right_median,
size_before=len(left),
size_after=len(right),
window_start=x_values[0],
window_end=x_values[-1], # inclusive bound
relative_change=math_utils.RelativeChange(left_median, right_median),
std_dev_before=math_utils.StandardDeviation(left),
t_statistic=ttest_results.t,
degrees_of_freedom=ttest_results.df,
p_value=ttest_results.p,
extended_start=x_values[lower],
extended_end=x_values[upper],
)
def _PassesThresholds(values, split_index, min_segment_size,
min_absolute_change, min_relative_change, min_steppiness,
multiple_of_std_dev):
"""Checks whether a point in a series appears to be an change point.
Args:
values: A list of numbers.
split_index: An index in the list of numbers.
min_segment_size: Threshold for size of segments before or after a point.
min_absolute_change: Minimum absolute median change threshold.
min_relative_change: Minimum relative median change threshold.
min_steppiness: Threshold for how similar to a step a change point must be.
multiple_of_std_dev: Threshold for change as multiple of std. deviation.
Returns:
A tuple of (bool, string) where the bool indicates whether the split index
passes the thresholds and the string being the reason it did not.
"""
left, right = values[:split_index], values[split_index:]
left_median, right_median = math_utils.Median(left), math_utils.Median(right)
# 1. Segment size filter.
if len(left) < min_segment_size or len(right) < min_segment_size:
return (False, 'min_segment_size')
# 2. Absolute change filter.
absolute_change = abs(left_median - right_median)
if absolute_change < min_absolute_change:
return (False, 'min_absolute_change')
# 3. Relative change filter.
relative_change = math_utils.RelativeChange(left_median, right_median)
if relative_change < min_relative_change:
return (False, 'min_relative_change')
# 4. Multiple of standard deviation filter.
min_std_dev = min(
math_utils.StandardDeviation(left), math_utils.StandardDeviation(right))
if absolute_change < multiple_of_std_dev * min_std_dev:
return (False, 'min_std_dev')
# 5. Steppiness filter.
steppiness = find_step.Steppiness(values, split_index)
if steppiness < min_steppiness:
return (False, 'min_steppiness')
# Passed all filters!
return (True, 'passed')