# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""The database model for an "Anomaly", which represents a step up or down."""
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import logging
import sys
import time

from google.appengine.ext import ndb

from dashboard.common import timing
from dashboard.common import utils
from dashboard.common import datastore_hooks
from dashboard.models import internal_only_model
from dashboard.models import subscription

# A string to describe the magnitude of a change from zero to non-zero.
FREAKIN_HUGE = 'zero-to-nonzero'

# Possible improvement directions for a change. An Anomaly will always have a
# direction of UP or DOWN, but a test's improvement direction can be UNKNOWN.
UP, DOWN, UNKNOWN = (0, 1, 4)


class Issue(ndb.Model):
project_id = ndb.StringProperty(default='chromium', indexed=True)
issue_id = ndb.IntegerProperty(required=True, indexed=True)


class Anomaly(internal_only_model.InternalOnlyModel):
"""Represents a change-point or step found in the data series for a test.

  An Anomaly can be an upward or downward change, and can represent an
improvement or a regression.
"""
# Whether the alert should only be viewable by internal users.
internal_only = ndb.BooleanProperty(indexed=True, default=False)
# The time the alert fired.
timestamp = ndb.DateTimeProperty(indexed=True, auto_now_add=True)
# TODO(dberris): Remove these after migrating all issues to use the issues
# repeated field, to allow an anomaly to be represented in multiple issues on
# different Monorail projects.
# === DEPRECATED START ===
# Note: -1 denotes an invalid alert and -2 an ignored alert.
# By default, this is None, which denotes a non-triaged alert.
bug_id = ndb.IntegerProperty(indexed=True)
  # The project with which an anomaly is associated in the issue tracker
  # service.
project_id = ndb.StringProperty(indexed=True, default='chromium')
# === DEPRECATED END ===
  # Keys of the AlertGroups used for grouping this anomaly.
groups = ndb.KeyProperty(indexed=True, repeated=True)
# This is the list of issues associated with the anomaly. We're doing this to
# allow a single anomaly to be represented in multiple issues in different
# issue trackers.
issues = ndb.StructuredProperty(Issue, indexed=True, repeated=True)
  # This field is intended to replace the 'bug_id' field as a state indicator.
state = ndb.StringProperty(
default='untriaged',
choices=['untriaged', 'triaged', 'ignored', 'invalid'])
  # The subscribers who receive alerts.
subscriptions = ndb.LocalStructuredProperty(
subscription.Subscription, repeated=True)
subscription_names = ndb.StringProperty(indexed=True, repeated=True)
# The anomaly configuration used to generate this anomaly, associated with the
# subscription.
anomaly_config = ndb.JsonProperty()
# Each Alert is related to one Test.
test = ndb.KeyProperty(indexed=True)
statistic = ndb.StringProperty(indexed=True)
# We'd like to be able to query Alerts by Master, Bot, and Benchmark names.
master_name = ndb.ComputedProperty(
lambda self: utils.TestPath(self.test).split('/')[0], indexed=True)
bot_name = ndb.ComputedProperty(
lambda self: utils.TestPath(self.test).split('/')[1], indexed=True)
benchmark_name = ndb.ComputedProperty(
lambda self: utils.TestPath(self.test).split('/')[2], indexed=True)
# Each Alert has a revision range it's associated with; however,
# start_revision and end_revision could be the same.
start_revision = ndb.IntegerProperty(indexed=True)
end_revision = ndb.IntegerProperty(indexed=True)
# The revisions to use for display, if different than point id.
display_start = ndb.IntegerProperty(indexed=False)
display_end = ndb.IntegerProperty(indexed=False)
  # Ownership data: the benchmark owners' e-mail addresses and the
  # benchmark's Monorail component.
ownership = ndb.JsonProperty()
  # Alert grouping is used to override the default alert group (test suite)
# for auto-triage.
alert_grouping = ndb.StringProperty(indexed=False, repeated=True)
# The number of points before and after this anomaly that were looked at
# when finding this anomaly.
segment_size_before = ndb.IntegerProperty(indexed=False)
segment_size_after = ndb.IntegerProperty(indexed=False)
# The medians of the segments before and after the anomaly.
median_before_anomaly = ndb.FloatProperty(indexed=False)
median_after_anomaly = ndb.FloatProperty(indexed=False)
# The standard deviation of the segments before the anomaly.
std_dev_before_anomaly = ndb.FloatProperty(indexed=False)
  # The revision at the end of the window of points examined when finding this
  # anomaly. This is also returned by FindAnomalies.
window_end_revision = ndb.IntegerProperty(indexed=False)
  # In order to estimate how likely it is that this anomaly is due to noise, a
  # t-test may be performed on the points before and after. The t-statistic,
  # degrees of freedom, and p-value are potentially useful intermediate
  # results.
t_statistic = ndb.FloatProperty(indexed=False)
degrees_of_freedom = ndb.FloatProperty(indexed=False)
p_value = ndb.FloatProperty(indexed=False)
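  # A minimal sketch of how such a t-test could be computed (an illustration
  # only, not the production analysis; `values_before` and `values_after` are
  # hypothetical sample lists), using Welch's t-test:
  #   from scipy import stats
  #   t, p = stats.ttest_ind(values_before, values_after, equal_var=False)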
# Whether this anomaly represents an improvement; if false, this anomaly is
# considered to be a regression.
is_improvement = ndb.BooleanProperty(indexed=True, default=False)
  # Whether this anomaly recovered (i.e. if this is a step down, whether there
  # is a corresponding step up later on, or vice versa).
recovered = ndb.BooleanProperty(indexed=True, default=False)
# If the TestMetadata alerted upon has a ref build, store the ref build.
ref_test = ndb.KeyProperty(indexed=False)
# The corresponding units from the TestMetaData entity.
units = ndb.StringProperty(indexed=False)
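  # Bisect jobs associated with this anomaly: keys of legacy recipe-based
  # bisect jobs and Pinpoint job IDs, respectively.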
recipe_bisects = ndb.KeyProperty(repeated=True, indexed=False)
pinpoint_bisects = ndb.StringProperty(repeated=True, indexed=False)
# Additional Metadata
# ====
#
# Timestamps for the earliest and latest Row we used to determine whether this
# is an anomaly. We use this to compute time-to-detection.
earliest_input_timestamp = ndb.DateTimeProperty()
latest_input_timestamp = ndb.DateTimeProperty()

  @property
def percent_changed(self):
"""The percent change from before the anomaly to after."""
if self.median_before_anomaly == 0.0:
return sys.float_info.max
difference = self.median_after_anomaly - self.median_before_anomaly
return 100 * difference / self.median_before_anomaly

  @property
def absolute_delta(self):
"""The absolute change from before the anomaly to after."""
return self.median_after_anomaly - self.median_before_anomaly

  @property
def direction(self):
"""Whether the change is numerically an increase or decrease."""
if self.median_before_anomaly < self.median_after_anomaly:
return UP
return DOWN

  def GetDisplayPercentChanged(self):
"""Gets a string showing the percent change."""
if abs(self.percent_changed) == sys.float_info.max:
return FREAKIN_HUGE
else:
return '%.1f%%' % abs(self.percent_changed)

  def GetDisplayAbsoluteChanged(self):
"""Gets a string showing the absolute change."""
if abs(self.absolute_delta) == sys.float_info.max:
return FREAKIN_HUGE
else:
return '%f' % abs(self.absolute_delta)

  def GetRefTestPath(self):
if not self.ref_test:
return None
return utils.TestPath(self.ref_test)

  def SetIsImprovement(self, test=None):
"""Sets whether the alert is an improvement for the given test."""
if not test:
test = self.GetTestMetadataKey().get()
# |self.direction| is never equal to |UNKNOWN| (see the definition above)
# so when the test improvement direction is |UNKNOWN|, |self.is_improvement|
# will be False.
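    # For example, a DOWN change on a test whose improvement_direction is DOWN
    # (such as a latency metric decreasing) results in is_improvement=True.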
self.is_improvement = (self.direction == test.improvement_direction)

  def GetTestMetadataKey(self):
"""Get the key for the TestMetadata entity of this alert.

    We are in the process of converting from Test entities to TestMetadata.
Until this is done, it's possible that an alert may store either Test
or TestMetadata in the 'test' KeyProperty. This gets the TestMetadata key
regardless of what's stored.
"""
return utils.TestMetadataKey(self.test)

  @classmethod
@ndb.tasklet
def QueryAsync(cls,
bot_name=None,
bug_id=None,
count_limit=0,
deadline_seconds=50,
inequality_property=None,
is_improvement=None,
key=None,
keys_only=False,
limit=100,
master_name=None,
max_end_revision=None,
max_start_revision=None,
max_timestamp=None,
min_end_revision=None,
min_start_revision=None,
min_timestamp=None,
recovered=None,
subscriptions=None,
start_cursor=None,
test=None,
test_keys=None,
test_suite_name=None,
project_id=None):
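    """Queries for anomalies, chasing cursors until results are found.

    Returns, via ndb.Return, a tuple of (results, start_cursor, count).

    A minimal usage sketch (the argument values here are hypothetical):
      anomalies, cursor, count = Anomaly.QueryAsync(
          test_suite_name='system_health', limit=50).get_result()
    """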
if key:
# This tasklet isn't allowed to catch the internal_only AssertionError.
alert = yield ndb.Key(urlsafe=key).get_async()
raise ndb.Return(([alert], None, 1))
# post_filters can cause results to be empty, depending on the shape of the
# data and which filters are applied in the query and which filters are
# applied after the query. Automatically chase cursors until some results
# are found, but stay under the request timeout.
results = []
deadline = time.time() + deadline_seconds
while not results and time.time() < deadline:
query = cls.query()
equality_properties = []
      # An empty subscriptions list is not allowed in a query.
      if subscriptions:
query = query.filter(cls.subscription_names.IN(subscriptions))
equality_properties.append('subscription_names')
inequality_property = 'key'
if is_improvement is not None:
query = query.filter(cls.is_improvement == is_improvement)
equality_properties.append('is_improvement')
inequality_property = 'key'
if bug_id is not None:
if bug_id == '':
query = query.filter(cls.bug_id == None)
equality_properties.append('bug_id')
inequality_property = 'key'
elif bug_id != '*':
query = query.filter(cls.bug_id == int(bug_id))
equality_properties.append('bug_id')
inequality_property = 'key'
# bug_id='*' translates to bug_id != None, which is handled with the
# other inequality filters.
if recovered is not None:
query = query.filter(cls.recovered == recovered)
equality_properties.append('recovered')
inequality_property = 'key'
if test or test_keys:
if not test_keys:
test_keys = []
if test:
test_keys += [
utils.OldStyleTestKey(test),
utils.TestMetadataKey(test)
]
query = query.filter(cls.test.IN(test_keys))
query = query.order(cls.key)
equality_properties.append('test')
inequality_property = 'key'
if master_name:
query = query.filter(cls.master_name == master_name)
equality_properties.append('master_name')
inequality_property = 'key'
if bot_name:
query = query.filter(cls.bot_name == bot_name)
equality_properties.append('bot_name')
inequality_property = 'key'
if test_suite_name:
query = query.filter(cls.benchmark_name == test_suite_name)
equality_properties.append('benchmark_name')
inequality_property = 'key'
query, post_filters = cls._InequalityFilters(
query, equality_properties, inequality_property, bug_id,
min_end_revision, max_end_revision, min_start_revision,
max_start_revision, min_timestamp, max_timestamp)
if post_filters:
keys_only = False
query = query.order(-cls.timestamp, cls.key)
futures = [
query.fetch_page_async(
limit, start_cursor=start_cursor, keys_only=keys_only)
]
if count_limit:
futures.append(query.count_async(count_limit))
query_duration = timing.WallTimeLogger('query_duration')
with query_duration:
yield futures
results, start_cursor, more = futures[0].get_result()
if count_limit:
count = futures[1].get_result()
else:
count = len(results)
logging.info('query_results_count=%d', len(results))
if results:
logging.info('duration_per_result=%f',
query_duration.seconds / len(results))
if post_filters:
results = [
alert for alert in results if all(
post_filter(alert) for post_filter in post_filters)
]
      # Temporarily treat project_id as a post_filter, because some chromium
      # alerts have been recorded with an empty project_id.
if project_id is not None:
results = [
alert for alert in results if alert.project_id == project_id
or alert.project_id == '' and project_id == 'chromium'
]
if not more:
start_cursor = None
if not start_cursor:
break
raise ndb.Return((results, start_cursor, count))

  @classmethod
def _InequalityFilters(cls, query, equality_properties, inequality_property,
bug_id, min_end_revision, max_end_revision,
min_start_revision, max_start_revision, min_timestamp,
max_timestamp):
# A query cannot have more than one inequality filter.
    # inequality_property allows callers to decide which property to filter on
    # in the query, which can significantly affect performance. If other
    # inequalities are specified, they will be handled by post_filters.
# If callers set inequality_property without actually specifying a
# corresponding inequality filter, then reset the inequality_property and
# compute it automatically as if it were not specified.
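    # For example (hypothetical values): with inequality_property='timestamp'
    # and both min_timestamp and min_start_revision set, the timestamp bound
    # becomes a Datastore filter while the start_revision bound is applied in
    # Python via post_filters.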
if inequality_property == 'start_revision':
if min_start_revision is None and max_start_revision is None:
inequality_property = None
elif inequality_property == 'end_revision':
if min_end_revision is None and max_end_revision is None:
inequality_property = None
elif inequality_property == 'timestamp':
if min_timestamp is None and max_timestamp is None:
inequality_property = None
elif inequality_property == 'bug_id':
if bug_id != '*':
inequality_property = None
elif inequality_property == 'key':
if equality_properties == [
'subscription_names'
] and (min_start_revision or max_start_revision):
# Use the composite index (subscription_names, start_revision,
# -timestamp). See index.yaml.
inequality_property = 'start_revision'
else:
inequality_property = None
if inequality_property is None:
# Compute a default inequality_property.
      # We prioritise the 'min' filters because that lets us limit the amount
      # of data the Datastore instances might handle.
if min_start_revision:
inequality_property = 'start_revision'
elif min_end_revision:
inequality_property = 'end_revision'
elif min_timestamp:
inequality_property = 'timestamp'
elif max_start_revision:
inequality_property = 'start_revision'
elif max_end_revision:
inequality_property = 'end_revision'
elif max_timestamp:
inequality_property = 'timestamp'
elif bug_id == '*':
inequality_property = 'bug_id'
post_filters = []
if not inequality_property:
return query, post_filters
if not datastore_hooks.IsUnalteredQueryPermitted():
# _DatastorePreHook will filter internal_only=False. index.yaml does not
# specify indexes for `internal_only, $inequality_property, -timestamp`.
# Use post_filters for all inequality properties.
inequality_property = ''
if bug_id == '*':
if inequality_property == 'bug_id':
logging.info('filter:bug_id!=None')
query = query.filter(cls.bug_id != None).order(cls.bug_id)
else:
logging.info('post_filter:bug_id!=None')
post_filters.append(lambda a: a.bug_id != None)
    # Apply the min filters before the max filters, because that lets us
    # optimise the query for more recent data and reduce the amount of
    # post-processing needed.
if min_start_revision:
min_start_revision = int(min_start_revision)
if inequality_property == 'start_revision':
logging.info('filter:min_start_revision=%d', min_start_revision)
query = query.filter(cls.start_revision >= min_start_revision)
query = query.order(cls.start_revision)
else:
logging.info('post_filter:min_start_revision=%d', min_start_revision)
post_filters.append(lambda a: a.start_revision >= min_start_revision)
if min_end_revision:
min_end_revision = int(min_end_revision)
if inequality_property == 'end_revision':
logging.info('filter:min_end_revision=%d', min_end_revision)
query = query.filter(cls.end_revision >= min_end_revision)
query = query.order(cls.end_revision)
else:
logging.info('post_filter:min_end_revision=%d', min_end_revision)
post_filters.append(lambda a: a.end_revision >= min_end_revision)
if min_timestamp:
if inequality_property == 'timestamp':
logging.info('filter:min_timestamp=%d',
time.mktime(min_timestamp.utctimetuple()))
query = query.filter(cls.timestamp >= min_timestamp)
else:
logging.info('post_filter:min_timestamp=%d',
time.mktime(min_timestamp.utctimetuple()))
post_filters.append(lambda a: a.timestamp >= min_timestamp)
if max_start_revision:
max_start_revision = int(max_start_revision)
if inequality_property == 'start_revision':
logging.info('filter:max_start_revision=%d', max_start_revision)
query = query.filter(cls.start_revision <= max_start_revision)
query = query.order(-cls.start_revision)
else:
logging.info('post_filter:max_start_revision=%d', max_start_revision)
post_filters.append(lambda a: a.start_revision <= max_start_revision)
if max_end_revision:
max_end_revision = int(max_end_revision)
if inequality_property == 'end_revision':
logging.info('filter:max_end_revision=%d', max_end_revision)
query = query.filter(cls.end_revision <= max_end_revision)
query = query.order(-cls.end_revision)
else:
logging.info('post_filter:max_end_revision=%d', max_end_revision)
post_filters.append(lambda a: a.end_revision <= max_end_revision)
if max_timestamp:
if inequality_property == 'timestamp':
logging.info('filter:max_timestamp=%d',
time.mktime(max_timestamp.utctimetuple()))
query = query.filter(cls.timestamp <= max_timestamp)
else:
logging.info('post_filter:max_timestamp=%d',
time.mktime(max_timestamp.utctimetuple()))
post_filters.append(lambda a: a.timestamp <= max_timestamp)
return query, post_filters