blob: 9776d1b02e2d3a9a9c579852ac531a44fcbf1860 [file]
# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from collections import namedtuple
import copy
import itertools
import os
import sys
# Make the project's first_party directory importable, then let script_util
# set up the remaining project import paths before pulling in local modules.
_ROOT_DIR = os.path.realpath(
    os.path.join(os.path.dirname(__file__), os.path.pardir))
_FIRST_PARTY_DIR = os.path.join(_ROOT_DIR, 'first_party')
sys.path.insert(1, _FIRST_PARTY_DIR)

from local_libs import script_util
script_util.SetUpSystemPaths(_ROOT_DIR)

from app.common.model import triage_status
from scripts.delta_test import delta_test
from scripts import run_predator
# Aggregate counts produced by GradeModel: how many examples fell into each
# grading bucket.
SummaryStats = namedtuple('SummaryStats', [
    'total_examples',
    'true_positives',
    'true_negatives',
    'false_positives',
    'false_negatives',
    'untriaged',
    'unsure',
])
class SuspectEntry(object):
def __init__(self, crash, culprit):
self._crash = crash
self._culprit = culprit
@classmethod
def PrintMetrics(cls, grade_model_result, *_): # pragma: no cover.
print 'Total examples:', grade_model_result.total_examples
print 'True positives:', grade_model_result.true_positives
print 'False positives:', grade_model_result.false_positives
print 'True negatives:', grade_model_result.true_negatives
print 'False negatives', grade_model_result.false_negatives
if grade_model_result.unsure or grade_model_result.untriaged:
print '--------'
if grade_model_result.unsure:
print "%s unsure examples discarded" % grade_model_result.unsure
if grade_model_result.untriaged:
print "%s untriaged examples discarded" % grade_model_result.untriaged
print '--------'
print 'Metrics:'
print ' precision: %.2f%%' % Precision(grade_model_result)
print ' recall: %.2f%%' % Recall(grade_model_result)
print ' accuracy: %.2f%%' % Accuracy(grade_model_result)
print ' detection rate: %.2f%%' % DetectionRate(grade_model_result)
class SuspectCl(SuspectEntry):
@property
def suspects(self):
return self._culprit.cls
@property
def correct_results(self):
return self._crash.culprit_cls
@property
def triage_status(self):
return self._crash.suspected_cls_triage_status
def IsTruePositive(self, strict=False):
"""Determines if this Predator result is a true positive.
Args:
correct_cls (list of str): The URL of the correct CL for this example.
suspects (list of Suspect): The suspects produced by Predator. Must be
non-empty.
strict (bool): If strict is true, an example is considered to be a true
positive iff: the correct CL is among the suspects identified by
Predator, and Predator assigned it a confidence value greater than or
equal to that of any other suspect. Else if strict is false, an example
is considered to be true when the suspects is identified by Predator,
even it's not with the highest confidence score.
"""
# I'm assuming for now that there's only ever going to be one correct CL.
assert len(self.correct_results) == 1
correct_cl_url = self.correct_results[0]
max_confidence = max(suspect.confidence for suspect in self.suspects)
if strict:
suspect_urls = [suspect.changelog.commit_url
for suspect in self.suspects
if suspect.confidence == max_confidence]
else:
suspect_urls = [suspect.changelog.commit_url for suspect in self.suspects]
return any(CommitUrlEquals(correct_cl_url, suspect_url)
for suspect_url in suspect_urls)
@classmethod
def PrintMetrics(cls, grade_model_result,
suspect_entries): # pragma: no cover.
# pylint: disable=arguments-differ
super(SuspectCl, cls).PrintMetrics(grade_model_result)
print '--------'
print 'Maximum possible values of the metrics when using a confidence '
print 'threshold:'
precision_threshold, max_precision = MaximizeMetricWithThreshold(
suspect_entries, Precision)
print (' Max precision is %.2f%% with a confidence threshold of %f.'
% (max_precision, precision_threshold))
recall_threshold, max_recall = MaximizeMetricWithThreshold(
suspect_entries, Recall)
print (' Max recall is %.2f%% with a confidence threshold of %f.'
% (max_recall, recall_threshold))
accuracy_threshold, max_accuracy = MaximizeMetricWithThreshold(
suspect_entries, Accuracy)
print (' Max accuracy is %.2f%% with a confidence threshold of %f.'
% (max_accuracy, accuracy_threshold))
f_score_threshold, max_f_score = MaximizeMetricWithThreshold(
suspect_entries, FbetaScore)
print (' Max f-beta score is %.2f with a confidence threshold of %f.'
% (max_f_score, f_score_threshold))
detection_rate_threshold, max_detection_rate = MaximizeMetricWithThreshold(
suspect_entries, DetectionRate)
print (' Max detection rate is %.2f%% with a confidence threshold of %f.'
% (max_detection_rate, detection_rate_threshold))
class SuspectComponent(SuspectEntry):
  """SuspectEntry for suspected components (e.g. 'Blink>Layout')."""

  @property
  def suspects(self):
    # list of str: the components Predator blamed for this crash.
    return self._culprit.components

  @property
  def correct_results(self):
    # list of str: the components labelled as the true culprits.
    return self._crash.culprit_components

  @property
  def triage_status(self):
    return self._crash.suspected_components_triage_status

  def IsTruePositive(self, strict=False):
    """Determines if this Predator result is a true positive.

    Args:
      strict (bool): If strict is true, the suspected components must match
        the correct components exactly, including order. If strict is false,
        a single suspected component matching a correct component is enough.
    """
    if strict:
      return self.suspects == self.correct_results
    # Non-strict: a suspect also counts when it is an ancestor in the
    # component hierarchy, e.g. suspect 'A>B' matches correct 'A>B>C'.
    for suspect in self.suspects:
      for correct in self.correct_results:
        if correct == suspect or correct.startswith(suspect + '>'):
          return True
    return False
def GetSuspectEntryClass(suspect_type):
  """Maps a suspect type name to its SuspectEntry subclass.

  Args:
    suspect_type (str): Either 'cls' or 'components'.

  Returns:
    The matching SuspectEntry subclass, or None for an unknown type.
  """
  if suspect_type == 'cls':
    return SuspectCl
  elif suspect_type == 'components':
    return SuspectComponent
  else:
    return None
def RunModelOnTestSet(client_id, app_id, testset_path,
                      suspect_type):  # pragma: no cover
  """Runs Predator over a testset and wraps each result in a SuspectEntry.

  Args:
    client_id (CrashClient enum value): The id of the client doing the
      analysis. E.g. if the client is CrashClient.UMA_SAMPLING_PROFILER, get
      UMASamplingProfilerAnalysis entities and determine culprits using
      PredatorForUMASamplingProfiler.
    app_id (str): The id of the App Engine app to retrieve test cases from.
    testset_path (str): Path to the csv file storing the testset - i.e. a
      list of URLs of *Analysis entities. To generate a testset like this,
      use the update-testset.py script.
      # TODO(cweakliam): It would be better if we had triaged datasets saved
      # in the Datastore, and downloaded them from there instead.
    suspect_type (str): Either 'cls' or 'components'; selects which
      SuspectEntry subclass wraps each (crash, culprit) pair.

  Returns:
    List of SuspectEntry subclass instances, one per crash for which
    Predator produced a culprit (crashes with no culprit are dropped).
  """
  suspect_entry_class = GetSuspectEntryClass(suspect_type)
  crashes = delta_test.ReadCrashesFromCsvTestset(testset_path)
  culprits = run_predator.GetCulpritsOnRevision(crashes, 'HEAD', client_id,
                                                app_id)
  return [
      suspect_entry_class(crashes[crash_id], culprit)
      for crash_id, culprit in culprits.iteritems()
      if culprit is not None
  ]
def CommitUrlEquals(url1, url2):
  """Compares two commit URLs, treating '.git' in repo paths as optional."""
  def Normalize(url):
    # Some repo URLs contain '.git' and some don't; drop it before comparing.
    return url.replace('.git', '')
  return Normalize(url1) == Normalize(url2)
def GradeModel(suspect_entries, strict=False):
  """Grade the model's performance on a set of examples.

  Args:
    suspect_entries (iterable of SuspectEntry): A set of labelled examples,
      along with the result produced by Predator for each example.
    strict (bool): Passed through to SuspectEntry.IsTruePositive. When true,
      only the correct result appearing with top confidence counts as a true
      positive; when false, appearing among the suspects at all is enough.

  Returns:
    A SummaryStats object, detailing the result of grading the model (e.g.
    number of True Positives, False Positives etc.).
  """
  def Categorize(entry):
    """Returns the SummaryStats field this example should be counted in."""
    if entry.triage_status == triage_status.UNTRIAGED:
      return 'untriaged'
    if entry.triage_status == triage_status.TRIAGED_UNSURE:
      return 'unsure'
    if not entry.correct_results:
      # No correct answer exists: suspecting anything is a false positive,
      # suspecting nothing is a true negative.
      return 'false_positives' if entry.suspects else 'true_negatives'
    if not entry.suspects:
      return 'false_negatives'
    return ('true_positives' if entry.IsTruePositive(strict=strict)
            else 'false_positives')

  counts = {field: 0 for field in ('true_positives', 'true_negatives',
                                   'false_positives', 'false_negatives',
                                   'untriaged', 'unsure')}
  total_examples = 0
  for suspect_entry in suspect_entries:
    total_examples += 1
    counts[Categorize(suspect_entry)] += 1
  return SummaryStats(total_examples=total_examples, **counts)
def Percent(a, b):
  """Return the percentage of a wrt. b, or None if b == 0."""
  return None if b == 0 else (a / float(b)) * 100
# See https://en.wikipedia.org/wiki/Precision_and_recall for details on the
# following metrics.
def Precision(summary_stats):
  """The fraction of positive suggestions the model gives that are correct.

  Returns a percentage, or None when the model made no positive suggestions.
  """
  suggested = summary_stats.true_positives + summary_stats.false_positives
  if suggested == 0:
    return None
  return (summary_stats.true_positives / float(suggested)) * 100
def Recall(summary_stats):
  """The fraction of positive examples that the model gets right."""
  actual_positives = (summary_stats.true_positives +
                      summary_stats.false_negatives)
  return Percent(summary_stats.true_positives, actual_positives)
def Accuracy(summary_stats):
  """The fraction of all examples that the model gets right."""
  correct = summary_stats.true_positives + summary_stats.true_negatives
  incorrect = summary_stats.false_positives + summary_stats.false_negatives
  return Percent(correct, correct + incorrect)
def FbetaScore(summary_stats, beta=0.5):
  """A metric between 0 and 1 that balances Precision and Recall.

  Returns the metric you get if you consider Recall to be ``beta`` times as
  important as Precision.

  Args:
    summary_stats (SummaryStats): Output of GradeModel.
    beta (float): Relative importance of Recall versus Precision.

  Returns:
    The f-beta score, or None when the score is undefined (no positives of
    any kind). Previously this raised ZeroDivisionError in that case,
    inconsistent with the other metrics, which return None via Percent.
  """
  tp = summary_stats.true_positives
  fp = summary_stats.false_positives
  fn = summary_stats.false_negatives
  denominator = (1 + beta**2) * tp + beta**2 * fn + fp
  if denominator == 0:
    return None
  return ((1 + beta**2) * tp) / float(denominator)
def DetectionRate(summary_stats):
  """The fraction of all examples for which Predator gave some verdict.

  A 'detected' example is one where Predator provided some result, or where
  Predator found nothing and the example was supposed to have no result
  (a true negative). Only false negatives count as undetected.
  """
  detected = (summary_stats.true_positives +
              summary_stats.false_positives +
              summary_stats.true_negatives)
  total = detected + summary_stats.false_negatives
  if total == 0:
    return None
  return (detected / float(total)) * 100
def PrintMetrics(suspect_entries, suspect_type,
                 strict=False):  # pragma: no cover
  """Print a series of metrics to the user about the given examples."""
  suspect_entry_class = GetSuspectEntryClass(suspect_type)
  summary = GradeModel(suspect_entries, strict=strict)
  # Each SuspectEntry subclass knows which metrics are relevant to it.
  suspect_entry_class.PrintMetrics(summary, suspect_entries)
def MaximizeMetricWithThreshold(suspect_entries, metric, strict=False):
  """Find the confidence threshold that maximizes this metric on the examples.

  We may want to optimize for different metrics depending on the situation.
  For example, if we were sending email alerts to CL authors, we would want
  to maximize precision, to ensure we aren't sending alerts unless we're
  sure the author's CL is responsible. However if we were just surfacing
  results on a dashboard in an on-demand way, we would want to maximize for
  something like accuracy or recall.

  Args:
    suspect_entries (iterable of SuspectEntry): A set of labelled examples,
      along with the result produced by Predator for each example.
    metric (function: SummaryStats -> Number): The function to maximize.
    strict (bool): Passed through to GradeModel.

  Returns:
    (threshold, value)
    threshold (float): The confidence threshold that maximizes the value of
      this metric on these examples.
    value: The value of the metric given this threshold.
  """
  # Candidate thresholds: 0 (keep every suspect) plus every confidence score
  # that actually occurs in the data.
  candidate_thresholds = [0]
  for suspect_entry in suspect_entries:
    candidate_thresholds.extend(
        suspect.confidence for suspect in suspect_entry.suspects)

  def Evaluate(threshold):
    """Value of the metric after discarding suspects at/below threshold."""
    return metric(GradeWithThreshold(suspect_entries, threshold,
                                     strict=strict))

  scored = [(threshold, Evaluate(threshold))
            for threshold in candidate_thresholds]
  return max(scored, key=lambda pair: pair[1])
def GradeWithThreshold(suspect_entries, threshold, strict=False):
  """The result of GradeModel when using a confidence threshold.

  Args:
    suspect_entries (iterable of SuspectEntry): A set of labelled examples,
      along with the result produced by Predator for each example.
    threshold (float): The confidence threshold for considering a suspect,
      i.e. any suspect with confidence <= threshold is discarded before
      grading.
    strict (bool): Passed through to GradeModel.

  Returns:
    A SummaryStats: The result of calling GradeModel on the given examples,
    after suspects have been filtered according to the threshold.
  """
  # Deep-copy so that filtering does not mutate the caller's entries.
  entries = copy.deepcopy(suspect_entries)
  for entry in entries:
    kept_suspects = [suspect for suspect in entry.suspects
                     if suspect.confidence > threshold]
    # NOTE(review): this assumes the culprit namedtuple has a 'cls' field,
    # so it only works for CL suspects — confirm before using on components.
    entry._culprit = entry._culprit._replace(cls=kept_suspects)
  return GradeModel(entries, strict=strict)