blob: f69a451e934a1e9300e0dfa882b8c2fd618d2210 [file] [log] [blame]
# Copyright (c) 2010 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import os
import sys
# Prepend the buildbot pylibs directory to our import path.
# NOTE(review): the sys.path manipulation this comment describes is not
# present below — it appears to have been lost; restore it if needed.
import fnmatch
import glob
import itertools
import math
import optparse
import re
import simplejson
import subprocess
# Script version reported by optparse's --version flag (see Main()).
__version__ = '1.0'
# Usage text passed to optparse's OptionParser; currently empty.
USAGE = ""
def GetSummaryFilelist(dir=None):
  """Finds all summary .dat files to clean up.

  Scans exactly one, two, and three path components below |dir| and returns
  every path whose basename matches '*-summary.dat'.

  Raises:
    Exception: if |dir| is not supplied or does not exist on disk.
  """
  if not dir:
    raise Exception("No directory supplied.")
  if not os.path.exists(dir):
    raise Exception("Directory does not exist.")
  # One glob per depth level, chained into a single candidate stream.
  candidates = itertools.chain(*[glob.iglob('%s/%s' % (dir, pattern))
                                 for pattern in ('*', '*/*', '*/*/*')])
  return [path for path in candidates
          if fnmatch.fnmatch(path, '*-summary.dat')]
def ReadJson(filename):
"""Read a JSON file and convert its contents into a Python datatype."""
file = open(filename, 'r')
except IOError, e:
print >> sys.stderr, ("I/O Error reading file %s(%s): %s" %
(filename, e.errno, e.strerror))
raise e
if not file:
return None
data = []
contents =
contentslist = contents.split("\n")
for jsontext in contentslist:
if jsontext is None or len(jsontext) == 0:
json = simplejson.loads(jsontext,
except ValueError, e:
print >> sys.stderr, ("Error parsing file %s: '%s'" %
(filename, jsontext))
raise e
return data
def ConvertDataIntoBuckets(perfid, test, graph, data):
  """Convert an array of JSON data into a groomed collection of perf data.

  For each item in the JSON data array, locate the '_ref' trace which indicates
  the reference build's data. Locate the current build's trace corresponding to
  the reference build. Store both values in new arrays within the groomed
  collection under a key composed of the perf system ID, the test ID, the graph
  ID, and the trace name.

  Assume that the data has stabilized recently and only keep the first 30
  values for each bucket."""
  # NOTE(review): the 'continue' statements, the tracekey assignment and the
  # current/ref append calls were truncated in the source under review and
  # have been reconstructed.
  buckets = {}
  for cl in data:
    for refkey in cl['traces'].keys():
      m = re.match(r'^(.*)_ref$', refkey)
      if not m:
        continue  # Not a reference-build trace.
      tracekey = m.group(1)
      if tracekey not in cl['traces']:
        # Sometimes the current build measure is missing due to a failure.
        continue
      perfkey = "%s/%s/%s/%s" % (perfid, test, graph, tracekey)
      bucket = buckets.setdefault(perfkey, {})
      # Get current build data.
      bucket.setdefault('current', [])
      current = float(cl['traces'][tracekey][0])
      bucket['current'].append(current)
      # Get reference build data.
      bucket.setdefault('ref', [])
      ref = float(cl['traces'][refkey][0])
      bucket['ref'].append(ref)
      # Get delta.
      bucket.setdefault('delta', [])
      bucket['delta'].append(current - ref)
      # Only keep the first 30 values for each bucket.
      bucket['current'] = bucket['current'][:30]
      bucket['ref'] = bucket['ref'][:30]
      bucket['delta'] = bucket['delta'][:30]
  return buckets
def GetDeltas(data, value=None):
  """Returns a list composed by subtracting each item by a given value.

  If the value is not given or is None, the previous item is used and the
  returned list will be one item less than the given list."""
  # NOTE(review): the loop body was truncated in the source under review;
  # it is reconstructed here to implement exactly what the docstring states.
  deltas = []
  for i in range(len(data)):
    if value is None:
      if i == 0:
        continue  # No previous item to diff against.
      deltas.append(data[i] - data[i - 1])
    else:
      deltas.append(data[i] - value)
  return deltas
def GetAbsoluteValueList(data):
  """Returns a list composed of the absolute value of each item."""
  # NOTE(review): the append line was truncated in the source under review;
  # rebuilt as a comprehension.
  return [abs(item) for item in data]
def GetNormalizedList(data, base_value):
  """Returns a list composed by dividing each item over the given base value.

  If the base value is less than 0.0001, it is adjusted to 0.0001 to avoid
  division by zero."""
  # Clamp tiny (or negative) bases so the division below is always safe.
  divisor = float(base_value) if base_value >= 0.0001 else 0.0001
  return [item / divisor for item in data]
def GetMedian(data):
  """Returns the median for a list of data, or None for an empty list.

  For an even-length list the median is the mean of the two middle values."""
  if len(data) == 0:
    return None
  sorted_data = sorted(data)
  # Use floor division: '/' on two ints floors on Python 2 but produces a
  # float (an invalid list index) on Python 3; '//' is identical on both.
  middle = len(sorted_data) // 2
  if len(sorted_data) % 2 == 0:
    high = sorted_data[middle]
    low = sorted_data[middle - 1]
    return (float(high) + float(low)) / 2
  return sorted_data[middle]
def RemoveNoise(data, max_to_remove=3):
  """Removes noise from a list by identifying potentially unusual values.

  This is a simple filter. The process is:
  1. Take the absolute deltas of each value from the list's mean.
  2. Normalize the deltas list over the median delta.
  3. For each normalized delta, if that value is greater than 3, mark that
     position, value, and delta as a candidate for removal.

  The default number of noisy values to remove is 3 unless otherwise
  overridden. This method sorts the candidates by the delta descending, then
  reduces the candidate list to the maximum number allowed. The candidate
  values are then removed from the input list in place.

  Returns the (mutated) input list."""
  mean = sum(data) / len(data)
  deltas = GetAbsoluteValueList(GetDeltas(data, mean))
  normalized_deltas = GetNormalizedList(deltas, GetMedian(deltas))
  candidates = []
  for i in range(len(normalized_deltas)):
    if normalized_deltas[i] > 3:
      candidates.append({'position': i,
                         'value': data[i],
                         'delta': normalized_deltas[i]})
  # Sort by delta descending, then drop all but max_to_remove elements.
  # (key= sorts replace the Python-2-only cmp= form; same ordering.)
  candidates.sort(key=lambda c: c['delta'], reverse=True)
  candidates = candidates[:max_to_remove]
  # Sort by position descending, then pop those items from the list, so
  # earlier pops cannot shift the positions still to be removed.
  candidates.sort(key=lambda c: c['position'], reverse=True)
  # NOTE(review): the loop body was truncated in the source under review;
  # removal by stored position is the reconstruction consistent with the
  # position-descending sort above.
  for candidate in candidates:
    data.pop(candidate['position'])
  return data
def CalculateDeltaAndVar(perfkey, buckets, expectations):
  """Calculate the delta and variance for a bucket.

  Store the resulting delta and variance in the expectations dictionary.

  Raises:
    Exception: if the computed variance is negative."""
  data = buckets[perfkey]['delta']
  # The bucket's delta is the midpoint of the observed extremes (each
  # rounded first), truncated to an int.
  delta_max = round(max(data), 0)
  delta_min = round(min(data), 0)
  delta = int((delta_max + delta_min) / 2.0)
  # Estimate the noise level as triple the gap between median and mean.
  # Also see RemoveNoise().
  mean = sum(data) / len(data)
  median = GetMedian(data)
  noise_buffer = abs((median - mean) * 3)
  # Max allowed point: distance from the midpoint delta to the max delta,
  # padded by the noise buffer.
  max_allowed_point = abs(delta_max - delta + noise_buffer)
  # The variance is the max allowed point divided by 1.5 (the amount the
  # variance is multiplied by in the Buildbot).
  var = int(round(max_allowed_point / 1.5 + 1, 0))
  if var < 0:
    raise Exception("variance for %s is negative" % perfkey)
  expectations[perfkey] = {'delta': delta, 'var': var}
def WriteJson(filename, data):
"""Write a list of hashes in |data| to the file specified in |filename|."""
file = open(filename, 'w')
except IOError, e:
print >> sys.stderr, ("I/O Error writing file %s(%s): %s" %
(filename, e.errno, e.strerror))
if file:
contentslist = []
keys = data.keys()
for json in keys:
delta = '{"delta": %s, "var": %s}' % (data[json]['delta'],
contentslist.append(' "%s": %s,' % (json, delta))
contentslist.append(' "load": true')
contents = "\n".join(contentslist) + "\n"
return True
def IsWhitelistedPath(path):
  """Check path against a whitelisted set of regular expressions.

  Returns True if any whitelist pattern matches the start of |path|."""
  # NOTE(review): the tail of this list was truncated in the source under
  # review; any additional whitelist patterns must be restored by hand.
  whitelist = [r'^(.*/)?([^/]+)/([^/]+)/times-summary.dat',
              ]
  for match in whitelist:
    if re.match(match, path):
      return True
  return False
def IsBlacklistedPath(path):
  """Check path against a blacklisted set of regular expressions.

  Returns True if any blacklist pattern matches the start of |path|."""
  # NOTE(review): the tail of this list was truncated in the source under
  # review; any additional blacklist patterns must be restored by hand.
  blacklist = [r'^(.*/)?linux-release/([^/]+)/([^/]+)-summary.dat',
              ]
  for match in blacklist:
    if re.match(match, path):
      return True
  return False
def Main(args):
parser = optparse.OptionParser(usage=USAGE, version=__version__)
options, args = parser.parse_args(args)
# Get the given directories the user wants to work in.
options.dir = []
if len(args) > 1:
# If no directories are given, assume the current working directory.
if len(options.dir) == 0:
expectations = {}
for dir in options.dir:
for filename in GetSummaryFilelist(dir=dir):
if not IsWhitelistedPath(filename) or IsBlacklistedPath(filename):
# Get the perfid, test, and graph from filename.
print filename
m = re.match(r'^(.*/)?([^/]+)/([^/]+)/([^/]+)-summary.dat', filename)
if not m:
raise Exception('%s did not match expected format' % filename)
perfid =
test =
graph =
# Only look at the first 200 lines since some dat files contain old trace
# names we can safely skip.
jsondata = ReadJson(filename)[:200]
buckets = ConvertDataIntoBuckets(perfid, test, graph, jsondata)
for perfkey in buckets:
CalculateDeltaAndVar(perfkey, buckets, expectations)
WriteJson('perf_expectations.json', expectations)
return 0
if __name__ == '__main__':