| #!/usr/bin/python |
| # |
| # Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| import os |
| import sys |
| |
# Prepend the buildbot pylibs directory to our import path.
sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                '../../buildbot/pylibs'))
| |
import fnmatch
import glob
import itertools
import optparse
import re
import simplejson
| |
| __version__ = '1.0' |
| USAGE = "" |
| |
| |
def GetSummaryFilelist(dir=None):
  """Finds all summary .dat files to process under |dir|, up to three levels
  deep."""
  if not dir:
    raise Exception("No directory supplied.")
  if not os.path.exists(dir):
    raise Exception("Directory %s does not exist." % dir)
| files = itertools.chain(glob.iglob('%s/*' % dir), |
| glob.iglob('%s/*/*' % dir), |
| glob.iglob('%s/*/*/*' % dir)) |
| summaries = [summary for summary in files |
| if fnmatch.fnmatch(summary, '*-summary.dat')] |
| summaries.sort() |
| return summaries |
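# A minimal usage sketch (hypothetical layout): for a tree such as
#   perf/xp-release/moz/times-summary.dat
# GetSummaryFilelist(dir='perf') returns
# ['perf/xp-release/moz/times-summary.dat'].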
| |
| |
def ReadJson(filename):
  """Reads a JSON file and converts its contents into Python datatypes.

  The file is expected to hold one JSON object per line; returns a list with
  one OrderedDict per non-empty line."""
  try:
    file = open(filename, 'r')
  except IOError, e:
    print >> sys.stderr, ("I/O Error reading file %s(%s): %s" %
                          (filename, e.errno, e.strerror))
    raise e

  data = []
  contents = file.read()
  contentslist = contents.split("\n")
  for jsontext in contentslist:
    if not jsontext:
      continue
    try:
      json = simplejson.loads(jsontext,
                              object_pairs_hook=simplejson.OrderedDict)
    except ValueError, e:
      print >> sys.stderr, ("Error parsing file %s: '%s'" %
                            (filename, jsontext))
      raise e
    data.append(json)
  file.close()
  return data
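# A sketch of the expected input (hypothetical field values): each line of a
# summary .dat file is a standalone JSON object, e.g.
#   {"traces": {"t": ["10.0", "0.5"], "t_ref": ["9.0", "0.5"]}, "rev": "1000"}
# so ReadJson() returns one OrderedDict per non-empty line.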
| |
| |
| def ConvertDataIntoBuckets(perfid, test, graph, data): |
| """Convert an array of JSON data into a groomed collection of perf data. |
| |
| For each item in the JSON data array, locate the '_ref' trace which indicates |
| the reference build's data. Locate the current build's trace corresponding to |
| the reference build. Store both values in new arrays within the groomed |
| collection under a key composed of the perf system ID, the test ID, the graph |
| ID, and the trace name. |
| |
| Assume that the data has stabilized recently and only keep the first 30 |
| values for each bucket.""" |
| buckets = {} |
| for cl in data: |
| keys = cl['traces'].keys() |
| for refkey in keys: |
| m = re.match(r'^(.*)_ref$', refkey) |
| if not m: |
| continue |
| tracekey = m.group(1) |
| if tracekey not in cl['traces']: |
| # Sometimes the current build measure is missing due to a failure. |
| continue |
| |
| perfkey = "%s/%s/%s/%s" % (perfid, test, graph, tracekey) |
| buckets.setdefault(perfkey, {}) |
| |
| # Get current build data. |
| buckets[perfkey].setdefault('current', []) |
| current = float(cl['traces'][tracekey][0]) |
| buckets[perfkey]['current'].append(current) |
| |
| # Get reference build data. |
| buckets[perfkey].setdefault('ref', []) |
| ref = float(cl['traces'][refkey][0]) |
| buckets[perfkey]['ref'].append(ref) |
| |
| # Get delta. |
| buckets[perfkey].setdefault('delta', []) |
| buckets[perfkey]['delta'].append(current - ref) |
| |
| # Only keep the first 30 values for each bucket. |
| buckets[perfkey]['current'] = buckets[perfkey]['current'][:30] |
| buckets[perfkey]['ref'] = buckets[perfkey]['ref'][:30] |
| buckets[perfkey]['delta'] = buckets[perfkey]['delta'][:30] |
| return buckets |
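# A worked sketch (hypothetical trace name 't'):
#   data = [{'traces': {'t': ['10.0', '0.5'], 't_ref': ['9.0', '0.5']}}]
#   ConvertDataIntoBuckets('xp-release', 'moz', 'times', data)
# returns
#   {'xp-release/moz/times/t': {'current': [10.0], 'ref': [9.0],
#                               'delta': [1.0]}}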
| |
| |
def GetDeltas(data, value=None):
  """Returns a list composed by subtracting a given value from each item.

  If the value is not given or is None, each item's predecessor is subtracted
  instead, and the returned list is one item shorter than the given list."""
  deltas = []
  for i in range(len(data)):
    if value is None:
      if i == 0:
        continue
      deltas.append(data[i] - data[i-1])
    else:
      deltas.append(data[i] - value)
  return deltas
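# Examples:
#   GetDeltas([1, 4, 9])    -> [3, 5]   (each item minus its predecessor)
#   GetDeltas([1, 4, 9], 2) -> [-1, 2, 7]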
| |
| |
def GetAbsoluteValueList(data):
  """Returns a list composed of the absolute value of each item."""
  return [abs(item) for item in data]
| |
| |
def GetNormalizedList(data, base_value):
  """Returns a list composed by dividing each item by the given base value.

  If the base value is less than 0.0001, it is adjusted to 0.0001 to avoid
  division by zero."""
  base_value = max(base_value, 0.0001)
  return [item / float(base_value) for item in data]
| |
| |
def GetMedian(data):
  """Returns the median for a list of data, or None for an empty list."""
  if not data:
    return None
  sorted_data = sorted(data)
  midpoint = len(sorted_data) / 2
  if len(sorted_data) % 2 == 0:
    return (float(sorted_data[midpoint]) + float(sorted_data[midpoint-1])) / 2
  return sorted_data[midpoint]
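# Examples:
#   GetMedian([3, 1, 2])    -> 2
#   GetMedian([4, 1, 3, 2]) -> 2.5   (mean of the two middle values)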
| |
| |
| def RemoveNoise(data, max_to_remove=3): |
| """Removes noise from a list by identifying potentially unusual values. |
| |
| This is a simple filter. The process is: |
| 1. Calculate the list's b coefficient using the least fit squares algorithm. |
| 2. Create a list of deltas using the b coefficient and take its absolute |
| value. |
| 3. Normalize the deltas list over the median value. |
| 4. For each normalized delta, if that value is greater than 5, mark that |
| position, value, and delta as a candidate for removal. |
| |
| The default number of noisy values to remove is 5 unless otherwise overridden. |
| This method sorts the candidates by the delta descending, then reduces the |
| candidate list to the maximum number allowed. The candidate values are |
| then removed from the input list.""" |
| mean = sum(data) / len(data) |
| deltas = GetAbsoluteValueList(GetDeltas(data, mean)) |
| normalized_deltas = GetNormalizedList(deltas, GetMedian(deltas)) |
| |
| candidates = [] |
| for i in range(len(normalized_deltas)): |
| if normalized_deltas[i] > 3: |
| candidates.append({'position': i, |
| 'value': data[i], |
| 'delta': normalized_deltas[i]}) |
| # Sort by delta descending, then drop all but max_to_remove elements. |
| candidates.sort(cmp=lambda a, b: cmp(b['delta'], a['delta'])) |
| candidates = candidates[:max_to_remove] |
| # Sort by position descending, then pop those items from the list. |
| candidates.sort(cmp=lambda a, b: cmp(b['position'], a['position'])) |
| for candidate in candidates: |
| data.pop(candidate['position']) |
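# A worked sketch: for data = [10.0] * 9 + [100.0] the mean is 19.0, the
# absolute deltas from the mean are [9.0] * 9 + [81.0], and their median is
# 9.0. Only the 100.0 entry normalizes to 81.0 / 9.0 = 9.0 > 3, so it is the
# single candidate and is popped from the list in place.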
| |
| |
| def CalculateDeltaAndVar(perfkey, buckets, expectations): |
| """Calculate the delta and variance for a bucket. |
| |
| Store the resulting delta and variance in the expectations dictionary.""" |
| data = buckets[perfkey]['delta'] |
| |
| # Calculate the delta of the data set by averaging the observed max and min |
| # values. |
| delta_max = round(max(data), 0) |
| delta_min = round(min(data), 0) |
| delta = int((delta_max + delta_min)/2.0) |
| |
| # Get the mean and median of the data set. |
| mean = sum(data) / len(data) |
| median = GetMedian(data) |
| |
  # Estimate the noise level by tripling the absolute difference between the
  # median and the mean. Also see RemoveNoise().
  noise = abs((median - mean) * 3)

  # Get the max allowed point by subtracting the delta from the max delta, and
  # adding the noise level.
  max_allowed_point = abs(delta_max - delta + noise)

  # The variance is the max allowed point divided by 1.5 (the amount the
  # variance is multiplied by in the Buildbot).
  var = int(round(max_allowed_point / 1.5 + 1, 0))
| if var < 0: |
| raise Exception("variance for %s is negative" % perfkey) |
| expectations[perfkey] = {'delta': delta, 'var': var} |
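# A worked sketch (hypothetical delta list): for data = [4.2, 5.1, 3.9, 6.0],
# delta_max = 6.0 and delta_min = 4.0, so delta = int(10.0 / 2.0) = 5. The
# mean is 4.8 and the median 4.65, giving noise = |(4.65 - 4.8) * 3| = 0.45.
# Then max_allowed_point = |6.0 - 5 + 0.45| = 1.45 and
# var = int(round(1.45 / 1.5 + 1)) = 2, so the stored expectation is
# {'delta': 5, 'var': 2}.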
| |
| |
def WriteJson(filename, data):
  """Writes the dict of expectations in |data| to the file |filename|."""
  try:
    file = open(filename, 'w')
  except IOError, e:
    print >> sys.stderr, ("I/O Error writing file %s(%s): %s" %
                          (filename, e.errno, e.strerror))
    raise e

  contentslist = []
  contentslist.append('{')
  keys = data.keys()
  keys.sort()
  for key in keys:
    delta = '{"delta": %s, "var": %s}' % (data[key]['delta'],
                                          data[key]['var'])
    contentslist.append(' "%s": %s,' % (key, delta))
  contentslist.append(' "load": true')
  contentslist.append('}')
  contents = "\n".join(contentslist) + "\n"
  file.write(contents)
  file.close()
  return True
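# A sketch of the resulting perf_expectations.json (hypothetical key):
#   {
#    "xp-release/moz/times/t": {"delta": 5, "var": 2},
#    "load": true
#   }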
| |
| |
def IsWhitelistedPath(path):
  """Checks a path against a whitelisted set of regular expressions."""
  whitelist = [r'^(.*/)?([^/]+)/([^/]+)/times-summary\.dat',
               r'^(.*/)?xp-release-dual-core/moz/([^/]+)-summary\.dat',
               r'^(.*/)?([^/]+)/dromaeo/score-summary\.dat',
               r'^(.*/)?([^/]+)/startup/warm-summary\.dat']
  for match in whitelist:
    if re.match(match, path):
      return True
  return False
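# Examples (hypothetical paths):
#   IsWhitelistedPath('xp-release/moz/times-summary.dat') -> True
#   IsWhitelistedPath('xp-release/moz/pages-summary.dat') -> False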
| |
| |
def IsBlacklistedPath(path):
  """Checks a path against a blacklisted set of regular expressions."""
  blacklist = [r'^(.*/)?linux-release/([^/]+)/([^/]+)-summary\.dat',
               r'^(.*/)?linux-release-lowmem/([^/]+)/([^/]+)-summary\.dat',
               r'^(.*/)?linux-release-w([^/]+)/([^/]+)/([^/]+)-summary\.dat',
               r'^(.*/)?mac-release-10\.5-([^/]+)/([^/]+)/([^/]+)-summary\.dat',
               r'^(.*/)?mac-release/([^/]+)/([^/]+)-summary\.dat',
               r'^(.*/)?vista-release-single-core/([^/]+)/([^/]+)-summary\.dat',
               r'^(.*/)?xp-release-single-core/([^/]+)/([^/]+)-summary\.dat',
               r'^(.*/)?xp-release-v8-latest/([^/]+)/([^/]+)-summary\.dat',
               r'^(.*/)?xp-release-webkit-latest/([^/]+)/([^/]+)-summary\.dat',
               r'^(.*/)?xp-release/([^/]+)/([^/]+)-summary\.dat']
  for match in blacklist:
    if re.match(match, path):
      return True
  return False
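# Example: IsBlacklistedPath('xp-release/moz/times-summary.dat') -> True, so
# Main() skips this path even though it is also whitelisted.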
| |
| |
| def Main(args): |
| parser = optparse.OptionParser(usage=USAGE, version=__version__) |
| options, args = parser.parse_args(args) |
| |
  # Get the directories the user wants to work in; if none are given, assume
  # the current working directory.
  options.dir = args[1:]
  if not options.dir:
    options.dir.append('.')
| |
| expectations = {} |
| for dir in options.dir: |
| for filename in GetSummaryFilelist(dir=dir): |
| if not IsWhitelistedPath(filename) or IsBlacklistedPath(filename): |
| continue |
| |
| # Get the perfid, test, and graph from filename. |
| print filename |
      m = re.match(r'^(.*/)?([^/]+)/([^/]+)/([^/]+)-summary\.dat', filename)
| if not m: |
| raise Exception('%s did not match expected format' % filename) |
| perfid = m.group(2) |
| test = m.group(3) |
| graph = m.group(4) |
| |
| # Only look at the first 200 lines since some dat files contain old trace |
| # names we can safely skip. |
| jsondata = ReadJson(filename)[:200] |
| buckets = ConvertDataIntoBuckets(perfid, test, graph, jsondata) |
| for perfkey in buckets: |
| RemoveNoise(buckets[perfkey]['delta']) |
| CalculateDeltaAndVar(perfkey, buckets, expectations) |
| WriteJson('perf_expectations.json', expectations) |
| return 0 |
| |
| |
| if __name__ == '__main__': |
| sys.exit(Main(sys.argv)) |