#!/usr/bin/python
#
# Copyright (c) 2010 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
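
"""Generates perf expectations from buildbot perf summary data.

Scans the given directories for *-summary.dat files, pairs each perf trace
with its '_ref' (reference build) trace, and writes the expected delta and
allowed variance for each perf key to perf_expectations.json.
"""
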
import os
import sys

# Prepend the buildbot pylibs directory to our import path.
sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                '../../buildbot/pylibs'))

import fnmatch
import glob
import itertools
import math
import optparse
import re
import simplejson
import subprocess
__version__ = '1.0'
USAGE = "%prog [dir ...]"


def GetSummaryFilelist(dir=None):
  """Finds all *-summary.dat files under |dir|, up to three levels deep."""
  if not dir:
    raise Exception("No directory supplied.")
  if not os.path.exists(dir):
    raise Exception("Directory does not exist.")
  files = itertools.chain(glob.iglob('%s/*' % dir),
                          glob.iglob('%s/*/*' % dir),
                          glob.iglob('%s/*/*/*' % dir))
  summaries = [summary for summary in files
               if fnmatch.fnmatch(summary, '*-summary.dat')]
  summaries.sort()
  return summaries
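
# Example (illustrative): GetSummaryFilelist('data') might return paths such
# as 'data/xp-release-dual-core/moz/times-summary.dat'.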


def ReadJson(filename):
  """Reads a JSON file and returns its contents as a list of Python objects.

  Each non-empty line of the file is parsed as a separate JSON value.
  """
  try:
    file = open(filename, 'r')
  except IOError, e:
    print >> sys.stderr, ("I/O Error reading file %s(%s): %s" %
                          (filename, e.errno, e.strerror))
    raise
  data = []
  contents = file.read()
  contentslist = contents.split("\n")
  for jsontext in contentslist:
    if not jsontext:
      continue
    try:
      json = simplejson.loads(jsontext,
                              object_pairs_hook=simplejson.OrderedDict)
    except ValueError, e:
      print >> sys.stderr, ("Error parsing file %s: '%s'" %
                            (filename, jsontext))
      raise
    data.append(json)
  file.close()
  return data
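
# Each line of a summary .dat file is a JSON object whose 'traces' member maps
# trace names to lists of strings, with the measured value first. A minimal,
# illustrative example (values are made up):
#   {"traces": {"total": ["123.4", "1.2"], "total_ref": ["120.0", "1.5"]}}
# ConvertDataIntoBuckets() below pairs each trace with its '_ref' counterpart.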


def ConvertDataIntoBuckets(perfid, test, graph, data):
  """Convert an array of JSON data into a groomed collection of perf data.

  For each item in the JSON data array, locate the '_ref' trace which indicates
  the reference build's data. Locate the current build's trace corresponding to
  the reference build. Store both values in new arrays within the groomed
  collection under a key composed of the perf system ID, the test ID, the graph
  ID, and the trace name.

  Assume that the data has stabilized recently and only keep the first 30
  values for each bucket.
  """
  buckets = {}
  for cl in data:
    keys = cl['traces'].keys()
    for refkey in keys:
      m = re.match(r'^(.*)_ref$', refkey)
      if not m:
        continue
      tracekey = m.group(1)
      if tracekey not in cl['traces']:
        # Sometimes the current build measure is missing due to a failure.
        continue
      perfkey = "%s/%s/%s/%s" % (perfid, test, graph, tracekey)
      buckets.setdefault(perfkey, {})
      # Get current build data.
      buckets[perfkey].setdefault('current', [])
      current = float(cl['traces'][tracekey][0])
      buckets[perfkey]['current'].append(current)
      # Get reference build data.
      buckets[perfkey].setdefault('ref', [])
      ref = float(cl['traces'][refkey][0])
      buckets[perfkey]['ref'].append(ref)
      # Get delta.
      buckets[perfkey].setdefault('delta', [])
      buckets[perfkey]['delta'].append(current - ref)
      # Only keep the first 30 values for each bucket.
      buckets[perfkey]['current'] = buckets[perfkey]['current'][:30]
      buckets[perfkey]['ref'] = buckets[perfkey]['ref'][:30]
      buckets[perfkey]['delta'] = buckets[perfkey]['delta'][:30]
  return buckets
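
# The resulting buckets look like (illustrative key and values):
#   {'xp-release-dual-core/moz/times/total':
#        {'current': [123.4, ...], 'ref': [120.0, ...], 'delta': [3.4, ...]}}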


def GetDeltas(data, value=None):
  """Returns a list composed by subtracting a given value from each item.

  If the value is not given or is None, the previous item is used instead and
  the returned list will be one item shorter than the given list.
  """
  deltas = []
  for i in range(len(data)):
    base = value
    if base is None:
      if i == 0:
        continue
      base = data[i - 1]
    deltas.append(data[i] - base)
  return deltas
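
# Example: GetDeltas([1, 4, 9], 2) returns [-1, 2, 7]; with no value given,
# GetDeltas([1, 4, 9]) returns [3, 5] (each item minus its predecessor).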


def GetAbsoluteValueList(data):
  """Returns a list composed of the absolute value of each item."""
  absv = []
  for i in range(len(data)):
    absv.append(abs(data[i]))
  return absv


def GetNormalizedList(data, base_value):
  """Returns a list composed by dividing each item by the given base value.

  If the base value is less than 0.0001, it is adjusted to 0.0001 to avoid
  division by zero.
  """
  if base_value < 0.0001:
    base_value = 0.0001
  normalized = []
  for i in range(len(data)):
    normalized.append(data[i] / float(base_value))
  return normalized


def GetMedian(data):
  """Returns the median for a list of data."""
  if len(data) == 0:
    return None
  sorted_data = sorted(data)
  if len(sorted_data) % 2 == 0:
    high = sorted_data[len(sorted_data) / 2]
    low = sorted_data[(len(sorted_data) / 2) - 1]
    return (float(high) + float(low)) / 2
  else:
    return sorted_data[len(sorted_data) / 2]
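
# Examples: GetMedian([1, 2, 3]) returns 2; GetMedian([4, 1, 3, 2]) returns 2.5.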


def RemoveNoise(data, max_to_remove=3):
  """Removes noise from a list by identifying potentially unusual values.

  This is a simple filter. The process is:
  1. Calculate the mean of the list.
  2. Create a list of deltas from the mean and take their absolute values.
  3. Normalize the deltas list over its median value.
  4. For each normalized delta, if that value is greater than 3, mark that
     position, value, and delta as a candidate for removal.
  The default number of noisy values to remove is 3 unless otherwise
  overridden. This method sorts the candidates by delta descending, then
  reduces the candidate list to the maximum number allowed. The candidate
  values are then removed from the input list in place.
  """
  mean = sum(data) / len(data)
  deltas = GetAbsoluteValueList(GetDeltas(data, mean))
  normalized_deltas = GetNormalizedList(deltas, GetMedian(deltas))
  candidates = []
  for i in range(len(normalized_deltas)):
    if normalized_deltas[i] > 3:
      candidates.append({'position': i,
                         'value': data[i],
                         'delta': normalized_deltas[i]})
  # Sort by delta descending, then drop all but max_to_remove elements.
  candidates.sort(key=lambda c: c['delta'], reverse=True)
  candidates = candidates[:max_to_remove]
  # Sort by position descending, then pop those items from the list.
  candidates.sort(key=lambda c: c['position'], reverse=True)
  for candidate in candidates:
    data.pop(candidate['position'])
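
# Worked example (made-up data): for data = [10, 10, 10, 10, 100] the mean is
# 28, the absolute deltas from the mean are [18, 18, 18, 18, 72], their median
# is 18, and the normalized deltas are [1, 1, 1, 1, 4]. Only the last value
# exceeds 3, so 100 is removed and data becomes [10, 10, 10, 10].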


def CalculateDeltaAndVar(perfkey, buckets, expectations):
  """Calculate the delta and variance for a bucket.

  Store the resulting delta and variance in the expectations dictionary.
  """
  data = buckets[perfkey]['delta']
  # Calculate the delta of the data set by averaging the observed max and min
  # values.
  delta_max = round(max(data), 0)
  delta_min = round(min(data), 0)
  delta = int((delta_max + delta_min) / 2.0)
  # Get the mean and median of the data set.
  mean = sum(data) / len(data)
  median = GetMedian(data)
  # Find the noise level by tripling the median/mean difference. Also see
  # RemoveNoise().
  buffer = abs((median - mean) * 3)
  # Get the max allowed point by subtracting the delta from the max delta, and
  # adding the buffer.
  max_allowed_point = abs(delta_max - delta + buffer)
  # The variance is the max allowed point divided by 1.5 (the amount the
  # variance is multiplied by in the Buildbot).
  var = int(round(max_allowed_point / 1.5 + 1, 0))
  if var < 0:
    raise Exception("variance for %s is negative" % perfkey)
  expectations[perfkey] = {'delta': delta, 'var': var}
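
# Worked example (made-up deltas): for [4.0, 6.0, 5.0], delta_max is 6 and
# delta_min is 4, so delta is 5. The mean and median are both 5.0, so buffer
# is 0, max_allowed_point is abs(6 - 5 + 0) = 1, and var is
# int(round(1 / 1.5 + 1)) = 2. The stored expectation is {'delta': 5, 'var': 2}.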


def WriteJson(filename, data):
  """Writes the perf expectation entries in |data| to |filename| as JSON."""
  try:
    file = open(filename, 'w')
  except IOError, e:
    print >> sys.stderr, ("I/O Error writing file %s(%s): %s" %
                          (filename, e.errno, e.strerror))
    raise
  contentslist = []
  contentslist.append('{')
  keys = data.keys()
  keys.sort()
  for json in keys:
    delta = '{"delta": %s, "var": %s}' % (data[json]['delta'],
                                          data[json]['var'])
    contentslist.append(' "%s": %s,' % (json, delta))
  contentslist.append(' "load": true')
  contentslist.append('}')
  contents = "\n".join(contentslist) + "\n"
  file.write(contents)
  file.close()
  return True
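
# Example output file contents (illustrative key and values):
# {
#  "xp-release-dual-core/moz/times/total": {"delta": 5, "var": 2},
#  "load": true
# }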


def IsWhitelistedPath(path):
  """Check path against a whitelisted set of regular expressions."""
  whitelist = [r'^(.*/)?([^/]+)/([^/]+)/times-summary.dat',
               r'^(.*/)?xp-release-dual-core/moz/([^/]+)-summary.dat',
               r'^(.*/)?([^/]+)/dromaeo/score-summary.dat',
               r'^(.*/)?([^/]+)/startup/warm-summary.dat']
  for match in whitelist:
    if re.match(match, path):
      return True
  return False


def IsBlacklistedPath(path):
  """Check path against a blacklisted set of regular expressions."""
  blacklist = [r'^(.*/)?linux-release/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?linux-release-lowmem/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?linux-release-w([^/]+)/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?mac-release-10.5-([^/]+)/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?mac-release/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?vista-release-single-core/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?xp-release-single-core/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?xp-release-v8-latest/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?xp-release-webkit-latest/([^/]+)/([^/]+)-summary.dat',
               r'^(.*/)?xp-release/([^/]+)/([^/]+)-summary.dat']
  for match in blacklist:
    if re.match(match, path):
      return True
  return False
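
# For example (illustrative paths), 'xp-release-dual-core/moz/times-summary.dat'
# is whitelisted, while 'linux-release/moz/times-summary.dat' is blacklisted.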


def Main(args):
  parser = optparse.OptionParser(usage=USAGE, version=__version__)
  options, args = parser.parse_args(args)
  # Get the given directories the user wants to work in.
  options.dir = []
  if len(args) > 1:
    options.dir.extend(args[1:])
  # If no directories are given, assume the current working directory.
  if len(options.dir) == 0:
    options.dir.append('.')
  expectations = {}
  for dir in options.dir:
    for filename in GetSummaryFilelist(dir=dir):
      if not IsWhitelistedPath(filename) or IsBlacklistedPath(filename):
        continue
      # Get the perfid, test, and graph from filename.
      print filename
      m = re.match(r'^(.*/)?([^/]+)/([^/]+)/([^/]+)-summary.dat', filename)
      if not m:
        raise Exception('%s did not match expected format' % filename)
      perfid = m.group(2)
      test = m.group(3)
      graph = m.group(4)
      # Only look at the first 200 lines since some dat files contain old
      # trace names we can safely skip.
      jsondata = ReadJson(filename)[:200]
      buckets = ConvertDataIntoBuckets(perfid, test, graph, jsondata)
      for perfkey in buckets:
        RemoveNoise(buckets[perfkey]['delta'])
        CalculateDeltaAndVar(perfkey, buckets, expectations)
  WriteJson('perf_expectations.json', expectations)
  return 0


if __name__ == '__main__':
  sys.exit(Main(sys.argv))