tools/mac/power/compare.py - chromium/src.git - Git at Google

 #!/usr/bin/env python3

 # Copyright 2021 The Chromium Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 import argparse
 import logging
 import os
 import pandas as pd
 import numpy

 from scipy import stats as scipy_stats


 def get_diamond_string(diamond_count: int):
   """Render a diamond count as the string shown in the comparison table.

   Args:
     diamond_count: Number of diamonds, 0 through 4. Anything that compares
       equal to one of those integers (for example 2.0) is accepted.

   Returns:
     "~" for 0, otherwise that many "◆" characters. Any other value matches
     nothing and yields None.
   """
   # The position in this tuple is the diamond count it represents.
   renderings = ("~", "◆", "◆◆", "◆◆◆", "◆◆◆◆")
   for count, rendering in enumerate(renderings):
     if diamond_count == count:
       return rendering
   return None


 def get_diamonds_count(significance: pd.DataFrame):
   """Convert significance levels into "diamond" counts.

   This emulates the "diamond" significance representation that is familiar
   to UMA users.

   Args:
     significance: Values in [0, 1]; compare() passes the result of
       norm.cdf(|z_score|). An AssertionError is raised if any value lies
       outside that range.

   Returns:
     floor(|log10(p)|) clipped to [0, 4], where p = (1 - significance) * 2.
   """
   assert not (significance > 1).any().any()
   assert not (significance < 0).any().any()

   highest_significance = 0.999999
   most_diamonds = 4

   # A significance of exactly 1 would give a p-value of 0, and log10(0) is
   # undefined, so cap it just below 1.
   bounded = numpy.clip(significance, 0, highest_significance)

   # scipy_stats.norm.cdf(1.96) = 0.975 is a one-tailed figure, but we want
   # a two-tailed test in which 1.96 gives a 0.05 p-value. Doubling the
   # remaining tail corrects for that.
   two_tailed_p = (1 - bounded) * 2

   # absolute() makes the order of magnitude positive; floor() rounds it
   # down so results are never exaggerated.
   magnitude = numpy.absolute(numpy.log10(two_tailed_p))
   whole_orders = numpy.floor(magnitude)

   # 4 diamonds is the maximum no matter how small the p-value is.
   return numpy.clip(whole_orders, 0, most_diamonds)


 def compute_mean_and_stderr(summary_path: str):
   """Load a summary CSV and compute each column's mean and standard error.

   Args:
     summary_path: Path of the CSV file to read.

   Returns:
     A DataFrame indexed by the CSV's column names, with a "mean" column and
     a "stderr" column.
   """
   samples = pd.read_csv(summary_path)

   # NaNs are skipped because no line has all measurements: the data sources
   # in power_sampler and power_metrics have different sampling rates.
   column_means = samples.mean(skipna=True)

   # Standard error of each column: the standard deviation divided by the
   # square root of the number of non-NaN samples.
   sample_counts = samples.count()
   column_stderrs = samples.std(skipna=True) / numpy.sqrt(sample_counts)

   # Name both columns up front so the join needs no suffixes or renaming.
   return column_means.to_frame("mean").join(column_stderrs.to_frame("stderr"))


 def percent_difference(first_value: pd.DataFrame, second_value: pd.DataFrame):
   """
   Returns the comparative percentage difference between two
   values/columns.

   The result is to be read as :
     |second_value| is X% smaller/larger than |first_value|.

   The arithmetic is applied as-is, so plain numbers (as in the examples
   below) work as well as the pandas columns that compare() passes in.

   Ex: percent_difference(20, 10) --> -50
   Ex: percent_difference(10, 50) --> 400

   Args:
     first_value: The reference value(s) the difference is relative to.
     second_value: The value(s) being compared against the reference.

   Returns:
     ((second_value - first_value) / first_value) * 100.
   """

   return ((second_value - first_value) / first_value) * 100


 def compare(data_dir: str, baseline_summary: str, alternative_summary: str):
   """Open two summary files and compare their values. Saves the results
   in data_dir.

   For every column of the summaries this computes the mean and standard
   error on each side, the percent difference between the means, a
   two-sample z-score, the matching significance level and its "diamond"
   rendering. The resulting table is logged and written to
   comparison_summary.csv inside data_dir.

   Args:
     data_dir: The directory to save the comparison csv in.
     baseline_summary: summary.csv for the baseline.
     alternative_summary: summary.csv for the comparison.
   """

   # Get names of the browsers being compared from the paths: the part of
   # the containing directory's name that precedes the first "_".
   baseline_name = os.path.basename(
       os.path.dirname(baseline_summary)).split("_")[0]
   alternative_name = os.path.basename(
       os.path.dirname(alternative_summary)).split("_")[0]

   # Extract mean and stderr values for each column of each summary into a
   # new dataframe.
   baseline_stats = compute_mean_and_stderr(baseline_summary)
   alternative_stats = compute_mean_and_stderr(alternative_summary)

   # Join the calculated values for both browsers into a single dataframe.
   # NOTE(review): if both directory names share the same prefix the two
   # suffixes are identical -- confirm callers always use distinct names.
   comparison_summary = baseline_stats.join(alternative_stats,
                                            lsuffix=f"_{baseline_name}",
                                            rsuffix=f"_{alternative_name}")

   # Calculate the difference in percent between the baseline and comparison.
   comparison_summary["difference"] = percent_difference(
       baseline_stats["mean"], alternative_stats["mean"])

   # Two-sample z-test: difference of the means over the combined standard
   # error.
   # See https://www.cliffsnotes.com/study-guides/statistics/univariate-inferential-tests/two-sample-z-test-for-comparing-two-means
   mean_delta = baseline_stats["mean"] - alternative_stats["mean"]
   combined_stderr = numpy.sqrt(baseline_stats["stderr"]**2 +
                                alternative_stats["stderr"]**2)
   comparison_summary["z_score"] = mean_delta / combined_stderr

   # See  https://machinelearningmastery.com/critical-values-for-statistical-hypothesis-testing/
   comparison_summary["significance_level"] = scipy_stats.norm.cdf(
       abs(comparison_summary["z_score"]))

   diamond_count = get_diamonds_count(comparison_summary["significance_level"])
   comparison_summary["diamonds"] = diamond_count.apply(get_diamond_string)

   # Drop results for which comparing the mean makes no sense.
   # errors="ignore" keeps a summary that lacks one of these columns from
   # aborting the whole comparison with a KeyError.
   comparison_summary = comparison_summary.drop([
       'battery_max_capacity', 'battery_current_capacity', 'sample_time',
       'elapsed_ns'
   ],
                                                errors="ignore")

   # Display and save results.
   logging.info(comparison_summary)
   comparison_summary.to_csv(os.path.join(data_dir, "comparison_summary.csv"))


 def main():
   """Parse the command line, check the inputs and run the comparison.

   Expects --output_dir, --baseline_dir and --alternative_dir; --verbose
   switches logging from INFO to DEBUG. Exits with status -1 if summary.csv
   is missing from the baseline or the alternative directory.
   """
   # Imported locally: the module-level imports do not include sys, so the
   # sys.exit() call below would otherwise raise NameError.
   import sys

   parser = argparse.ArgumentParser(
       description='Compares two summary files for analysis.')
   parser.add_argument("--output_dir",
                       help="Directory where to write the comparison file.",
                       required=True)
   parser.add_argument("--baseline_dir",
                       help="Directory containing the baseline benchmark data.",
                       required=True)
   parser.add_argument(
       "--alternative_dir",
       help="Directory containing the alternative benchmark data.",
       required=True)
   parser.add_argument('--verbose',
                       action='store_true',
                       help='Print verbose output.')
   args = parser.parse_args()

   log_level = logging.DEBUG if args.verbose else logging.INFO
   logging.basicConfig(format='%(levelname)s: %(message)s', level=log_level)

   baseline_summary_path = os.path.join(args.baseline_dir, "summary.csv")
   alternative_summary_path = os.path.join(args.alternative_dir, "summary.csv")

   # Fail early, before any computation, if either input is missing.
   for summary in (baseline_summary_path, alternative_summary_path):
     if not os.path.isfile(summary):
       logging.error("summary.csv missing in %s.", summary)
       sys.exit(-1)

   compare(args.output_dir, baseline_summary_path, alternative_summary_path)


 if __name__ == "__main__":
   # Print floats with six fixed decimals so pandas does not fall back to
   # scientific notation when the comparison table is displayed.
   pd.set_option("display.float_format", '{:.6f}'.format)

   main()
	#!/usr/bin/env python3

	# Copyright 2021 The Chromium Authors
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	import argparse
	import logging
	import os
	import pandas as pd
	import numpy

	from scipy import stats as scipy_stats


	def get_diamond_string(diamond_count: int):
	  """Render a diamond count as the string shown in the comparison table.

	  Args:
	    diamond_count: Number of diamonds, 0 through 4. Anything that compares
	      equal to one of those integers (for example 2.0) is accepted.

	  Returns:
	    "~" for 0, otherwise that many "◆" characters. Any other value matches
	    nothing and yields None.
	  """
	  # The position in this tuple is the diamond count it represents.
	  renderings = ("~", "◆", "◆◆", "◆◆◆", "◆◆◆◆")
	  for count, rendering in enumerate(renderings):
	    if diamond_count == count:
	      return rendering
	  return None


	def get_diamonds_count(significance: pd.DataFrame):
	  """Convert significance levels into "diamond" counts.

	  This emulates the "diamond" significance representation that is familiar
	  to UMA users.

	  Args:
	    significance: Values in [0, 1]; compare() passes the result of
	      norm.cdf(|z_score|). An AssertionError is raised if any value lies
	      outside that range.

	  Returns:
	    floor(|log10(p)|) clipped to [0, 4], where p = (1 - significance) * 2.
	  """
	  assert not (significance > 1).any().any()
	  assert not (significance < 0).any().any()

	  # A significance of exactly 1 would give a p-value of 0, and log10(0) is
	  # undefined, so cap it just below 1.
	  significance = numpy.clip(significance, 0, 0.999999)

	  # scipy_stats.norm.cdf(1.96) = 0.975 is a one-tailed figure, but we want
	  # a two-tailed test in which 1.96 gives a 0.05 p-value. Doubling the
	  # remaining tail corrects for that.
	  p_value = (1 - significance) * 2

	  # absolute() makes the order of magnitude positive; floor() rounds it
	  # down so results are never exaggerated.
	  log_p_value = numpy.floor(numpy.absolute(numpy.log10(p_value)))

	  # 4 diamonds is the maximum no matter how small the p-value is.
	  return numpy.clip(log_p_value, 0, 4)


	def compute_mean_and_stderr(summary_path: str):
	  """Load a summary CSV and compute each column's mean and standard error.

	  Args:
	    summary_path: Path of the CSV file to read.

	  Returns:
	    A DataFrame indexed by the CSV's column names, with a "mean" column and
	    a "stderr" column.
	  """
	  df = pd.read_csv(summary_path)

	  # NaNs are skipped because no line has all measurements: the data sources
	  # in power_sampler and power_metrics have different sampling rates.
	  means = df.mean(skipna=True)

	  # Standard error of each column: the standard deviation divided by the
	  # square root of the number of non-NaN samples.
	  stderrs = df.std(skipna=True) / numpy.sqrt(df.count())

	  # Name both columns up front so the join needs no suffixes or renaming.
	  return means.to_frame("mean").join(stderrs.to_frame("stderr"))


	def percent_difference(first_value: pd.DataFrame, second_value: pd.DataFrame):
	  """
	  Returns the comparative percentage difference between two
	  values/columns.

	  The result is to be read as :
	    |second_value| is X% smaller/larger than |first_value|.

	  The arithmetic is applied as-is, so plain numbers (as in the examples
	  below) work as well as the pandas columns that compare() passes in.

	  Ex: percent_difference(20, 10) --> -50
	  Ex: percent_difference(10, 50) --> 400

	  Args:
	    first_value: The reference value(s) the difference is relative to.
	    second_value: The value(s) being compared against the reference.

	  Returns:
	    ((second_value - first_value) / first_value) * 100.
	  """

	  return ((second_value - first_value) / first_value) * 100


	def compare(data_dir: str, baseline_summary: str, alternative_summary: str):
	  """Open two summary files and compare their values. Saves the results
	  in data_dir.

	  For every column of the summaries this computes the mean and standard
	  error on each side, the percent difference between the means, a
	  two-sample z-score, the matching significance level and its "diamond"
	  rendering. The resulting table is logged and written to
	  comparison_summary.csv inside data_dir.

	  Args:
	    data_dir: The directory to save the comparison csv in.
	    baseline_summary: summary.csv for the baseline.
	    alternative_summary: summary.csv for the comparison.
	  """

	  # Get names of the browsers being compared from the paths: the part of
	  # the containing directory's name that precedes the first "_".
	  baseline_name = os.path.basename(
	      os.path.dirname(baseline_summary)).split("_")[0]
	  alternative_name = os.path.basename(
	      os.path.dirname(alternative_summary)).split("_")[0]

	  # Extract mean and stderr values for each column of each summary into a
	  # new dataframe.
	  baseline_stats = compute_mean_and_stderr(baseline_summary)
	  alternative_stats = compute_mean_and_stderr(alternative_summary)

	  # Join the calculated values for both browsers into a single dataframe.
	  comparison_summary = baseline_stats.join(alternative_stats,
	                                           lsuffix=f"_{baseline_name}",
	                                           rsuffix=f"_{alternative_name}")

	  # Calculate the difference in percent between the baseline and comparison.
	  comparison_summary["difference"] = percent_difference(
	      baseline_stats["mean"], alternative_stats["mean"])

	  # Two-sample z-test: difference of the means over the combined standard
	  # error.
	  # See https://www.cliffsnotes.com/study-guides/statistics/univariate-inferential-tests/two-sample-z-test-for-comparing-two-means
	  comparison_summary["z_score"] = (baseline_stats["mean"] -
	                                   alternative_stats["mean"]) / numpy.sqrt(
	                                       pow(baseline_stats["stderr"], 2) +
	                                       pow(alternative_stats["stderr"], 2))

	  # See https://machinelearningmastery.com/critical-values-for-statistical-hypothesis-testing/
	  comparison_summary["significance_level"] = scipy_stats.norm.cdf(
	      abs(comparison_summary["z_score"]))

	  diamond_count = get_diamonds_count(comparison_summary["significance_level"])
	  comparison_summary["diamonds"] = diamond_count.apply(get_diamond_string)

	  # Drop results for which comparing the mean makes no sense.
	  # errors="ignore" keeps a summary that lacks one of these columns from
	  # aborting the whole comparison with a KeyError.
	  comparison_summary = comparison_summary.drop([
	      'battery_max_capacity', 'battery_current_capacity', 'sample_time',
	      'elapsed_ns'
	  ],
	                                               errors="ignore")

	  # Display and save results.
	  logging.info(comparison_summary)
	  comparison_summary.to_csv(os.path.join(data_dir, "comparison_summary.csv"))


	def main():
	  """Parse the command line, check the inputs and run the comparison.

	  Expects --output_dir, --baseline_dir and --alternative_dir; --verbose
	  switches logging from INFO to DEBUG. Exits with status -1 if summary.csv
	  is missing from the baseline or the alternative directory.
	  """
	  # Imported locally: the module-level imports do not include sys, so the
	  # sys.exit() call below would otherwise raise NameError.
	  import sys

	  parser = argparse.ArgumentParser(
	      description='Compares two summary files for analysis.')
	  parser.add_argument("--output_dir",
	                      help="Directory where to write the comparison file.",
	                      required=True)
	  parser.add_argument("--baseline_dir",
	                      help="Directory containing the baseline benchmark data.",
	                      required=True)
	  parser.add_argument(
	      "--alternative_dir",
	      help="Directory containing the alternative benchmark data.",
	      required=True)
	  parser.add_argument('--verbose',
	                      action='store_true',
	                      help='Print verbose output.')
	  args = parser.parse_args()

	  if args.verbose:
	    log_level = logging.DEBUG
	  else:
	    log_level = logging.INFO
	  logging.basicConfig(format='%(levelname)s: %(message)s', level=log_level)

	  baseline_summary_path = os.path.join(args.baseline_dir, "summary.csv")
	  alternative_summary_path = os.path.join(args.alternative_dir, "summary.csv")
	  summaries = [baseline_summary_path, alternative_summary_path]

	  # Fail early, before any computation, if either input is missing.
	  for summary in summaries:
	    if not os.path.isfile(summary):
	      logging.error(f"summary.csv missing in {summary}.")
	      sys.exit(-1)

	  compare(args.output_dir, summaries[0], summaries[1])


	if __name__ == "__main__":

	  # Avoid scientific notation when printing numbers.
	  pd.options.display.float_format = '{:.6f}'.format

	  main()