// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "media/learning/impl/distribution_reporter.h"
#include "base/bind.h"
#include "base/metrics/histogram_functions.h"
#include "services/metrics/public/cpp/ukm_builders.h"
#include "services/metrics/public/cpp/ukm_recorder.h"
namespace media {
namespace learning {
// UMA histogram base names.
static const char* kAggregateBase = "Media.Learning.BinaryThreshold.Aggregate.";
static const char* kByTrainingWeightBase =
"Media.Learning.BinaryThreshold.ByTrainingWeight.";
static const char* kByFeatureBase = "Media.Learning.BinaryThreshold.ByFeature.";
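// The LearningTask name is appended to one of these bases to form the full
// histogram name (see the base + task().name calls below).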
enum /* not class */ Bits {
// These are meant to be bitwise-or'd together, so both false cases just mean
// "don't set any bits".
PredictedFalse = 0x00,
ObservedFalse = 0x00,
ObservedTrue = 0x01,
PredictedTrue = 0x02,
// Special value to mean that no prediction was made.
PredictedNothing = 0x04,
};
// Low order bit is "observed", second bit is "predicted", third bit is
// "could not make a prediction".
enum class ConfusionMatrix {
TrueNegative = Bits::PredictedFalse | Bits::ObservedFalse,
FalseNegative = Bits::PredictedFalse | Bits::ObservedTrue,
FalsePositive = Bits::PredictedTrue | Bits::ObservedFalse,
TruePositive = Bits::PredictedTrue | Bits::ObservedTrue,
SkippedNegative = Bits::PredictedNothing | Bits::ObservedFalse,
SkippedPositive = Bits::PredictedNothing | Bits::ObservedTrue,
kMaxValue = SkippedPositive
};
// TODO(liberato): Currently, this implementation is a hack to collect some
// sanity-checking data for local learning with MediaCapabilities. We assume
// that the prediction is the "percentage of dropped frames".
class UmaRegressionReporter : public DistributionReporter {
public:
  explicit UmaRegressionReporter(const LearningTask& task)
      : DistributionReporter(task) {}
void OnPrediction(const PredictionInfo& info,
TargetHistogram predicted) override {
DCHECK_EQ(task().target_description.ordering,
LearningTask::Ordering::kNumeric);
// As a complete hack, record accuracy with a fixed threshold. The average
// is the observed / predicted percentage of dropped frames.
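    // For example, with a smoothness_threshold of 0.10, an observed value of
    // 0.05 counts as smooth, while an observed value of 0.20 does not.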
bool observed_smooth = info.observed.value() <= task().smoothness_threshold;
// See if we made a prediction.
int prediction_bit_mask = Bits::PredictedNothing;
if (predicted.total_counts() != 0) {
bool predicted_smooth =
predicted.Average() <= task().smoothness_threshold;
DVLOG(2) << "Learning: " << task().name
<< ": predicted: " << predicted_smooth << " ("
<< predicted.Average() << ") observed: " << observed_smooth
<< " (" << info.observed.value() << ")";
prediction_bit_mask =
predicted_smooth ? Bits::PredictedTrue : Bits::PredictedFalse;
} else {
DVLOG(2) << "Learning: " << task().name
<< ": predicted: N/A observed: " << observed_smooth << " ("
<< info.observed.value() << ")";
}
// Figure out the ConfusionMatrix enum value.
ConfusionMatrix confusion_matrix_value = static_cast<ConfusionMatrix>(
(observed_smooth ? Bits::ObservedTrue : Bits::ObservedFalse) |
prediction_bit_mask);
    // |uma_bucket_number| is the bucket number that we'll fill in with this
    // count. It ranges from 0 to |max_buckets-1|, inclusive. Each bucket is
    // separated from the start of the previous bucket by |uma_bucket_size|.
int uma_bucket_number = 0;
constexpr int matrix_size =
static_cast<int>(ConfusionMatrix::kMaxValue) + 1;
    // The enums.xml entries separate the buckets by 10, to make it easy to see
    // by inspection what bucket number we're in (e.g., x-axis position 23 is
    // bucket 2 * 10 + PredictedTrue|ObservedTrue). The label in enums.xml for
    // MegaConfusionMatrix also provides the bucket number for easy reading.
constexpr int uma_bucket_size = 10;
DCHECK_LE(matrix_size, uma_bucket_size);
// Maximum number of buckets defined in enums.xml, numbered from 0.
constexpr int max_buckets = 16;
// Sparse histograms can technically go past 100 exactly-stored elements,
// but we limit it anyway. Note that we don't care about |uma_bucket_size|,
// since it's a sparse histogram. Only |matrix_size| elements are used in
// each bucket.
DCHECK_LE(max_buckets * matrix_size, 100);
// If we're splitting by feature, then record it and stop. The others
// aren't meaningful to record if we're using random feature subsets.
if (task().uma_hacky_by_feature_subset_confusion_matrix &&
feature_indices() && feature_indices()->size() == 1) {
// The bucket number is just the feature number that was selected.
uma_bucket_number =
std::min(*feature_indices()->begin(), max_buckets - 1);
std::string base(kByFeatureBase);
base::UmaHistogramSparse(base + task().name,
static_cast<int>(confusion_matrix_value) +
uma_bucket_number * uma_bucket_size);
// Early return since no other measurements are meaningful when we're
// using feature subsets.
return;
}
// If we're selecting a feature subset that's bigger than one but smaller
// than all of them, then we don't know how to report that.
if (feature_indices() &&
feature_indices()->size() != task().feature_descriptions.size()) {
return;
}
// Do normal reporting.
// Record the aggregate confusion matrix.
if (task().uma_hacky_aggregate_confusion_matrix) {
std::string base(kAggregateBase);
base::UmaHistogramEnumeration(base + task().name, confusion_matrix_value);
}
if (task().uma_hacky_by_training_weight_confusion_matrix) {
// Adjust |uma_bucket_offset| by the training weight, and store the
// results in that bucket in the ByTrainingWeight histogram.
//
// This will bucket from 0 in even steps, with the last bucket holding
// |max_reporting_weight+1| and everything above it.
const int n_buckets = task().num_reporting_weight_buckets;
DCHECK_LE(n_buckets, max_buckets);
// If the max reporting weight is zero, then default to splitting the
// buckets evenly, with the last bucket being "completely full set".
const int max_reporting_weight = task().max_reporting_weight
? task().max_reporting_weight
: task().max_data_set_size - 1;
// We use one fewer buckets, to save one for the overflow. Buckets are
// numbered from 0 to |n_buckets-1|, inclusive. In other words, when the
// training weight is equal to |max_reporting_weight|, we still want to
// be in bucket |n_buckets - 2|. That's why we add one to the max before
// we divide; only things over the max go into the last bucket.
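      // For example, with |n_buckets| = 16 and |max_reporting_weight| = 999, a
      // training weight of 500 maps to bucket 15 * 500 / 1000 = 7 (truncated),
      // a weight of 999 maps to bucket 14, and any weight of 1000 or more is
      // clamped into the final bucket, 15.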
uma_bucket_number =
std::min<int>((n_buckets - 1) * info.total_training_weight /
(max_reporting_weight + 1),
n_buckets - 1);
std::string base(kByTrainingWeightBase);
base::UmaHistogramSparse(base + task().name,
static_cast<int>(confusion_matrix_value) +
uma_bucket_number * uma_bucket_size);
}
}
};
// UKM-based reporter.
class UkmRegressionReporter : public DistributionReporter {
public:
  explicit UkmRegressionReporter(const LearningTask& task)
      : DistributionReporter(task) {}
void OnPrediction(const PredictionInfo& info,
TargetHistogram predicted) override {
DCHECK_EQ(task().target_description.ordering,
LearningTask::Ordering::kNumeric);
DCHECK_NE(info.source_id, ukm::kInvalidSourceId);
ukm::UkmRecorder* ukm_recorder = ukm::UkmRecorder::Get();
if (!ukm_recorder)
return;
ukm::builders::Media_Learning_PredictionRecord builder(info.source_id);
builder.SetLearningTask(task().GetId());
builder.SetObservedValue(Bucketize(info.observed.value()));
builder.SetPredictedValue(Bucketize(predicted.Average()));
builder.SetTrainingDataTotalWeight(info.total_training_weight);
builder.SetTrainingDataSize(info.total_training_examples);
// TODO(liberato): we'd add feature subsets here.
builder.Record(ukm_recorder);
}
// Scale and translate |value| from the range specified in the task to 0-100.
// We scale it so that the buckets have an equal amount of the input range in
// each of them.
int Bucketize(double value) {
const int output_min = 0;
const int output_max = 100;
// Scale it so that input_min -> output_min and input_max -> output_max.
// Note that the input width is |input_max - input_min|, but there are
// |output_max - output_min + 1| output buckets. That's why we don't
// add one to the denominator, but we do add one to the numerator.
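    // For example, with ukm_min_input_value = 0.0 and ukm_max_input_value =
    // 1.0, a value of 0.25 becomes 101 * 0.25 = 25.25 and truncates to bucket
    // 25, while a value of 1.0 becomes 101 and is clipped down to 100.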
double scaled_value =
((output_max - output_min + 1) * (value - task().ukm_min_input_value)) /
(task().ukm_max_input_value - task().ukm_min_input_value) +
output_min;
    // Truncate to an integer and clip to [0, 100].
return std::min(std::max(static_cast<int>(scaled_value), output_min),
output_max);
}
};
std::unique_ptr<DistributionReporter> DistributionReporter::Create(
const LearningTask& task) {
// We only know how to report regression tasks right now.
if (task.target_description.ordering != LearningTask::Ordering::kNumeric)
return nullptr;
// We can report hacky UMA or non-hacky UKM. We could report both if we had
// a DistributionReporter that forwarded predictions to each, but we don't.
if (task.uma_hacky_aggregate_confusion_matrix ||
task.uma_hacky_by_training_weight_confusion_matrix ||
task.uma_hacky_by_feature_subset_confusion_matrix) {
return std::make_unique<UmaRegressionReporter>(task);
} else if (task.report_via_ukm) {
return std::make_unique<UkmRegressionReporter>(task);
}
return nullptr;
}
DistributionReporter::DistributionReporter(const LearningTask& task)
: task_(task), weak_factory_(this) {}
DistributionReporter::~DistributionReporter() = default;
Model::PredictionCB DistributionReporter::GetPredictionCallback(
const PredictionInfo& info) {
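  // Bind |info| now; the returned callback then only needs the predicted
  // TargetHistogram, which is forwarded to OnPrediction along with |info|.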
return base::BindOnce(&DistributionReporter::OnPrediction,
weak_factory_.GetWeakPtr(), info);
}
void DistributionReporter::SetFeatureSubset(
const std::set<int>& feature_indices) {
feature_indices_ = feature_indices;
}
} // namespace learning
} // namespace media