Update media learning experiments.

First, this CL adds (Base | Enhanced)UnweightedTree200 experiments,
which are like the originals but with a maximum of 200 training
examples instead of 100.

Since we're focusing on unweighted regression, this CL removes:

 - BaseTable
     This didn't seem to be representative of real MediaCapabilities,
     probably due to lack of data / no data persistence.  Since we
     are focusing on unweighted anyway, BaseUnweightedTable or
     BaseUnweightedTree are better baselines.  We can also compare
     those to the upcoming unweighted MediaCapabilities experiment.

 - BaseTree
     This was an okay stand-in for MediaCapabilities, but since we
     are focusing on unweighted tasks, it's unclear that it means
     much as a baseline anymore.  BaseUnweightedTree is better to
     use as a baseline now.

 - EnhancedTree
     This didn't perform much differently than BaseTree, and was
     weighted anyway.

 - EnhancedUnweightedTree
     This didn't perform much differently than the unweighted tree,
     so no need to maintain it.  The newly-added 200 variant has
     a larger training set maximum size (200(!)), so we can see if
     that lets us get some more value from the new features.  To
     check the value of the features, compare
     EnhancedUnweightedTree200 to BaseUnweightedTree200 instead.

 - BinarySmoothness
     This isn't a regression task.

Change-Id: I18b5a8bd042d7e44ba0c2e7e3780fa4fbc31c6a5
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1606596
Reviewed-by: Ilya Sherman <isherman@chromium.org>
Reviewed-by: Chrome Cunningham <chcunningham@chromium.org>
Commit-Queue: Frank Liberato <liberato@chromium.org>
Cr-Commit-Position: refs/heads/master@{#660682}
diff --git a/media/capabilities/learning_helper.cc b/media/capabilities/learning_helper.cc
index 1903c54..eee53f0 100644
--- a/media/capabilities/learning_helper.cc
+++ b/media/capabilities/learning_helper.cc
@@ -23,27 +23,24 @@
 
 // Remember that these are used to construct UMA histogram names!  Be sure to
 // update histograms.xml if you change them!
-// Dropped frame ratio, default features, regression tree.
-const char* const kDroppedFrameRatioBaseTreeTaskName = "BaseTree";
-// Same as BaseTree, but with unweighted examples.
+
+// Dropped frame ratio, default features, unweighted regression tree.
 const char* const kDroppedFrameRatioBaseUnweightedTreeTaskName =
     "BaseUnweightedTree";
-// Dropped frame ratio, default+FeatureLibrary features, regression tree.
-const char* const kDroppedFrameRatioEnhancedTreeTaskName = "EnhancedTree";
-// Dropped frame ratio, default+FeatureLibrary features, regression tree,
-// examples are unweighted.
-const char* const kDroppedFrameRatioEnhancedUnweightedTreeTaskName =
-    "EnhancedUnweightedTree";
-// Binary smoothness, default+FeatureLibrary features, regression tree,
-// examples are unweighted.
-const char* const kBinarySmoothnessEnhancedUnweightedTreeTaskName =
-    "BinarySmoothnessTree";
-// Dropped frame ratio, default features, lookup table.
-const char* const kDroppedFrameRatioBaseTableTaskName = "BaseTable";
-// Same as BaseTable, but with unweighted examples.
+
+// Dropped frame ratio, default features, unweighted examples, lookup table.
 const char* const kDroppedFrameRatioBaseUnweightedTableTaskName =
     "BaseUnweightedTable";
 
+// Same as BaseUnweightedTree, but with 200 training examples max.
+const char* const kDroppedFrameRatioBaseUnweightedTree200TaskName =
+    "BaseUnweightedTree200";
+
+// Dropped frame ratio, default+FeatureLibrary features, regression tree with
+// unweighted examples and 200 training examples max.
+const char* const kDroppedFrameRatioEnhancedUnweightedTree200TaskName =
+    "EnhancedUnweightedTree200";
+
 // Threshold for the dropped frame to total frame ratio, at which we'll decide
 // that the playback was not smooth.
 constexpr double kSmoothnessThreshold = 0.1;
@@ -63,7 +60,7 @@
   // We only do this here since we own the session.  Normally, whatever creates
   // the session would register all the learning tasks.
   LearningTask dropped_frame_task(
-      kDroppedFrameRatioBaseTableTaskName, LearningTask::Model::kLookupTable,
+      "no name", LearningTask::Model::kLookupTable,
       {
           {"codec_profile",
            ::media::learning::LearningTask::Ordering::kUnordered},
@@ -79,52 +76,40 @@
   dropped_frame_task.uma_hacky_aggregate_confusion_matrix = true;
   dropped_frame_task.uma_hacky_by_training_weight_confusion_matrix = true;
 
-  // Pick a max reporting weight that represents the total number of frames.
-  // This will record in bucket [0, 4999], [5000, 9999], etc.  Unlike the
-  // existing mcap thresholds, these are not per-bucket.  That's why they're 10x
-  // higher than the per-bucket thresholds we're using there.  Mcap allows on
-  // the order of 2,500 frames in each of {resolution X fps X codec} buckets,
-  // while the reported training weight here would be total for the whole set.
-  // So, we multiply by about 20 to approximate the number of buckets to keep
-  // it about the same as the size of the cross product.
-  const double weighted_reporting_max = 49999.;
-  const double unweighted_reporting_max = 99.;
-  dropped_frame_task.max_reporting_weight = weighted_reporting_max;
-
-  learning_session_->RegisterTask(dropped_frame_task,
-                                  SequenceBoundFeatureProvider());
-  base_table_controller_ =
-      learning_session_->GetController(dropped_frame_task.name);
+  // Buckets will have 10 examples each, or 20 for the 200-set tasks.
+  const double data_set_size = 100;
+  const double big_data_set_size = 200;
 
   // Unweighted table
   dropped_frame_task.name = kDroppedFrameRatioBaseUnweightedTableTaskName;
-  dropped_frame_task.max_reporting_weight = unweighted_reporting_max;
+  dropped_frame_task.max_data_set_size = data_set_size;
   learning_session_->RegisterTask(dropped_frame_task,
                                   SequenceBoundFeatureProvider());
   base_unweighted_table_controller_ =
       learning_session_->GetController(dropped_frame_task.name);
 
-  // Modify the task to use ExtraTrees.
-  dropped_frame_task.name = kDroppedFrameRatioBaseTreeTaskName;
-  dropped_frame_task.model = LearningTask::Model::kExtraTrees;
-  dropped_frame_task.max_reporting_weight = weighted_reporting_max;
-  learning_session_->RegisterTask(dropped_frame_task,
-                                  SequenceBoundFeatureProvider());
-  base_tree_controller_ =
-      learning_session_->GetController(dropped_frame_task.name);
-
   // Unweighted base tree.
   dropped_frame_task.name = kDroppedFrameRatioBaseUnweightedTreeTaskName;
-  dropped_frame_task.max_reporting_weight = unweighted_reporting_max;
+  dropped_frame_task.model = LearningTask::Model::kExtraTrees;
+  dropped_frame_task.max_data_set_size = data_set_size;
   learning_session_->RegisterTask(dropped_frame_task,
                                   SequenceBoundFeatureProvider());
   base_unweighted_tree_controller_ =
       learning_session_->GetController(dropped_frame_task.name);
 
+  // Unweighted tree with a larger training set.
+  dropped_frame_task.name = kDroppedFrameRatioBaseUnweightedTree200TaskName;
+  dropped_frame_task.max_data_set_size = big_data_set_size;
+  learning_session_->RegisterTask(dropped_frame_task,
+                                  SequenceBoundFeatureProvider());
+  base_unweighted_tree_200_controller_ =
+      learning_session_->GetController(dropped_frame_task.name);
+
   // Add common features, if we have a factory.
   if (feature_factory) {
-    dropped_frame_task.name = kDroppedFrameRatioEnhancedTreeTaskName;
-    dropped_frame_task.max_reporting_weight = weighted_reporting_max;
+    dropped_frame_task.name =
+        kDroppedFrameRatioEnhancedUnweightedTree200TaskName;
+    dropped_frame_task.max_data_set_size = big_data_set_size;
     dropped_frame_task.feature_descriptions.push_back(
         {"origin", ::media::learning::LearningTask::Ordering::kUnordered});
     dropped_frame_task.feature_descriptions.push_back(
@@ -133,38 +118,7 @@
         FeatureLibrary::BatteryPower());
     learning_session_->RegisterTask(dropped_frame_task,
                                     feature_factory.Run(dropped_frame_task));
-    enhanced_tree_controller_ =
-        learning_session_->GetController(dropped_frame_task.name);
-
-    // Duplicate the task with a new name and UMA histogram.  We'll add
-    // unweighted examples to it to see which one does better.
-    dropped_frame_task.name = kDroppedFrameRatioEnhancedUnweightedTreeTaskName;
-    // Adjust the reporting weight since we'll have 100 or fewer examples.
-    dropped_frame_task.max_reporting_weight = unweighted_reporting_max;
-    learning_session_->RegisterTask(dropped_frame_task,
-                                    feature_factory.Run(dropped_frame_task));
-    enhanced_unweighted_tree_controller_ =
-        learning_session_->GetController(dropped_frame_task.name);
-
-    // Set up the binary smoothness task.  This has a nominal target, with
-    // "smooth" as 0, and "not smooth" as 1.  This is so that the low numbers
-    // are still smooth, and the hight numbers are still not smooth.  It makes
-    // reporting the same for both.
-    dropped_frame_task.name = kBinarySmoothnessEnhancedUnweightedTreeTaskName;
-    /* TODO(liberato): DistributionReporter only supports regression, so we
-       leave it as kNumeric.  Since we only add 0,1 as targets, it's probably
-       fairly close to the same thing.
-    dropped_frame_task.target_description = {
-        "is_smooth", ::media::learning::LearningTask::Ordering::kUnordered};
-    */
-    // We'll threshold the ratio when figuring out the binary label, so we just
-    // want to pick the majority.  Note that I have no idea if this is actually
-    // the best threshold, but it seems like a good place to start.
-    dropped_frame_task.smoothness_threshold = 0.5;
-    dropped_frame_task.max_reporting_weight = unweighted_reporting_max;
-    learning_session_->RegisterTask(dropped_frame_task,
-                                    feature_factory.Run(dropped_frame_task));
-    binary_tree_controller_ =
+    enhanced_unweighted_tree_200_controller_ =
         learning_session_->GetController(dropped_frame_task.name);
   }
 }
@@ -203,29 +157,16 @@
   // the examples is the right thing to do.
   example.target_value = TargetValue(
       static_cast<double>(new_stats.frames_dropped) / new_stats.frames_decoded);
+  example.weight = 1u;
 
   // Add this example to all tasks.
-  example.weight = 1u;
   AddExample(base_unweighted_table_controller_.get(), example);
   AddExample(base_unweighted_tree_controller_.get(), example);
+  AddExample(base_unweighted_tree_200_controller_.get(), example);
 
-  example.weight = new_stats.frames_decoded;
-  AddExample(base_table_controller_.get(), example);
-  AddExample(base_tree_controller_.get(), example);
-
-  if (enhanced_tree_controller_) {
+  if (enhanced_unweighted_tree_200_controller_) {
     example.features.push_back(origin);
-    example.weight = new_stats.frames_decoded;
-    AddExample(enhanced_tree_controller_.get(), example);
-
-    // Also add to the unweighted model.
-    example.weight = 1u;
-    AddExample(enhanced_unweighted_tree_controller_.get(), example);
-
-    // Threshold the target to 0 for "smooth", and 1 for "not smooth".
-    example.target_value =
-        TargetValue(example.target_value.value() > kSmoothnessThreshold);
-    AddExample(binary_tree_controller_.get(), example);
+    AddExample(enhanced_unweighted_tree_200_controller_.get(), example);
   }
 }
 
diff --git a/media/capabilities/learning_helper.h b/media/capabilities/learning_helper.h
index f4b03e1..7c1a0ef 100644
--- a/media/capabilities/learning_helper.h
+++ b/media/capabilities/learning_helper.h
@@ -45,16 +45,14 @@
   std::unique_ptr<learning::LearningSessionImpl> learning_session_;
 
   // Controllers for each task.
-  std::unique_ptr<learning::LearningTaskController> base_table_controller_;
-  std::unique_ptr<learning::LearningTaskController> base_tree_controller_;
   std::unique_ptr<learning::LearningTaskController>
       base_unweighted_table_controller_;
   std::unique_ptr<learning::LearningTaskController>
       base_unweighted_tree_controller_;
-  std::unique_ptr<learning::LearningTaskController> enhanced_tree_controller_;
   std::unique_ptr<learning::LearningTaskController>
-      enhanced_unweighted_tree_controller_;
-  std::unique_ptr<learning::LearningTaskController> binary_tree_controller_;
+      base_unweighted_tree_200_controller_;
+  std::unique_ptr<learning::LearningTaskController>
+      enhanced_unweighted_tree_200_controller_;
 };
 
 }  // namespace media
diff --git a/media/learning/common/learning_task.h b/media/learning/common/learning_task.h
index 4980919..258dd39 100644
--- a/media/learning/common/learning_task.h
+++ b/media/learning/common/learning_task.h
@@ -178,7 +178,13 @@
   // into different confusion matrices in the same histogram, evenly spaced
   // from 0 to |max_reporting_weight|, with one additional bucket for everything
   // larger than that.  The number of buckets is |num_reporting_weight_buckets|.
-  double max_reporting_weight = 99.;
+  // The default value of 0 is special; it means that we should split up the
+  // buckets such that the last bucket means "entirely full training set", while
+  // the remainder are evenly spaced.  This is the same as setting it to
+  // |max_data_set_size - 1|.  Of course, |max_data_set_size| is a number of
+  // examples, not a weight, so this only makes any sense at all if all of the
+  // examples have the default weight of 1.
+  double max_reporting_weight = 0.;
 
   // Number of buckets that we'll use to split out the confusion matrix by
   // training weight.  The last one is reserved for "all", while the others are
diff --git a/media/learning/impl/distribution_reporter.cc b/media/learning/impl/distribution_reporter.cc
index 6627644..3f110be 100644
--- a/media/learning/impl/distribution_reporter.cc
+++ b/media/learning/impl/distribution_reporter.cc
@@ -146,6 +146,12 @@
       const int n_buckets = task().num_reporting_weight_buckets;
       DCHECK_LE(n_buckets, max_buckets);
 
+      // If the max reporting weight is zero, then default to splitting the
+      // buckets evenly, with the last bucket being "completely full set".
+      const int max_reporting_weight = task().max_reporting_weight
+                                           ? task().max_reporting_weight
+                                           : task().max_data_set_size - 1;
+
       // We use one fewer buckets, to save one for the overflow.  Buckets are
       // numbered from 0 to |n_buckets-1|, inclusive.  In other words, when the
       // training weight is equal to |max_reporting_weight|, we still want to
@@ -153,7 +159,7 @@
       // we divide; only things over the max go into the last bucket.
       uma_bucket_number =
           std::min<int>((n_buckets - 1) * info.total_training_weight /
-                            (task().max_reporting_weight + 1),
+                            (max_reporting_weight + 1),
                         n_buckets - 1);
 
       std::string base(kByTrainingWeightBase);
diff --git a/tools/metrics/histograms/histograms.xml b/tools/metrics/histograms/histograms.xml
index 9154bb4..c424de0 100644
--- a/tools/metrics/histograms/histograms.xml
+++ b/tools/metrics/histograms/histograms.xml
@@ -150426,17 +150426,36 @@
 
 <histogram_suffixes name="Media.Learning.BinaryThreshold" separator=".">
   <suffix name="BaseTable" label="Basic features, lookup table model"/>
-  <suffix name="BaseTree" label="Basic features, ExtraTrees model"/>
+  <suffix name="BaseTree" label="Basic features, ExtraTrees model">
+    <obsolete>
+      Removed as of 5/15/2019.
+    </obsolete>
+  </suffix>
   <suffix name="BaseUnweightedTable"
       label="Basic features, unweighted lookup table model"/>
   <suffix name="BaseUnweightedTree"
       label="Basic features, unweighted ExtraTrees model"/>
+  <suffix name="BaseUnweightedTree200"
+      label="Basic features, unweighted ExtraTrees model, 200 examples"/>
   <suffix name="BinarySmoothnessTree"
-      label="Basic+extra features, unweighted pre-thresholded ExtraTrees
-             model"/>
-  <suffix name="EnhancedTree" label="Basic+extra features, ExtraTrees model"/>
+      label="Basic+extra features, unweighted pre-thresholded ExtraTrees model">
+    <obsolete>
+      Removed as of 5/15/2019.
+    </obsolete>
+  </suffix>
+  <suffix name="EnhancedTree" label="Basic+extra features, ExtraTrees model">
+    <obsolete>
+      Removed as of 5/15/2019.
+    </obsolete>
+  </suffix>
   <suffix name="EnhancedUnweightedTree"
-      label="Basic+extra features, unweighted ExtraTrees model"/>
+      label="Basic+extra features, unweighted ExtraTrees model">
+    <obsolete>
+      Removed as of 5/15/2019.
+    </obsolete>
+  </suffix>
+  <suffix name="EnhancedUnweightedTree200"
+      label="Basic+extra features, unweighted ExtraTrees model, 200 examples"/>
   <affected-histogram name="Media.Learning.BinaryThreshold.Aggregate"/>
   <affected-histogram name="Media.Learning.BinaryThreshold.ByFeature"/>
   <affected-histogram name="Media.Learning.BinaryThreshold.ByTrainingWeight"/>