blob: ed5f4f76ac3eecb6d6792071c8c05cca0ed459bc [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <initializer_list>
#include <string>
#include <vector>
#include "base/component_export.h"
#include "base/optional.h"
#include "media/learning/common/value.h"
namespace media {
namespace learning {
// Description of a learning task. This includes both the description of the
// inputs (features) and output (target value), plus a choice of the model and
// parameters for learning.
// TODO(liberato): Consider separating the task from the choice of model.
// TODO(liberato): should this be in impl? Probably not if we want to allow
// registering tasks.
// Not all models support all feature / target descriptions. For example,
// NaiveBayes requires kUnordered features. Similarly, LogLinear wouldn't
// support kUnordered features or targets. kRandomForest might support more
// combinations of orderings and types.
enum class Model {
// For the fuzzer.
kMaxValue = kLookupTable
enum class Ordering {
// Values are not ordered; nearby values might have wildly different
// meanings. For example, two ints that are computed by taking the hash
// of a string are unordered; it's categorical data. Values of type DOUBLE
// should almost certainly not be kUnordered; discretize them in some way
// if you really want to make discrete, unordered buckets out of them.
// Values may be interpreted as being in numeric order. For example, two
// ints that represent the number of elapsed milliseconds are numerically
// ordered in a meaningful way.
// For the fuzzer.
kMaxValue = kNumeric
enum class PrivacyMode {
// Value represents private information, such as a URL that was visited by
// the user.
// Value does not represent private information, such as video width.
// For the fuzzer.
kMaxValue = kPublic
// Description of how a Value should be interpreted.
struct ValueDescription {
// Name of this value, such as "source_url" or "width".
std::string name;
// Is this value nominal or not?
Ordering ordering = Ordering::kUnordered;
// Should this value be treated as being private?
PrivacyMode privacy_mode = PrivacyMode::kPublic;
LearningTask(const std::string& name,
Model model,
std::initializer_list<ValueDescription> feature_init_list,
ValueDescription target_description);
LearningTask(const LearningTask&);
// Unique name for this learner.
std::string name;
Model model = Model::kExtraTrees;
std::vector<ValueDescription> feature_descriptions;
// Note that kUnordered targets indicate classification, while kNumeric
// targets indicate regression.
ValueDescription target_description;
// TODO(liberato): add training parameters, like smoothing constants. It's
// okay if some of these are model-specific.
// TODO(liberato): switch to base::DictionaryValue?
// Maximum data set size until we start replacing examples.
size_t max_data_set_size = 100u;
// Fraction of examples that must be new before the task controller will train
// a new model. Note that this is a fraction of the number of examples that
// we currently have, which might be less than |max_data_set_size|.
double min_new_data_fraction = 0.1;
// If provided, then we'll randomly select a |*feature_subset_size|-sized set
// of features to train the model with, to allow for feature importance
// measurement. Note that UMA reporting only supports subsets of size one, or
// the whole set.
base::Optional<int> feature_subset_size;
// RandomForest parameters
// Number of trees in the random forest.
size_t rf_number_of_trees = 100;
// Should ExtraTrees apply one-hot conversion automatically? RandomTree has
// been modified to support nominals directly, though it isn't exactly the
// same as one-hot conversion. It is, however, much faster.
bool use_one_hot_conversion = false;
// Reporting parameters
// This is a hack for the initial media capabilities investigation. It
// represents the threshold that we'll use to decide if a prediction would be
// T / F. We should not do this -- instead we should report the distribution
// average for the prediction and the observation via UKM.
// In particular, if the percentage of dropped frames is greater than this,
// then report "false" (not smooth), else we report true.
// A better, non-hacky approach would be to report the predictions and
// observations directly, and do offline analysis with whatever threshold we
// like. This would remove the thresholding requirement, and also permit
// additional types of analysis for general regression tasks, such as measuring
// the prediction error directly.
// The UKM reporter will support this.
double smoothness_threshold = 0.1;
// If set, then we'll record a confusion matrix (hackily, see
// |smoothness_threshold|, above, for what that means) to UMA for all
// predictions. Add this task's name to histograms.xml, in the histogram
// suffixes for "Media.Learning.BinaryThreshold.Aggregate". The threshold is
// chosen by |smoothness_threshold|.
// This option is ignored if feature subset selection is in use.
bool uma_hacky_aggregate_confusion_matrix = false;
// If set, then we'll record a histogram of many confusion matrices, split out
// by the total training data weight that was used to construct the model. Be
// sure to add this task's name to histograms.xml, in the histogram suffixes
// for "Media.Learning.BinaryThreshold.ByTrainingWeight". The threshold is
// chosen by |smoothness_threshold|.
// This option is ignored if feature subset selection is in use.
bool uma_hacky_by_training_weight_confusion_matrix = false;
// If set, then we'll record a histogram of many confusion matrices, split out
// by the (single) selected feature subset. This does nothing if we're not
// using feature subsets, or if the subset size isn't one. Be sure to add
// this task's name to histograms.xml, in the histogram suffixes for
// "Media.Learning.BinaryThreshold.ByFeature" too.
bool uma_hacky_by_feature_subset_confusion_matrix = false;
// Maximum training weight for UMA reporting. We'll report results offset
// into different confusion matrices in the same histogram, evenly spaced
// from 0 to |max_reporting_weight|, with one additional bucket for everything
// larger than that. The number of buckets is |num_reporting_weight_buckets|.
double max_reporting_weight = 99.;
// Number of buckets that we'll use to split out the confusion matrix by
// training weight. The last one is reserved for "all", while the others are
// split evenly from 0 to |max_reporting_weight|, inclusive. One can select
// up to 15 buckets. We use 11 by default, so it breaks up the default weight
// into buckets of size 10.
// In other words, the defaults will make these buckets:
// [0-9] [10-19] ... [90-99] [100 and up]. This makes sense if the training
// set maximum size is the default of 100, and each example has a weight of 1.
int num_reporting_weight_buckets = 11;
} // namespace learning
} // namespace media