blob: c45b757716ecb1069c62dc2ce73825de482a8017 [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "media/learning/impl/random_tree_trainer.h"
#include <math.h>
#include "base/bind.h"
#include "base/logging.h"
#include "base/optional.h"
#include "base/threading/sequenced_task_runner_handle.h"
namespace media {
namespace learning {
RandomTreeTrainer::Split::Split() = default;
RandomTreeTrainer::Split::Split(int index) : split_index(index) {}
RandomTreeTrainer::Split::Split(Split&& rhs) = default;
RandomTreeTrainer::Split::~Split() = default;
RandomTreeTrainer::Split& RandomTreeTrainer::Split::operator=(Split&& rhs) =
default;
RandomTreeTrainer::Split::BranchInfo::BranchInfo() = default;
RandomTreeTrainer::Split::BranchInfo::BranchInfo(BranchInfo&& rhs) = default;
RandomTreeTrainer::Split::BranchInfo::~BranchInfo() = default;
struct InteriorNode : public Model {
InteriorNode(const LearningTask& task,
int split_index,
FeatureValue split_point)
: split_index_(split_index),
ordering_(task.feature_descriptions[split_index].ordering),
split_point_(split_point) {}
// Model
TargetHistogram PredictDistribution(const FeatureVector& features) override {
// Figure out what feature value we should use for the split.
FeatureValue f;
switch (ordering_) {
case LearningTask::Ordering::kUnordered:
// Use 0 for "!=" and 1 for "==".
f = FeatureValue(features[split_index_] == split_point_);
break;
case LearningTask::Ordering::kNumeric:
// Use 0 for "<=" and 1 for ">".
f = FeatureValue(features[split_index_] > split_point_);
break;
}
auto iter = children_.find(f);
// If we've never seen this feature value, then return nothing.
if (iter == children_.end())
return TargetHistogram();
return iter->second->PredictDistribution(features);
}
TargetHistogram PredictDistributionWithMissingValues(
const FeatureVector& features) {
TargetHistogram total;
for (auto& child_pair : children_) {
TargetHistogram predicted =
child_pair.second->PredictDistribution(features);
// TODO(liberato): Normalize? Weight?
total += predicted;
}
return total;
}
// Add |child| has the node for feature value |v|.
void AddChild(FeatureValue v, std::unique_ptr<Model> child) {
DCHECK_EQ(children_.count(v), 0u);
children_.emplace(v, std::move(child));
}
private:
// Feature value that we split on.
int split_index_ = -1;
base::flat_map<FeatureValue, std::unique_ptr<Model>> children_;
// How is our feature value ordered?
LearningTask::Ordering ordering_;
// For kNumeric features, this is the split point.
FeatureValue split_point_;
};
struct LeafNode : public Model {
LeafNode(const TrainingData& training_data,
const std::vector<size_t> training_idx,
LearningTask::Ordering ordering) {
for (size_t idx : training_idx)
distribution_ += training_data[idx];
// Each leaf gets one vote.
// See https://en.wikipedia.org/wiki/Bootstrap_aggregating . TL;DR: the
// individual trees should average (regression) or vote (classification).
//
// TODO(liberato): It's unclear that a leaf should get to vote with an
// entire distribution; we might want to take the max for kUnordered here.
// If so, then we might also want to Average() for kNumeric targets, though
// in that case, the results would be the same anyway. That's not, of
// course, guaranteed for all methods of converting |distribution_| into a
// numeric prediction. In general, we should provide a single estimate.
distribution_.Normalize();
}
// TreeNode
TargetHistogram PredictDistribution(const FeatureVector&) override {
return distribution_;
}
private:
TargetHistogram distribution_;
};
RandomTreeTrainer::RandomTreeTrainer(RandomNumberGenerator* rng)
: HasRandomNumberGenerator(rng) {}
RandomTreeTrainer::~RandomTreeTrainer() {}
void RandomTreeTrainer::Train(const LearningTask& task,
const TrainingData& training_data,
TrainedModelCB model_cb) {
// Start with all the training data.
std::vector<size_t> training_idx;
training_idx.reserve(training_data.size());
for (size_t idx = 0; idx < training_data.size(); idx++)
training_idx.push_back(idx);
// It's a little odd that we don't post training. Perhaps we should.
auto model = Train(task, training_data, training_idx);
base::SequencedTaskRunnerHandle::Get()->PostTask(
FROM_HERE, base::BindOnce(std::move(model_cb), std::move(model)));
}
std::unique_ptr<Model> RandomTreeTrainer::Train(
const LearningTask& task,
const TrainingData& training_data,
const std::vector<size_t>& training_idx) {
if (training_data.empty()) {
return std::make_unique<LeafNode>(training_data, std::vector<size_t>(),
LearningTask::Ordering::kUnordered);
}
DCHECK_EQ(task.feature_descriptions.size(), training_data[0].features.size());
// Start with all features unused.
FeatureSet unused_set;
for (size_t idx = 0; idx < task.feature_descriptions.size(); idx++)
unused_set.insert(idx);
return Build(task, training_data, training_idx, unused_set);
}
std::unique_ptr<Model> RandomTreeTrainer::Build(
const LearningTask& task,
const TrainingData& training_data,
const std::vector<size_t>& training_idx,
const FeatureSet& unused_set) {
DCHECK_GT(training_idx.size(), 0u);
// TODO: enforce a minimum number of samples. ExtraTrees uses 2 for
// classification, and 5 for regression.
// Remove any constant attributes in |training_data| from |unused_set|. Also
// check if our training data has a constant target value. For both features
// and the target value, if the Optional has a value then it's the singular
// value that we've found so far. If we find a second one, then we'll clear
// the Optional.
base::Optional<TargetValue> target_value(
training_data[training_idx[0]].target_value);
std::vector<base::Optional<FeatureValue>> feature_values;
feature_values.resize(training_data[0].features.size());
for (size_t feature_idx : unused_set) {
feature_values[feature_idx] =
training_data[training_idx[0]].features[feature_idx];
}
for (size_t idx : training_idx) {
const LabelledExample& example = training_data[idx];
// Record this target value to see if there is more than one. We skip the
// insertion if we've already determined that it's not constant.
if (target_value && target_value != example.target_value)
target_value.reset();
// For all features in |unused_set|, see if it's a constant in our subset of
// the training data.
for (size_t feature_idx : unused_set) {
auto& value = feature_values[feature_idx];
if (value && *value != example.features[feature_idx])
value.reset();
}
}
// Is the output constant in |training_data|? If so, then generate a leaf.
// If we're not normalizing leaves, then this matters since this training data
// might be split across multiple leaves.
if (target_value) {
return std::make_unique<LeafNode>(training_data, training_idx,
task.target_description.ordering);
}
// Remove any constant features from the unused set, so that we don't try to
// split on them. It would work, but it would be trivially useless. We also
// don't want to use one of our potential splits on it.
FeatureSet new_unused_set = unused_set;
for (size_t feature_idx : unused_set) {
auto& value = feature_values[feature_idx];
if (value)
new_unused_set.erase(feature_idx);
}
// Select the feature subset to consider at this leaf.
// TODO(liberato): For nominals, with one-hot encoding, we'd give an equal
// chance to each feature's value. For example, if F1 has {A, B} and F2 has
// {C,D,E,F}, then we would pick uniformly over {A,B,C,D,E,F}. However, now
// we pick between {F1, F2} then pick between either {A,B} or {C,D,E,F}. We
// do this because it's simpler and doesn't seem to hurt anything.
FeatureSet feature_candidates = new_unused_set;
// TODO(liberato): Let our caller override this.
const size_t features_per_split =
std::max(static_cast<int>(sqrt(feature_candidates.size())), 3);
// Note that it's okay if there are fewer features left; we'll select all of
// them instead.
while (feature_candidates.size() > features_per_split) {
// Remove a random feature.
size_t which = rng()->Generate(feature_candidates.size());
auto iter = feature_candidates.begin();
for (; which; which--, iter++)
;
feature_candidates.erase(iter);
}
// TODO(liberato): Does it help if we refuse to split without an info gain?
Split best_potential_split;
// Find the best split among the candidates that we have.
for (int i : feature_candidates) {
Split potential_split =
ConstructSplit(task, training_data, training_idx, i);
if (potential_split.nats_remaining < best_potential_split.nats_remaining) {
best_potential_split = std::move(potential_split);
}
}
// Note that we can have a split with no index (i.e., no features left, or no
// feature was an improvement in nats), or with a single index (had features,
// but all had the same value). Either way, we should end up with a leaf.
if (best_potential_split.branch_infos.size() < 2) {
// Stop when there is no more tree.
return std::make_unique<LeafNode>(training_data, training_idx,
task.target_description.ordering);
}
// Build an interior node
std::unique_ptr<InteriorNode> node = std::make_unique<InteriorNode>(
task, best_potential_split.split_index, best_potential_split.split_point);
for (auto& branch_iter : best_potential_split.branch_infos) {
node->AddChild(branch_iter.first,
Build(task, training_data, branch_iter.second.training_idx,
new_unused_set));
}
return node;
}
RandomTreeTrainer::Split RandomTreeTrainer::ConstructSplit(
const LearningTask& task,
const TrainingData& training_data,
const std::vector<size_t>& training_idx,
int split_index) {
// We should not be given a training set of size 0, since there's no need to
// check an empty split.
DCHECK_GT(training_idx.size(), 0u);
Split split(split_index);
bool is_numeric = task.feature_descriptions[split_index].ordering ==
LearningTask::Ordering::kNumeric;
// TODO(liberato): Consider removing nominal feature support and RF. That
// would make this code somewhat simpler.
// For a numeric split, find the split point. Otherwise, we'll split on every
// nominal value that this feature has in |training_data|.
if (is_numeric) {
split.split_point =
FindSplitPoint_Numeric(split.split_index, training_data, training_idx);
} else {
split.split_point =
FindSplitPoint_Nominal(split.split_index, training_data, training_idx);
}
// Find the split's feature values and construct the training set for each.
// I think we want to iterate on the underlying vector, and look up the int in
// the training data directly. |total_weight| will hold the total weight of
// all examples that come into this node.
double total_weight = 0.;
for (size_t idx : training_idx) {
const LabelledExample& example = training_data[idx];
total_weight += example.weight;
// Get the value of the |index|-th feature for |example|.
FeatureValue v_i = example.features[split.split_index];
// Figure out what value this example would use for splitting. For nominal,
// it's 1 or 0, based on whether |v_i| is equal to the split or not. For
// numeric, it's whether |v_i| is <= the split point or not (0 for <=, and 1
// for >).
FeatureValue split_feature;
if (is_numeric)
split_feature = FeatureValue(v_i > split.split_point);
else
split_feature = FeatureValue(v_i == split.split_point);
// Add |v_i| to the right training set. Remember that emplace will do
// nothing if the key already exists.
auto result =
split.branch_infos.emplace(split_feature, Split::BranchInfo());
auto iter = result.first;
Split::BranchInfo& branch_info = iter->second;
branch_info.training_idx.push_back(idx);
branch_info.target_histogram += example;
}
// Figure out how good / bad this split is.
switch (task.target_description.ordering) {
case LearningTask::Ordering::kUnordered:
ComputeSplitScore_Nominal(&split, total_weight);
break;
case LearningTask::Ordering::kNumeric:
ComputeSplitScore_Numeric(&split, total_weight);
break;
}
return split;
}
void RandomTreeTrainer::ComputeSplitScore_Nominal(
Split* split,
double total_incoming_weight) {
// Compute the nats given that we're at this node.
split->nats_remaining = 0;
for (auto& info_iter : split->branch_infos) {
Split::BranchInfo& branch_info = info_iter.second;
// |weight_along_branch| is the total weight of examples that would follow
// this branch in the tree.
const double weight_along_branch =
branch_info.target_histogram.total_counts();
// |p_branch| is the probability of following this branch.
const double p_branch = weight_along_branch / total_incoming_weight;
for (auto& iter : branch_info.target_histogram) {
double p = iter.second / total_incoming_weight;
// p*log(p) is the expected nats if the answer is |iter|. We multiply
// that by the probability of being in this bucket at all.
split->nats_remaining -= (p * log(p)) * p_branch;
}
}
}
void RandomTreeTrainer::ComputeSplitScore_Numeric(
Split* split,
double total_incoming_weight) {
// Compute the nats given that we're at this node.
split->nats_remaining = 0;
for (auto& info_iter : split->branch_infos) {
Split::BranchInfo& branch_info = info_iter.second;
// |weight_along_branch| is the total weight of examples that would follow
// this branch in the tree.
const double weight_along_branch =
branch_info.target_histogram.total_counts();
// |p_branch| is the probability of following this branch.
const double p_branch = weight_along_branch / total_incoming_weight;
// Compute the average at this node. Note that we have no idea if the leaf
// node would actually use an average, but really it should match. It would
// be really nice if we could compute the value (or TargetHistogram) as
// part of computing the split, and have somebody just hand that target
// distribution to the leaf if it ends up as one.
double average = branch_info.target_histogram.Average();
for (auto& iter : branch_info.target_histogram) {
// Compute the squared error for all |iter.second| counts that each have a
// value of |iter.first|, when this leaf approximates them as |average|.
double sq_err = (iter.first.value() - average) *
(iter.first.value() - average) * iter.second;
split->nats_remaining += sq_err * p_branch;
}
}
}
FeatureValue RandomTreeTrainer::FindSplitPoint_Numeric(
size_t split_index,
const TrainingData& training_data,
const std::vector<size_t>& training_idx) {
// We should not be given a training set of size 0, since there's no need to
// check an empty split.
DCHECK_GT(training_idx.size(), 0u);
// We should either (a) choose the single best split point given all our
// training data (i.e., choosing between the splits that are equally between
// adjacent feature values), or (b) choose the best split point by drawing
// uniformly over the range that contains our feature values. (a) is
// appropriate with RandomForest, while (b) is appropriate with ExtraTrees.
FeatureValue v_min = training_data[training_idx[0]].features[split_index];
FeatureValue v_max = training_data[training_idx[0]].features[split_index];
for (size_t idx : training_idx) {
const LabelledExample& example = training_data[idx];
// Get the value of the |split_index|-th feature for
FeatureValue v_i = example.features[split_index];
if (v_i < v_min)
v_min = v_i;
if (v_i > v_max)
v_max = v_i;
}
FeatureValue v_split;
if (v_max == v_min) {
// Pick |v_split| to return a trivial split, so that this ends up as a
// leaf node anyway.
v_split = v_max;
} else {
// Choose a random split point. Note that we want to end up with two
// buckets, so we don't have a trivial split. By picking [v_min, v_max),
// |v_min| will always be in one bucket and |v_max| will always not be.
v_split = FeatureValue(
rng()->GenerateDouble(v_max.value() - v_min.value()) + v_min.value());
}
return v_split;
}
FeatureValue RandomTreeTrainer::FindSplitPoint_Nominal(
size_t split_index,
const TrainingData& training_data,
const std::vector<size_t>& training_idx) {
// We should not be given a training set of size 0, since there's no need to
// check an empty split.
DCHECK_GT(training_idx.size(), 0u);
// Construct a set of all values for |training_idx|. We don't care about
// their relative frequency, since one-hot encoding doesn't.
// For example, if a feature has 10 "yes" instances and 1 "no" instance, then
// there's a 50% chance for each to be chosen here. This is because one-hot
// encoding would do roughly the same thing: when choosing features, the
// "is_yes" and "is_no" features that come out of one-hot encoding would be
// equally likely to be chosen.
//
// Important but subtle note: we can't choose a value that's been chosen
// before for this feature, since that would be like splitting on the same
// one-hot feature more than once. Luckily, we won't be asked to do that. If
// we choose "Yes" at some level in the tree, then the "==" branch will have
// trivial features which will be removed from consideration early (we never
// consider features with only one value), and the != branch won't have any
// "Yes" values for us to pick at a lower level.
std::set<FeatureValue> values;
for (size_t idx : training_idx) {
const LabelledExample& example = training_data[idx];
values.insert(example.features[split_index]);
}
// Select one uniformly at random.
size_t which = rng()->Generate(values.size());
auto it = values.begin();
for (; which > 0; it++, which--)
;
return *it;
}
} // namespace learning
} // namespace media