Support input reduction.

When enabled (by default), the fuzzing engine would replace a corpus input with a mutant if the size is smaller and the original features are covered.

It is supposed to improve the mutation effectiveness.

PiperOrigin-RevId: 889242168
diff --git a/centipede/centipede.cc b/centipede/centipede.cc
index f204cce..2dea1ad 100644
--- a/centipede/centipede.cc
+++ b/centipede/centipede.cc
@@ -474,31 +474,56 @@
     bool input_gained_new_coverage = fs_.PruneFeaturesAndCountUnseen(fv) != 0;
     if (env_.use_pcpair_features && AddPcPairFeatures(fv) != 0)
       input_gained_new_coverage = true;
+    bool canonicalized = false;
     if (unconditional_features_file != nullptr) {
+      fs_.CanonicalizeFeatures(fv);
+      canonicalized = true;
       FUZZTEST_CHECK_OK(unconditional_features_file->Write(
           PackFeaturesAndHash(inputs[i], fv)));
     }
+
+    const auto& result = batch_result.results()[i];
+    const bool may_reduce =
+        mutants[i].origin != Mutant::kOriginNone && env_.reduce_inputs &&
+        // Consider mutator-provided signals for smaller-ness.
+        inputs[i].size() < corpus_.Records()[mutants[i].origin].data.size();
+
+    if (!input_gained_new_coverage && !may_reduce) continue;
+    // TODO: [impl] add stats for filtered-out inputs.
+    if (!InputPassesFilter(inputs[i])) continue;
+
+    // Need `fv` to be canonicalized at this point.
+    if (!canonicalized) {
+      fs_.CanonicalizeFeatures(fv);
+    }
+
+    bool write_to_files = false;
     if (input_gained_new_coverage) {
-      // TODO(kcc): [impl] add stats for filtered-out inputs.
-      if (!InputPassesFilter(inputs[i])) continue;
       fs_.MergeFeatures(fv);
       LogFeaturesAsSymbols(fv);
       batch_gained_new_coverage = true;
       FUZZTEST_CHECK_GT(fv.size(), 0UL);
       if (function_filter_passed) {
-        corpus_.Add(inputs[i], fv, batch_result.results()[i].metadata(),
-                    batch_result.results()[i].stats(), fs_, coverage_frontier_);
+        corpus_.Add(inputs[i], fv, result.metadata(), result.stats(), fs_,
+                    coverage_frontier_);
       }
+      write_to_files = true;
+    } else if (may_reduce &&
+               corpus_.TryReduceInput(mutants[i].origin, inputs[i], fv,
+                                      result.metadata(), result.stats())) {
+      write_to_files = true;
+    }
+    if (write_to_files) {
       if (corpus_file != nullptr) {
         FUZZTEST_CHECK_OK(corpus_file->Write(inputs[i]));
       }
-      if (!env_.corpus_dir.empty() && !env_.corpus_dir[0].empty()) {
-        WriteToLocalHashedFileInDir(env_.corpus_dir[0], inputs[i]);
-      }
       if (features_file != nullptr) {
         FUZZTEST_CHECK_OK(
             features_file->Write(PackFeaturesAndHash(inputs[i], fv)));
       }
+      if (!env_.corpus_dir.empty() && !env_.corpus_dir[0].empty()) {
+        WriteToLocalHashedFileInDir(env_.corpus_dir[0], inputs[i]);
+      }
     }
   }
   num_runs_ += batch_result.num_outputs_read();
diff --git a/centipede/centipede_flags.inc b/centipede/centipede_flags.inc
index 650bd7c..ad7b1f4 100644
--- a/centipede/centipede_flags.inc
+++ b/centipede/centipede_flags.inc
@@ -199,6 +199,9 @@
     bool, exec_time_weight_scaling, true,
     "If true, scale the corpus weight by the execution time of each input.")
 CENTIPEDE_FLAG(
+    bool, reduce_inputs, true,
+    "If true, attempt to reduce input size of the corpus during fuzzing.")
+CENTIPEDE_FLAG(
     bool, use_coverage_frontier, false,
     "If true, use coverage frontier when choosing the corpus element to "
     "mutate. This flag is mostly for Centipede developers.")
diff --git a/centipede/centipede_test.cc b/centipede/centipede_test.cc
index d5497f3..0dd9356 100644
--- a/centipede/centipede_test.cc
+++ b/centipede/centipede_test.cc
@@ -54,7 +54,9 @@
 
 using ::testing::AllOf;
 using ::testing::Contains;
+using ::testing::ContainsRegex;
 using ::testing::Each;
+using ::testing::ExitedWithCode;
 using ::testing::HasSubstr;
 using ::testing::IsEmpty;
 using ::testing::IsSupersetOf;
@@ -1170,6 +1172,77 @@
   EXPECT_GE(mock.execute_count(), 2);
 }
 
+// A mock for CentipedeCallbacks to test input reduction.
+class CentipedeMockForInputReduction : public CentipedeCallbacks {
+ public:
+  CentipedeMockForInputReduction(const Environment& env)
+      : CentipedeCallbacks(env) {}
+  // Doesn't execute anything
+  // Sets `batch_result.results()` based on the first 4 bits of `inputs`:
+  bool Execute(std::string_view binary, absl::Span<const ByteSpan> inputs,
+               BatchResult& batch_result) override {
+    batch_result.results().clear();
+    for (auto& input : inputs) {
+      FeatureVec features;
+      FUZZTEST_CHECK(!input.empty());
+      features.push_back(feature_domains::kPCs.ConvertToMe(input[0] % 16));
+      batch_result.results().emplace_back(ExecutionResult{features});
+    }
+    return true;
+  }
+
+  // Use unminimized inputs as seeds.
+  size_t GetSeeds(size_t num_seeds, std::vector<ByteArray>& seeds) override {
+    seeds.resize(num_seeds);
+    for (size_t i = 0; i < num_seeds; ++i) {
+      seeds[i] = {static_cast<uint8_t>(i), static_cast<uint8_t>(i)};
+    }
+    return num_seeds;
+  }
+};
+
+TEST(Centipede, DoesNotReduceInputWhenTheOptionIsUnset) {
+  TempCorpusDir tmp_dir{test_info_->name()};
+  Environment env;
+  env.workdir = tmp_dir.path();
+  env.num_runs = 1000000;  // Should be enough
+  env.batch_size = 7;      // Just some small number.
+  env.require_pc_table = false;
+  env.reduce_inputs = false;
+  CentipedeMockForInputReduction mock(env);
+  NonOwningCallbacksFactory factory(mock);
+  // Match the corpus size logging.
+  EXPECT_EXIT(
+      [&] {
+        CentipedeMain(env, factory);
+        std::exit(0);
+      }(),
+      ExitedWithCode(0),
+      ContainsRegex("end-fuzz: ft: 16 cov: 16 corp: 16/16 max/avg: "
+                    "([2-9]|1[0-9])[0-9]*/"));
+}
+
+TEST(Centipede, ReducesInputWhenTheOptionIsSet) {
+  TempCorpusDir tmp_dir{test_info_->name()};
+  Environment env;
+  env.workdir = tmp_dir.path();
+  env.num_runs = 1000000;  // Should be enough
+  env.batch_size = 7;      // Just some small number.
+  env.require_pc_table = false;
+  env.reduce_inputs = true;
+  CentipedeMockForInputReduction mock(env);
+  NonOwningCallbacksFactory factory(mock);
+  // Match the corpus size logging at the end to see the max size of corpus
+  // inputs is 1.
+  EXPECT_EXIT(
+      [&] {
+        CentipedeMain(env, factory);
+        std::exit(0);
+      }(),
+      ExitedWithCode(0),
+      HasSubstr("end-fuzz: ft: 16 cov: 16 corp: 16/16 max/avg: 1/1"));
+}
+
 TEST_F(CentipedeWithTemporaryLocalDir, UsesProvidedCustomMutator) {
   Environment env;
   env.binary = GetDataDependencyFilepath(
diff --git a/centipede/corpus.cc b/centipede/corpus.cc
index ba1df9b..5c194b5 100644
--- a/centipede/corpus.cc
+++ b/centipede/corpus.cc
@@ -242,6 +242,20 @@
   weighted_distribution_.AddWeight(0);
 }
 
+bool Corpus::TryReduceInput(size_t index, ByteSpan data, const FeatureVec& fv,
+                            const ExecutionMetadata& metadata,
+                            const ExecutionResult::Stats& stats) {
+  if (!IsFeatureVecSubsumed(records_[index].features, fv)) {
+    return false;
+  }
+  // Replace the record except the features to keep reducing for the original
+  // features.
+  records_[index].data = {data.begin(), data.end()};
+  records_[index].metadata = metadata;
+  records_[index].stats = stats;
+  return true;
+}
+
 size_t Corpus::WeightedRandom(absl::BitGenRef rng) const {
   return weighted_distribution_.RandomIndex(rng);
 }
diff --git a/centipede/corpus.h b/centipede/corpus.h
index 4c16ad3..36b843b 100644
--- a/centipede/corpus.h
+++ b/centipede/corpus.h
@@ -120,7 +120,8 @@
 
   // Adds a corpus element, consisting of 'data' (the input bytes, non-empty),
   // 'fv' (the features associated with this input), and execution `metadata`.
-  // `fs` is used to compute weights of `fv`.
+  // `fs` is used to compute weights of `fv`. Requires that `fv` is
+  // canonicalized using `FeatureSet::CanonicalizeFeatures`.
   void Add(ByteSpan data, const FeatureVec& fv,
            const ExecutionMetadata& metadata,
            const ExecutionResult::Stats& stats, const FeatureSet& fs,
@@ -131,6 +132,13 @@
   // Returns the number of removed elements.
   size_t Prune(const FeatureSet &fs, const CoverageFrontier &coverage_frontier,
                size_t max_corpus_size, Rng &rng);
+  // Checks if the input at `index` can be reduced using `data` with `fv`,
+  // `metadata`, and `stats`. Requires that `fv` is canonicalized using
+  // `FeatureSet::CanonicalizeFeatures`. Returns whether the input is reduced or
+  // not.
+  bool TryReduceInput(size_t index, ByteSpan data, const FeatureVec& fv,
+                      const ExecutionMetadata& metadata,
+                      const ExecutionResult::Stats& stats);
   // Updates the corpus weights according to `fs` and `coverage_frontier` using
   // the weight `method`. If `scale_by_exec_time` is set, scales the weights by
   // the corpus execution time relative to the average.
diff --git a/centipede/feature_set.cc b/centipede/feature_set.cc
index 86b741d..0df41a3 100644
--- a/centipede/feature_set.cc
+++ b/centipede/feature_set.cc
@@ -14,6 +14,7 @@
 
 #include "./centipede/feature_set.h"
 
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <ostream>
@@ -32,6 +33,27 @@
 //                                FeatureSet
 //------------------------------------------------------------------------------
 
+void FeatureSet::CanonicalizeFeatures(FeatureVec& features) {
+  std::sort(features.begin(), features.end());
+  size_t cursor = 0;
+  for (auto& feature : features) {
+    if (cursor > 0) {
+      auto& last_canonical = features[cursor - 1];
+      if (last_canonical == feature) continue;
+      if (feature_domains::IsComparisonScoreFeature(last_canonical) &&
+          feature_domains::IsComparisonScoreFeature(feature) &&
+          feature_domains::CMPScoreFeatureIndex(last_canonical) ==
+              feature_domains::CMPScoreFeatureIndex(feature)) {
+        std::swap(last_canonical, feature);
+        continue;
+      }
+    }
+    std::swap(features[cursor], feature);
+    cursor++;
+  }
+  features.resize(cursor);
+}
+
 // This implementation is slow (needs to iterate over the entire domain),
 // but there is no need for it to be fast.
 PCIndexVec FeatureSet::ToCoveragePCs() const {
@@ -171,6 +193,23 @@
   return os.str();
 }
 
+bool IsFeatureVecSubsumed(const FeatureVec& a, const FeatureVec& b) {
+  size_t cursor = 0;
+  for (const auto a_feature : a) {
+    while (cursor < b.size() && b[cursor] < a_feature) ++cursor;
+    if (cursor == b.size()) return false;
+    if (b[cursor] == a_feature) continue;
+    if (feature_domains::IsComparisonScoreFeature(b[cursor]) &&
+        feature_domains::IsComparisonScoreFeature(a_feature) &&
+        feature_domains::CMPScoreFeatureIndex(b[cursor]) ==
+            feature_domains::CMPScoreFeatureIndex(a_feature)) {
+      continue;
+    }
+    return false;
+  }
+  return true;
+}
+
 std::ostream &operator<<(std::ostream &out, const FeatureSet &fs) {
   auto LogIfNotZero = [&out](size_t value, std::string_view name) {
     if (!value) return;
diff --git a/centipede/feature_set.h b/centipede/feature_set.h
index 484d7e0..3856a49 100644
--- a/centipede/feature_set.h
+++ b/centipede/feature_set.h
@@ -42,6 +42,9 @@
       : frequency_threshold_(frequency_threshold),
         should_discard_domain_(should_discard_domain) {}
 
+  // Processes `features` to be sorted in order without redundancy.
+  void CanonicalizeFeatures(FeatureVec& features);
+
   // Returns true if there are features in `features` not present in `this`.
   bool HasUnseenFeatures(const FeatureVec &features) const;
 
@@ -152,6 +155,13 @@
   FeatureDomainSet should_discard_domain_;
 };
 
+// Returns true if every feature in `a` is subsumed by some feature in `b`,
+// false otherwise.
+//
+// Time complexity is O(a.size() + b.size()). Requires `a` and `b` to be
+// processed with `FeatureSet::CanonicalizeFeatures`.
+bool IsFeatureVecSubsumed(const FeatureVec& a, const FeatureVec& b);
+
 // Stream out description and count of features in feature set.
 std::ostream &operator<<(std::ostream &out, const FeatureSet &fs);
 
diff --git a/centipede/feature_set_test.cc b/centipede/feature_set_test.cc
index 715c8ee..38438a4 100644
--- a/centipede/feature_set_test.cc
+++ b/centipede/feature_set_test.cc
@@ -17,12 +17,37 @@
 #include <bitset>
 #include <cstddef>
 
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include "./centipede/feature.h"
 
 namespace fuzztest::internal {
 namespace {
 
+using ::testing::ElementsAre;
+
+TEST(FeatureSet, CanonicalizesFeatures) {
+  FeatureSet fs(10, {});
+  FeatureVec features{feature_domains::kPCs.ConvertToMe(3),
+                      feature_domains::kPCs.ConvertToMe(2),
+                      feature_domains::kPCs.ConvertToMe(3),
+                      feature_domains::kPCs.ConvertToMe(1),
+                      feature_domains::kPCs.ConvertToMe(2)};
+  fs.CanonicalizeFeatures(features);
+  // Should keep unique features.
+  EXPECT_THAT(features, ElementsAre(feature_domains::kPCs.ConvertToMe(1),
+                                    feature_domains::kPCs.ConvertToMe(2),
+                                    feature_domains::kPCs.ConvertToMe(3)));
+
+  FeatureVec cmp_features{feature_domains::kCMPModDiff.ConvertToMe(1),
+                          feature_domains::kCMPModDiff.ConvertToMe(2),
+                          feature_domains::kCMPModDiff.ConvertToMe(0)};
+  fs.CanonicalizeFeatures(cmp_features);
+  // Should keep only the highest score of a CMP feature index.
+  EXPECT_THAT(cmp_features,
+              ElementsAre(feature_domains::kCMPModDiff.ConvertToMe(2)));
+}
+
 TEST(FeatureSet, ComputeWeight) {
   FeatureSet feature_set(10, {});
 
@@ -260,5 +285,37 @@
   EXPECT_EQ(fs.Frequency(cmp_highest_score), 1);
 }
 
+TEST(IsFeatureVecSubsumed, ReturnsTrueForSubsumedFeatureVec) {
+  FeatureSet fs(/*frequency_threshold=*/123, /*should_discard_domain=*/false);
+  FeatureVec a{feature_domains::kPCs.ConvertToMe(2),
+               feature_domains::kCMPModDiff.ConvertToMe(0)};
+  FeatureVec b{feature_domains::kPCs.ConvertToMe(2),
+               feature_domains::kCMPModDiff.ConvertToMe(1)};
+  fs.CanonicalizeFeatures(a);
+  fs.CanonicalizeFeatures(b);
+  EXPECT_TRUE(IsFeatureVecSubsumed(a, b));
+}
+
+TEST(IsFeatureVecSubsumed, ReturnsFalseForMissingFeature) {
+  FeatureSet fs(/*frequency_threshold=*/123, /*should_discard_domain=*/false);
+  FeatureVec a{feature_domains::kPCs.ConvertToMe(2),
+               feature_domains::kCMPModDiff.ConvertToMe(0)};
+  FeatureVec b{feature_domains::kPCs.ConvertToMe(2)};
+  fs.CanonicalizeFeatures(a);
+  fs.CanonicalizeFeatures(b);
+  EXPECT_FALSE(IsFeatureVecSubsumed(a, b));
+}
+
+TEST(IsFeatureVecSubsumed, ReturnsFalseForLowerCmpScore) {
+  FeatureSet fs(/*frequency_threshold=*/123, /*should_discard_domain=*/false);
+  FeatureVec a{feature_domains::kPCs.ConvertToMe(2),
+               feature_domains::kCMPModDiff.ConvertToMe(1)};
+  FeatureVec b{feature_domains::kPCs.ConvertToMe(2),
+               feature_domains::kCMPModDiff.ConvertToMe(0)};
+  fs.CanonicalizeFeatures(a);
+  fs.CanonicalizeFeatures(b);
+  EXPECT_FALSE(IsFeatureVecSubsumed(a, b));
+}
+
 }  // namespace
 }  // namespace fuzztest::internal