Split crash metadata into description and signature.

Signature will be used for deduplication while the description is human-readable for reporting.

For runner, the signature will be the same as the description, which is compatible with the current behavior.

PiperOrigin-RevId: 769774505
diff --git a/centipede/centipede.cc b/centipede/centipede.cc
index 8510af8..84490e4 100644
--- a/centipede/centipede.cc
+++ b/centipede/centipede.cc
@@ -206,8 +206,9 @@
 absl::Status Centipede::CrashesToFiles(const Environment &env,
                                        std::string_view dir) {
   std::vector<std::string> reproducer_dirs;
+  const auto wd = WorkDir{env};
   auto reproducer_match_status = RemoteGlobMatch(
-      WorkDir{env}.CrashReproducerDirPaths().AllShardsGlob(), reproducer_dirs);
+      wd.CrashReproducerDirPaths().AllShardsGlob(), reproducer_dirs);
   if (!reproducer_match_status.ok() &&
       !absl::IsNotFound(reproducer_match_status)) {
     return reproducer_match_status;
@@ -225,27 +226,18 @@
       RETURN_IF_NOT_OK(RemoteFileCopy(
           reproducer_path,
           (std::filesystem::path{dir} / absl::StrCat(id, ".data")).string()));
-    }
-  }
-  std::vector<std::string> metadata_dirs;
-  auto metadata_match_status = RemoteGlobMatch(
-      WorkDir{env}.CrashMetadataDirPaths().AllShardsGlob(), metadata_dirs);
-  if (!metadata_match_status.ok() && !absl::IsNotFound(metadata_match_status)) {
-    return metadata_match_status;
-  }
-  for (const auto &metadata_dir : metadata_dirs) {
-    ASSIGN_OR_RETURN_IF_NOT_OK(
-        std::vector<std::string> metadata_paths,
-        RemoteListFiles(metadata_dir, /*recursively=*/false));
-    for (const auto &metadata_path : metadata_paths) {
-      std::string id = std::filesystem::path{metadata_path}.filename();
-      if (crash_ids.erase(id) == 0) {
-        continue;
-      }
+      const auto shard_index = wd.CrashReproducerDirPaths().GetShardIndex(
+          std::filesystem::path{reproducer_path}.parent_path().string());
+      CHECK(shard_index.has_value());
+      const auto metadata_dir = wd.CrashMetadataDirPaths().Shard(*shard_index);
+      const auto description_filename = absl::StrCat(id, ".desc");
+      const auto signature_filename = absl::StrCat(id, ".sig");
       RETURN_IF_NOT_OK(RemoteFileCopy(
-          metadata_path,
-          (std::filesystem::path{dir} / absl::StrCat(id, ".metadata"))
-              .string()));
+          (std::filesystem::path{metadata_dir} / description_filename).string(),
+          (std::filesystem::path{dir} / description_filename).string()));
+      RETURN_IF_NOT_OK(RemoteFileCopy(
+          (std::filesystem::path{metadata_dir} / signature_filename).string(),
+          (std::filesystem::path{dir} / signature_filename).string()));
     }
   }
   return absl::OkStatus();
@@ -888,6 +880,9 @@
               << "\nExit code            : " << batch_result.exit_code()
               << "\nFailure              : "
               << batch_result.failure_description()
+              << "\nSignature            : "
+              << AsPrintableString(AsByteSpan(batch_result.failure_signature()),
+                                   /*max_len=*/32)
               << "\nNumber of inputs     : " << input_vec.size()
               << "\nNumber of inputs read: " << batch_result.num_outputs_read()
               << (batch_result.IsSetupFailure()
@@ -976,7 +971,7 @@
       std::string input_file_path = std::filesystem::path(crash_dir) / hash;
       auto crash_metadata_dir = wd_.CrashMetadataDirPaths().MyShard();
       CHECK_OK(RemoteMkdir(crash_metadata_dir));
-      std::string crash_metadata_file_path =
+      std::string crash_metadata_path_prefix =
           std::filesystem::path(crash_metadata_dir) / hash;
       LOG(INFO) << log_prefix << "Detected crash-reproducing input:"
                 << "\nInput index    : " << input_idx << "\nInput bytes    : "
@@ -984,13 +979,20 @@
                 << "\nExit code      : " << one_input_batch_result.exit_code()
                 << "\nFailure        : "
                 << one_input_batch_result.failure_description()
+                << "\nSignature      : "
+                << AsPrintableString(
+                       AsByteSpan(one_input_batch_result.failure_signature()),
+                       /*max_len=*/32)
                 << "\nSaving input to: " << input_file_path
                 << "\nSaving crash"  //
-                << "\nmetadata to    : " << crash_metadata_file_path;
+                << "\nmetadata to    : " << crash_metadata_path_prefix << ".*";
       CHECK_OK(RemoteFileSetContents(input_file_path, one_input));
-      CHECK_OK(
-          RemoteFileSetContents(crash_metadata_file_path,
-                                one_input_batch_result.failure_description()));
+      CHECK_OK(RemoteFileSetContents(
+          absl::StrCat(crash_metadata_path_prefix, ".desc"),
+          one_input_batch_result.failure_description()));
+      CHECK_OK(RemoteFileSetContents(
+          absl::StrCat(crash_metadata_path_prefix, ".sig"),
+          one_input_batch_result.failure_signature()));
       return;
     }
   }
diff --git a/centipede/centipede_callbacks.cc b/centipede/centipede_callbacks.cc
index 819e800..90e189d 100644
--- a/centipede/centipede_callbacks.cc
+++ b/centipede/centipede_callbacks.cc
@@ -157,7 +157,7 @@
       absl::StrCat(":shmem:arg1=", inputs_blobseq_.path(),
                    ":arg2=", outputs_blobseq_.path(),
                    ":failure_description_path=", failure_description_path_,
-                   ":"),
+                   ":failure_signature_path=", failure_signature_path_, ":"),
       disable_coverage)};
 
   if (env_.clang_coverage_binary == binary)
@@ -243,9 +243,18 @@
     ReadFromLocalFile(execute_log_path_, batch_result.log());
     ReadFromLocalFile(failure_description_path_,
                       batch_result.failure_description());
-    // Remove failure_description_ here so that it doesn't stay until another
-    // failed execution.
+    if (std::filesystem::exists(failure_signature_path_)) {
+      ReadFromLocalFile(failure_signature_path_,
+                        batch_result.failure_signature());
+    } else {
+      // TODO(xinhaoyuan): Refactor runner to use dispatcher so this branch can
+      // be removed.
+      batch_result.failure_signature() = batch_result.failure_description();
+    }
+    // Remove the failure description and signature files here so that they do
+    // not stay until another failed execution.
     std::filesystem::remove(failure_description_path_);
+    std::filesystem::remove(failure_signature_path_);
   }
   VLOG(1) << __FUNCTION__ << " took " << (absl::Now() - start_time);
   return retval;
diff --git a/centipede/centipede_callbacks.h b/centipede/centipede_callbacks.h
index 792608d..57b9aab 100644
--- a/centipede/centipede_callbacks.h
+++ b/centipede/centipede_callbacks.h
@@ -174,6 +174,8 @@
       std::filesystem::path(temp_dir_).append("log");
   std::string failure_description_path_ =
       std::filesystem::path(temp_dir_).append("failure_description");
+  std::string failure_signature_path_ =
+      std::filesystem::path(temp_dir_).append("failure_signature");
   const std::string shmem_name1_ = ProcessAndThreadUniqueID("/ctpd-shm1-");
   const std::string shmem_name2_ = ProcessAndThreadUniqueID("/ctpd-shm2-");
 
diff --git a/centipede/centipede_flags.inc b/centipede/centipede_flags.inc
index 115ec38..091c8e7 100644
--- a/centipede/centipede_flags.inc
+++ b/centipede/centipede_flags.inc
@@ -293,11 +293,10 @@
 CENTIPEDE_FLAG(
     std::string, crashes_to_files, "",
     "When set to a directory path, save the crashing reproducers and "
-    "metadata from the workdir to the given path. Each crash with `ID`"
-    "will be saved with file `ID.data` for the reproducer and "
-    "`ID.metadata` the metadata, which currently contains the failure "
-    "description. If multiple crashes with the same ID exist, only one "
-    "crash will be saved.")
+    "metadata from the workdir to the given path: Each crash with `ID`"
+    "will be saved with file `ID.data` for the reproducer, `ID.desc` the "
+    "description, `ID.sig` the signature. If multiple crashes with the same ID "
+    "exist, only one crash will be saved.")
 CENTIPEDE_FLAG(
     std::string, corpus_from_files, "",
     "Export a corpus from a local directory with one file per input into "
diff --git a/centipede/centipede_interface.cc b/centipede/centipede_interface.cc
index 504ccf2..ca4c856 100644
--- a/centipede/centipede_interface.cc
+++ b/centipede/centipede_interface.cc
@@ -288,9 +288,9 @@
   return test_shard;
 }
 
-// Prunes non-reproducible and duplicate crashes and returns the crash metadata
-// of the remaining crashes.
-absl::flat_hash_set<std::string> PruneOldCrashesAndGetRemainingCrashMetadata(
+// Prunes non-reproducible and duplicate crashes and returns the crash
+// signatures of the remaining crashes.
+absl::flat_hash_set<std::string> PruneOldCrashesAndGetRemainingCrashSignatures(
     const std::filesystem::path &crashing_dir, const Environment &env,
     CentipedeCallbacksFactory &callbacks_factory) {
   const std::vector<std::string> crashing_input_files =
@@ -299,7 +299,7 @@
       ValueOrDie(RemoteListFiles(crashing_dir.c_str(), /*recursively=*/false));
   ScopedCentipedeCallbacks scoped_callbacks(callbacks_factory, env);
   BatchResult batch_result;
-  absl::flat_hash_set<std::string> remaining_crash_metadata;
+  absl::flat_hash_set<std::string> remaining_crash_signatures;
 
   for (const std::string &crashing_input_file : crashing_input_files) {
     ByteArray crashing_input;
@@ -308,7 +308,8 @@
         env.binary, {crashing_input}, batch_result);
     const bool is_duplicate =
         is_reproducible && !batch_result.IsSetupFailure() &&
-        !remaining_crash_metadata.insert(batch_result.failure_description())
+        !batch_result.failure_signature().empty() &&
+        !remaining_crash_signatures.insert(batch_result.failure_signature())
              .second;
     if (!is_reproducible || batch_result.IsSetupFailure() || is_duplicate) {
       CHECK_OK(RemotePathDelete(crashing_input_file, /*recursively=*/false));
@@ -316,13 +317,13 @@
       CHECK_OK(RemotePathTouchExistingFile(crashing_input_file));
     }
   }
-  return remaining_crash_metadata;
+  return remaining_crash_signatures;
 }
 
 // TODO(b/405382531): Add unit tests once the function is unit-testable.
 void DeduplicateAndStoreNewCrashes(
     const std::filesystem::path &crashing_dir, const WorkDir &workdir,
-    size_t total_shards, absl::flat_hash_set<std::string> crash_metadata) {
+    size_t total_shards, absl::flat_hash_set<std::string> crash_signatures) {
   for (size_t shard_idx = 0; shard_idx < total_shards; ++shard_idx) {
     const std::vector<std::string> new_crashing_input_files =
         // The crash reproducer directory may contain subdirectories with
@@ -338,19 +339,20 @@
     for (const std::string &crashing_input_file : new_crashing_input_files) {
       const std::string crashing_input_file_name =
           std::filesystem::path(crashing_input_file).filename();
-      const std::string crash_metadata_file =
-          crash_metadata_dir / crashing_input_file_name;
-      std::string new_crash_metadata;
+      const std::string crash_signature_path =
+          crash_metadata_dir / absl::StrCat(crashing_input_file_name, ".sig");
+      std::string new_crash_signature;
       const absl::Status status =
-          RemoteFileGetContents(crash_metadata_file, new_crash_metadata);
+          RemoteFileGetContents(crash_signature_path, new_crash_signature);
       if (!status.ok()) {
         LOG(WARNING) << "Ignoring crashing input " << crashing_input_file_name
-                     << " due to failure to read the crash metadata file: "
+                     << " due to failure to read the crash signature: "
                      << status;
         continue;
       }
       const bool is_duplicate =
-          !crash_metadata.insert(new_crash_metadata).second;
+          !new_crash_signature.empty() &&
+          !crash_signatures.insert(new_crash_signature).second;
       if (is_duplicate) continue;
       CHECK_OK(
           RemoteFileRename(crashing_input_file,
@@ -667,11 +669,11 @@
 
     // Deduplicate and update the crashing inputs.
     const std::filesystem::path crashing_dir = fuzztest_db_path / "crashing";
-    absl::flat_hash_set<std::string> crash_metadata =
-        PruneOldCrashesAndGetRemainingCrashMetadata(crashing_dir, env,
-                                                    callbacks_factory);
+    absl::flat_hash_set<std::string> crash_signatures =
+        PruneOldCrashesAndGetRemainingCrashSignatures(crashing_dir, env,
+                                                      callbacks_factory);
     DeduplicateAndStoreNewCrashes(crashing_dir, workdir, env.total_shards,
-                                  std::move(crash_metadata));
+                                  std::move(crash_signatures));
   }
 
   return EXIT_SUCCESS;
diff --git a/centipede/centipede_test.cc b/centipede/centipede_test.cc
index 3254b5e..c66e7ab 100644
--- a/centipede/centipede_test.cc
+++ b/centipede/centipede_test.cc
@@ -685,6 +685,7 @@
   std::string binary;
   unsigned char input = 0;
   std::string description;
+  std::string signature;
 };
 
 // A mock for ExtraBinaries test.
@@ -703,6 +704,7 @@
       for (const Crash &crash : crashes_) {
         if (binary == crash.binary && input[0] == crash.input) {
           batch_result.failure_description() = crash.description;
+          batch_result.failure_signature() = crash.signature;
           res = false;
         }
       }
@@ -766,9 +768,9 @@
   env.binary = "b1";
   env.extra_binaries = {"b2", "b3", "b4"};
   env.require_pc_table = false;  // No PC table here.
-  ExtraBinariesMock mock(
-      env, {Crash{"b1", 10, "b1-crash"}, Crash{"b2", 30, "b2-crash"},
-            Crash{"b3", 50, "b3-crash"}});
+  ExtraBinariesMock mock(env, {Crash{"b1", 10, "b1-crash", "b1-sig"},
+                               Crash{"b2", 30, "b2-crash", "b2-sig"},
+                               Crash{"b3", 50, "b3-crash", "b3-sig"}});
   MockFactory factory(mock);
   CentipedeMain(env, factory);
 
@@ -789,11 +791,15 @@
   auto crash_metadata_dir_path = WorkDir{env}.CrashMetadataDirPaths().MyShard();
   ASSERT_TRUE(std::filesystem::exists(crash_metadata_dir_path))
       << VV(crash_metadata_dir_path);
-  EXPECT_THAT(crash_metadata_dir_path,
-              HasFilesWithContents(testing::UnorderedElementsAre(
-                  FileAndContents{Hash({10}), "b1-crash"},
-                  FileAndContents{Hash({30}), "b2-crash"},
-                  FileAndContents{Hash({50}), "b3-crash"})));
+  EXPECT_THAT(
+      crash_metadata_dir_path,
+      HasFilesWithContents(testing::UnorderedElementsAre(
+          FileAndContents{absl::StrCat(Hash({10}), ".desc"), "b1-crash"},
+          FileAndContents{absl::StrCat(Hash({10}), ".sig"), "b1-sig"},
+          FileAndContents{absl::StrCat(Hash({30}), ".desc"), "b2-crash"},
+          FileAndContents{absl::StrCat(Hash({30}), ".sig"), "b2-sig"},
+          FileAndContents{absl::StrCat(Hash({50}), ".desc"), "b3-crash"},
+          FileAndContents{absl::StrCat(Hash({50}), ".sig"), "b3-sig"})));
 }
 
 // A mock for UndetectedCrashingInput test.
diff --git a/centipede/runner_result.h b/centipede/runner_result.h
index fd4e9f7..17ecd57 100644
--- a/centipede/runner_result.h
+++ b/centipede/runner_result.h
@@ -162,6 +162,8 @@
   const std::string& failure_description() const {
     return failure_description_;
   }
+  std::string& failure_signature() { return failure_signature_; }
+  const std::string& failure_signature() const { return failure_signature_; }
 
  private:
   friend class MultiInputMock;
@@ -169,9 +171,13 @@
   std::vector<ExecutionResult> results_;
   std::string log_;  // log_ is populated optionally, e.g. if there was a crash.
   int exit_code_ = EXIT_SUCCESS;  // Process exit code.
-  // If the batch execution fails, this may optionally contain a failure
-  // description, e.g., the crash type, stack trace...
+  // If the batch execution fails, this may optionally contain a human-readable
+  // failure description, e.g., the crash type, stack trace...
   std::string failure_description_;
+  // A signature uniquely identifying the failure, which does not need to be
+  // human-readable. Specially, failures with empty signatures are always
+  // considered unique.
+  std::string failure_signature_;
   size_t num_outputs_read_ = 0;
 };
 
diff --git a/centipede/workdir.cc b/centipede/workdir.cc
index 3e96506..59b2e0f 100644
--- a/centipede/workdir.cc
+++ b/centipede/workdir.cc
@@ -16,6 +16,7 @@
 
 #include <cstddef>
 #include <filesystem>  // NOLINT
+#include <optional>
 #include <string>
 #include <string_view>
 #include <system_error>  // NOLINT
@@ -28,6 +29,7 @@
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
+#include "absl/strings/strip.h"
 #include "./centipede/environment.h"
 #include "./common/logging.h"
 
@@ -80,6 +82,15 @@
   return absl::StartsWith(path, prefix_);
 }
 
+std::optional<size_t> WorkDir::PathShards::GetShardIndex(
+    std::string_view path) const {
+  if (!absl::ConsumePrefix(&path, prefix_)) return std::nullopt;
+  if (path.size() != kDigitsInShardIndex) return std::nullopt;
+  size_t shard = 0;
+  if (!absl::SimpleAtoi(path, &shard)) return std::nullopt;
+  return shard;
+}
+
 //------------------------------------------------------------------------------
 //                                 WorkDir
 
diff --git a/centipede/workdir.h b/centipede/workdir.h
index 9aefffd..5192b93 100644
--- a/centipede/workdir.h
+++ b/centipede/workdir.h
@@ -16,6 +16,7 @@
 #define THIRD_PARTY_CENTIPEDE_WORKDIR_MGR_H_
 
 #include <cstddef>
+#include <optional>
 #include <ostream>
 #include <string>
 #include <string_view>
@@ -47,6 +48,9 @@
     // but `path` must have the exact `base_dir`/`rel_prefix` prefix,
     // including any relative "." and ".." path elements.
     bool IsShard(std::string_view path) const;
+    // Returns the shard index of `path` if it is a shard parth, `nullopt`
+    // otherwise.
+    std::optional<size_t> GetShardIndex(std::string_view path) const;
 
    private:
     friend class WorkDir;
diff --git a/fuzztest/internal/centipede_adaptor.cc b/fuzztest/internal/centipede_adaptor.cc
index eff5706..0e31571 100644
--- a/fuzztest/internal/centipede_adaptor.cc
+++ b/fuzztest/internal/centipede_adaptor.cc
@@ -760,17 +760,18 @@
                     absl::StrCat(read_reproducer_status));
       continue;
     }
-    const std::string metadata_file = std::filesystem::path{exported_crash_file}
-                                          .replace_extension("metadata")
-                                          .string();
-    std::string metadata;
-    const absl::Status read_metadata_status =
-        RemoteFileGetContents(metadata_file, metadata);
-    if (!read_metadata_status.ok()) {
+    const std::string description_file =
+        std::filesystem::path{exported_crash_file}
+            .replace_extension("desc")
+            .string();
+    std::string description;
+    const absl::Status read_description_status =
+        RemoteFileGetContents(description_file, description);
+    if (!read_description_status.ok()) {
       absl::FPrintF(
           GetStderr(),
-          "[!] Got error while reading the metadata for crash id %s: %s\n",
-          crash_id, absl::StrCat(read_metadata_status));
+          "[!] Got error while reading the description for crash id %s: %s\n",
+          crash_id, absl::StrCat(read_description_status));
       continue;
     }
     std::string reproducer_path = WriteDataToDir(reproducer, output.dir_path);
@@ -782,8 +783,8 @@
       continue;
     }
     absl::FPrintF(GetStderr(),
-                  "[.] Saved reproducer with ID %s and crash metadata %s\n",
-                  Basename(reproducer_path), metadata);
+                  "[.] Saved reproducer with ID %s and crash description %s\n",
+                  Basename(reproducer_path), description);
     if (!single_reproducer_path.has_value()) {
       single_reproducer_path = reproducer_path;
     } else {