Split crash metadata into description and signature. Signature will be used for deduplication while the description is human-readable for reporting. For runner, the signature will be the same as the description, which is compatible with the current behavior. PiperOrigin-RevId: 769774505
diff --git a/centipede/centipede.cc b/centipede/centipede.cc index 8510af8..84490e4 100644 --- a/centipede/centipede.cc +++ b/centipede/centipede.cc
@@ -206,8 +206,9 @@ absl::Status Centipede::CrashesToFiles(const Environment &env, std::string_view dir) { std::vector<std::string> reproducer_dirs; + const auto wd = WorkDir{env}; auto reproducer_match_status = RemoteGlobMatch( - WorkDir{env}.CrashReproducerDirPaths().AllShardsGlob(), reproducer_dirs); + wd.CrashReproducerDirPaths().AllShardsGlob(), reproducer_dirs); if (!reproducer_match_status.ok() && !absl::IsNotFound(reproducer_match_status)) { return reproducer_match_status; @@ -225,27 +226,18 @@ RETURN_IF_NOT_OK(RemoteFileCopy( reproducer_path, (std::filesystem::path{dir} / absl::StrCat(id, ".data")).string())); - } - } - std::vector<std::string> metadata_dirs; - auto metadata_match_status = RemoteGlobMatch( - WorkDir{env}.CrashMetadataDirPaths().AllShardsGlob(), metadata_dirs); - if (!metadata_match_status.ok() && !absl::IsNotFound(metadata_match_status)) { - return metadata_match_status; - } - for (const auto &metadata_dir : metadata_dirs) { - ASSIGN_OR_RETURN_IF_NOT_OK( - std::vector<std::string> metadata_paths, - RemoteListFiles(metadata_dir, /*recursively=*/false)); - for (const auto &metadata_path : metadata_paths) { - std::string id = std::filesystem::path{metadata_path}.filename(); - if (crash_ids.erase(id) == 0) { - continue; - } + const auto shard_index = wd.CrashReproducerDirPaths().GetShardIndex( + std::filesystem::path{reproducer_path}.parent_path().string()); + CHECK(shard_index.has_value()); + const auto metadata_dir = wd.CrashMetadataDirPaths().Shard(*shard_index); + const auto description_filename = absl::StrCat(id, ".desc"); + const auto signature_filename = absl::StrCat(id, ".sig"); RETURN_IF_NOT_OK(RemoteFileCopy( - metadata_path, - (std::filesystem::path{dir} / absl::StrCat(id, ".metadata")) - .string())); + (std::filesystem::path{metadata_dir} / description_filename).string(), + (std::filesystem::path{dir} / description_filename).string())); + RETURN_IF_NOT_OK(RemoteFileCopy( + (std::filesystem::path{metadata_dir} / signature_filename).string(), + (std::filesystem::path{dir} / signature_filename).string())); } } return absl::OkStatus(); @@ -888,6 +880,9 @@ << "\nExit code : " << batch_result.exit_code() << "\nFailure : " << batch_result.failure_description() + << "\nSignature : " + << AsPrintableString(AsByteSpan(batch_result.failure_signature()), + /*max_len=*/32) << "\nNumber of inputs : " << input_vec.size() << "\nNumber of inputs read: " << batch_result.num_outputs_read() << (batch_result.IsSetupFailure() @@ -976,7 +971,7 @@ std::string input_file_path = std::filesystem::path(crash_dir) / hash; auto crash_metadata_dir = wd_.CrashMetadataDirPaths().MyShard(); CHECK_OK(RemoteMkdir(crash_metadata_dir)); - std::string crash_metadata_file_path = + std::string crash_metadata_path_prefix = std::filesystem::path(crash_metadata_dir) / hash; LOG(INFO) << log_prefix << "Detected crash-reproducing input:" << "\nInput index : " << input_idx << "\nInput bytes : " @@ -984,13 +979,20 @@ << "\nExit code : " << one_input_batch_result.exit_code() << "\nFailure : " << one_input_batch_result.failure_description() + << "\nSignature : " + << AsPrintableString( + AsByteSpan(one_input_batch_result.failure_signature()), + /*max_len=*/32) << "\nSaving input to: " << input_file_path << "\nSaving crash" // - << "\nmetadata to : " << crash_metadata_file_path; + << "\nmetadata to : " << crash_metadata_path_prefix << ".*"; CHECK_OK(RemoteFileSetContents(input_file_path, one_input)); - CHECK_OK( - RemoteFileSetContents(crash_metadata_file_path, - one_input_batch_result.failure_description())); + CHECK_OK(RemoteFileSetContents( + absl::StrCat(crash_metadata_path_prefix, ".desc"), + one_input_batch_result.failure_description())); + CHECK_OK(RemoteFileSetContents( + absl::StrCat(crash_metadata_path_prefix, ".sig"), + one_input_batch_result.failure_signature())); return; } }
diff --git a/centipede/centipede_callbacks.cc b/centipede/centipede_callbacks.cc index 819e800..90e189d 100644 --- a/centipede/centipede_callbacks.cc +++ b/centipede/centipede_callbacks.cc
@@ -157,7 +157,7 @@ absl::StrCat(":shmem:arg1=", inputs_blobseq_.path(), ":arg2=", outputs_blobseq_.path(), ":failure_description_path=", failure_description_path_, - ":"), + ":failure_signature_path=", failure_signature_path_, ":"), disable_coverage)}; if (env_.clang_coverage_binary == binary) @@ -243,9 +243,18 @@ ReadFromLocalFile(execute_log_path_, batch_result.log()); ReadFromLocalFile(failure_description_path_, batch_result.failure_description()); - // Remove failure_description_ here so that it doesn't stay until another - // failed execution. + if (std::filesystem::exists(failure_signature_path_)) { + ReadFromLocalFile(failure_signature_path_, + batch_result.failure_signature()); + } else { + // TODO(xinhaoyuan): Refactor runner to use dispatcher so this branch can + // be removed. + batch_result.failure_signature() = batch_result.failure_description(); + } + // Remove the failure description and signature files here so that they do + // not stay until another failed execution. std::filesystem::remove(failure_description_path_); + std::filesystem::remove(failure_signature_path_); } VLOG(1) << __FUNCTION__ << " took " << (absl::Now() - start_time); return retval;
diff --git a/centipede/centipede_callbacks.h b/centipede/centipede_callbacks.h index 792608d..57b9aab 100644 --- a/centipede/centipede_callbacks.h +++ b/centipede/centipede_callbacks.h
@@ -174,6 +174,8 @@ std::filesystem::path(temp_dir_).append("log"); std::string failure_description_path_ = std::filesystem::path(temp_dir_).append("failure_description"); + std::string failure_signature_path_ = + std::filesystem::path(temp_dir_).append("failure_signature"); const std::string shmem_name1_ = ProcessAndThreadUniqueID("/ctpd-shm1-"); const std::string shmem_name2_ = ProcessAndThreadUniqueID("/ctpd-shm2-");
diff --git a/centipede/centipede_flags.inc b/centipede/centipede_flags.inc index 115ec38..091c8e7 100644 --- a/centipede/centipede_flags.inc +++ b/centipede/centipede_flags.inc
@@ -293,11 +293,10 @@ CENTIPEDE_FLAG( std::string, crashes_to_files, "", "When set to a directory path, save the crashing reproducers and " - "metadata from the workdir to the given path. Each crash with `ID`" - "will be saved with file `ID.data` for the reproducer and " - "`ID.metadata` the metadata, which currently contains the failure " - "description. If multiple crashes with the same ID exist, only one " - "crash will be saved.") + "metadata from the workdir to the given path: Each crash with `ID`" + "will be saved with file `ID.data` for the reproducer, `ID.desc` the " + "description, `ID.sig` the signature. If multiple crashes with the same ID " + "exist, only one crash will be saved.") CENTIPEDE_FLAG( std::string, corpus_from_files, "", "Export a corpus from a local directory with one file per input into "
diff --git a/centipede/centipede_interface.cc b/centipede/centipede_interface.cc index 504ccf2..ca4c856 100644 --- a/centipede/centipede_interface.cc +++ b/centipede/centipede_interface.cc
@@ -288,9 +288,9 @@ return test_shard; } -// Prunes non-reproducible and duplicate crashes and returns the crash metadata -// of the remaining crashes. -absl::flat_hash_set<std::string> PruneOldCrashesAndGetRemainingCrashMetadata( +// Prunes non-reproducible and duplicate crashes and returns the crash +// signatures of the remaining crashes. +absl::flat_hash_set<std::string> PruneOldCrashesAndGetRemainingCrashSignatures( const std::filesystem::path &crashing_dir, const Environment &env, CentipedeCallbacksFactory &callbacks_factory) { const std::vector<std::string> crashing_input_files = @@ -299,7 +299,7 @@ ValueOrDie(RemoteListFiles(crashing_dir.c_str(), /*recursively=*/false)); ScopedCentipedeCallbacks scoped_callbacks(callbacks_factory, env); BatchResult batch_result; - absl::flat_hash_set<std::string> remaining_crash_metadata; + absl::flat_hash_set<std::string> remaining_crash_signatures; for (const std::string &crashing_input_file : crashing_input_files) { ByteArray crashing_input; @@ -308,7 +308,8 @@ env.binary, {crashing_input}, batch_result); const bool is_duplicate = is_reproducible && !batch_result.IsSetupFailure() && - !remaining_crash_metadata.insert(batch_result.failure_description()) + !batch_result.failure_signature().empty() && + !remaining_crash_signatures.insert(batch_result.failure_signature()) .second; if (!is_reproducible || batch_result.IsSetupFailure() || is_duplicate) { CHECK_OK(RemotePathDelete(crashing_input_file, /*recursively=*/false)); @@ -316,13 +317,13 @@ CHECK_OK(RemotePathTouchExistingFile(crashing_input_file)); } } - return remaining_crash_metadata; + return remaining_crash_signatures; } // TODO(b/405382531): Add unit tests once the function is unit-testable. void DeduplicateAndStoreNewCrashes( const std::filesystem::path &crashing_dir, const WorkDir &workdir, - size_t total_shards, absl::flat_hash_set<std::string> crash_metadata) { + size_t total_shards, absl::flat_hash_set<std::string> crash_signatures) { for (size_t shard_idx = 0; shard_idx < total_shards; ++shard_idx) { const std::vector<std::string> new_crashing_input_files = // The crash reproducer directory may contain subdirectories with @@ -338,19 +339,20 @@ for (const std::string &crashing_input_file : new_crashing_input_files) { const std::string crashing_input_file_name = std::filesystem::path(crashing_input_file).filename(); - const std::string crash_metadata_file = - crash_metadata_dir / crashing_input_file_name; - std::string new_crash_metadata; + const std::string crash_signature_path = + crash_metadata_dir / absl::StrCat(crashing_input_file_name, ".sig"); + std::string new_crash_signature; const absl::Status status = - RemoteFileGetContents(crash_metadata_file, new_crash_metadata); + RemoteFileGetContents(crash_signature_path, new_crash_signature); if (!status.ok()) { LOG(WARNING) << "Ignoring crashing input " << crashing_input_file_name - << " due to failure to read the crash metadata file: " + << " due to failure to read the crash signature: " << status; continue; } const bool is_duplicate = - !crash_metadata.insert(new_crash_metadata).second; + !new_crash_signature.empty() && + !crash_signatures.insert(new_crash_signature).second; if (is_duplicate) continue; CHECK_OK( RemoteFileRename(crashing_input_file, @@ -667,11 +669,11 @@ // Deduplicate and update the crashing inputs. const std::filesystem::path crashing_dir = fuzztest_db_path / "crashing"; - absl::flat_hash_set<std::string> crash_metadata = - PruneOldCrashesAndGetRemainingCrashMetadata(crashing_dir, env, - callbacks_factory); + absl::flat_hash_set<std::string> crash_signatures = + PruneOldCrashesAndGetRemainingCrashSignatures(crashing_dir, env, + callbacks_factory); DeduplicateAndStoreNewCrashes(crashing_dir, workdir, env.total_shards, - std::move(crash_metadata)); + std::move(crash_signatures)); } return EXIT_SUCCESS;
diff --git a/centipede/centipede_test.cc b/centipede/centipede_test.cc index 3254b5e..c66e7ab 100644 --- a/centipede/centipede_test.cc +++ b/centipede/centipede_test.cc
@@ -685,6 +685,7 @@ std::string binary; unsigned char input = 0; std::string description; + std::string signature; }; // A mock for ExtraBinaries test. @@ -703,6 +704,7 @@ for (const Crash &crash : crashes_) { if (binary == crash.binary && input[0] == crash.input) { batch_result.failure_description() = crash.description; + batch_result.failure_signature() = crash.signature; res = false; } } @@ -766,9 +768,9 @@ env.binary = "b1"; env.extra_binaries = {"b2", "b3", "b4"}; env.require_pc_table = false; // No PC table here. - ExtraBinariesMock mock( - env, {Crash{"b1", 10, "b1-crash"}, Crash{"b2", 30, "b2-crash"}, - Crash{"b3", 50, "b3-crash"}}); + ExtraBinariesMock mock(env, {Crash{"b1", 10, "b1-crash", "b1-sig"}, + Crash{"b2", 30, "b2-crash", "b2-sig"}, + Crash{"b3", 50, "b3-crash", "b3-sig"}}); MockFactory factory(mock); CentipedeMain(env, factory); @@ -789,11 +791,15 @@ auto crash_metadata_dir_path = WorkDir{env}.CrashMetadataDirPaths().MyShard(); ASSERT_TRUE(std::filesystem::exists(crash_metadata_dir_path)) << VV(crash_metadata_dir_path); - EXPECT_THAT(crash_metadata_dir_path, - HasFilesWithContents(testing::UnorderedElementsAre( - FileAndContents{Hash({10}), "b1-crash"}, - FileAndContents{Hash({30}), "b2-crash"}, - FileAndContents{Hash({50}), "b3-crash"}))); + EXPECT_THAT( + crash_metadata_dir_path, + HasFilesWithContents(testing::UnorderedElementsAre( + FileAndContents{absl::StrCat(Hash({10}), ".desc"), "b1-crash"}, + FileAndContents{absl::StrCat(Hash({10}), ".sig"), "b1-sig"}, + FileAndContents{absl::StrCat(Hash({30}), ".desc"), "b2-crash"}, + FileAndContents{absl::StrCat(Hash({30}), ".sig"), "b2-sig"}, + FileAndContents{absl::StrCat(Hash({50}), ".desc"), "b3-crash"}, + FileAndContents{absl::StrCat(Hash({50}), ".sig"), "b3-sig"}))); } // A mock for UndetectedCrashingInput test.
diff --git a/centipede/runner_result.h b/centipede/runner_result.h index fd4e9f7..17ecd57 100644 --- a/centipede/runner_result.h +++ b/centipede/runner_result.h
@@ -162,6 +162,8 @@ const std::string& failure_description() const { return failure_description_; } + std::string& failure_signature() { return failure_signature_; } + const std::string& failure_signature() const { return failure_signature_; } private: friend class MultiInputMock; @@ -169,9 +171,13 @@ std::vector<ExecutionResult> results_; std::string log_; // log_ is populated optionally, e.g. if there was a crash. int exit_code_ = EXIT_SUCCESS; // Process exit code. - // If the batch execution fails, this may optionally contain a failure - // description, e.g., the crash type, stack trace... + // If the batch execution fails, this may optionally contain a human-readable + // failure description, e.g., the crash type, stack trace... std::string failure_description_; + // A signature uniquely identifying the failure, which does not need to be + // human-readable. Specially, failures with empty signatures are always + // considered unique. + std::string failure_signature_; size_t num_outputs_read_ = 0; };
diff --git a/centipede/workdir.cc b/centipede/workdir.cc index 3e96506..59b2e0f 100644 --- a/centipede/workdir.cc +++ b/centipede/workdir.cc
@@ -16,6 +16,7 @@ #include <cstddef> #include <filesystem> // NOLINT +#include <optional> #include <string> #include <string_view> #include <system_error> // NOLINT @@ -28,6 +29,7 @@ #include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" +#include "absl/strings/strip.h" #include "./centipede/environment.h" #include "./common/logging.h" @@ -80,6 +82,15 @@ return absl::StartsWith(path, prefix_); } +std::optional<size_t> WorkDir::PathShards::GetShardIndex( + std::string_view path) const { + if (!absl::ConsumePrefix(&path, prefix_)) return std::nullopt; + if (path.size() != kDigitsInShardIndex) return std::nullopt; + size_t shard = 0; + if (!absl::SimpleAtoi(path, &shard)) return std::nullopt; + return shard; +} + //------------------------------------------------------------------------------ // WorkDir
diff --git a/centipede/workdir.h b/centipede/workdir.h index 9aefffd..5192b93 100644 --- a/centipede/workdir.h +++ b/centipede/workdir.h
@@ -16,6 +16,7 @@ #define THIRD_PARTY_CENTIPEDE_WORKDIR_MGR_H_ #include <cstddef> +#include <optional> #include <ostream> #include <string> #include <string_view> @@ -47,6 +48,9 @@ // but `path` must have the exact `base_dir`/`rel_prefix` prefix, // including any relative "." and ".." path elements. bool IsShard(std::string_view path) const; + // Returns the shard index of `path` if it is a shard parth, `nullopt` + // otherwise. + std::optional<size_t> GetShardIndex(std::string_view path) const; private: friend class WorkDir;
diff --git a/fuzztest/internal/centipede_adaptor.cc b/fuzztest/internal/centipede_adaptor.cc index eff5706..0e31571 100644 --- a/fuzztest/internal/centipede_adaptor.cc +++ b/fuzztest/internal/centipede_adaptor.cc
@@ -760,17 +760,18 @@ absl::StrCat(read_reproducer_status)); continue; } - const std::string metadata_file = std::filesystem::path{exported_crash_file} - .replace_extension("metadata") - .string(); - std::string metadata; - const absl::Status read_metadata_status = - RemoteFileGetContents(metadata_file, metadata); - if (!read_metadata_status.ok()) { + const std::string description_file = + std::filesystem::path{exported_crash_file} + .replace_extension("desc") + .string(); + std::string description; + const absl::Status read_description_status = + RemoteFileGetContents(description_file, description); + if (!read_description_status.ok()) { absl::FPrintF( GetStderr(), - "[!] Got error while reading the metadata for crash id %s: %s\n", - crash_id, absl::StrCat(read_metadata_status)); + "[!] Got error while reading the description for crash id %s: %s\n", + crash_id, absl::StrCat(read_description_status)); continue; } std::string reproducer_path = WriteDataToDir(reproducer, output.dir_path); @@ -782,8 +783,8 @@ continue; } absl::FPrintF(GetStderr(), - "[.] Saved reproducer with ID %s and crash metadata %s\n", - Basename(reproducer_path), metadata); + "[.] Saved reproducer with ID %s and crash description %s\n", + Basename(reproducer_path), description); if (!single_reproducer_path.has_value()) { single_reproducer_path = reproducer_path; } else {