blob: 545eb2dcff151a3f4d0e3b182fa2648fe068c8ac [file]
// Copyright 2025 The Centipede Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "./centipede/crash_deduplication.h"
#include <cstddef>
#include <cstdlib>
#include <filesystem> // NOLINT
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/str_cat.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "./centipede/centipede_callbacks.h"
#include "./centipede/crash_summary.h"
#include "./centipede/environment.h"
#include "./centipede/runner_result.h"
#include "./centipede/workdir.h"
#include "./common/crashing_input_filename.h"
#include "./common/defs.h"
#include "./common/hash.h"
#include "./common/logging.h"
#include "./common/remote_file.h"
#include "./common/status_macros.h"
namespace fuzztest::internal {
namespace {
std::string GetInputFileName(std::string_view bug_id,
std::string_view crash_signature,
std::string_view input_signature) {
return absl::StrCat(bug_id, "-", crash_signature, "-", input_signature);
}
} // namespace
absl::flat_hash_map<std::string, CrashDetails> GetCrashesFromWorkdir(
const WorkDir& workdir, size_t total_shards) {
const bool fail_on_empty_crash_metadata =
std::getenv("FUZZTEST_FAIL_ON_EMPTY_CRASH_METADATA") != nullptr;
absl::flat_hash_map<std::string, CrashDetails> crashes;
for (size_t shard_idx = 0; shard_idx < total_shards; ++shard_idx) {
std::vector<std::string> crashing_input_paths =
// The crash reproducer directory may contain subdirectories with
// input files that don't individually cause a crash. We ignore those
// for now and don't list the files recursively.
ValueOrDie(
RemoteListFiles(workdir.CrashReproducerDirPaths().Shard(shard_idx),
/*recursively=*/false));
const std::filesystem::path crash_metadata_dir =
workdir.CrashMetadataDirPaths().Shard(shard_idx);
for (std::string& crashing_input_path : crashing_input_paths) {
std::string crashing_input_file_name =
std::filesystem::path(crashing_input_path).filename();
const std::string crash_signature_path =
crash_metadata_dir / absl::StrCat(crashing_input_file_name, ".sig");
std::string crash_signature;
const absl::Status status =
RemoteFileGetContents(crash_signature_path, crash_signature);
if (!status.ok()) {
FUZZTEST_LOG(WARNING)
<< "Ignoring crashing input " << crashing_input_file_name
<< " due to failure to read the crash signature: " << status;
continue;
}
if (crash_signature.empty()) {
FUZZTEST_LOG_IF(FATAL, fail_on_empty_crash_metadata)
<< "Empty crash signature for " << crashing_input_file_name;
FUZZTEST_LOG(ERROR)
<< "Ignoring crashing input " << crashing_input_file_name
<< " due to empty crash signature. This is an internal error; "
"please report it to the FuzzTest team!";
continue;
}
if (crashes.contains(crash_signature)) continue;
const std::string crash_description_path =
crash_metadata_dir / absl::StrCat(crashing_input_file_name, ".desc");
std::string crash_description;
const absl::Status description_status =
RemoteFileGetContents(crash_description_path, crash_description);
if (!description_status.ok()) {
FUZZTEST_LOG(WARNING)
<< "Ignoring crashing input " << crashing_input_file_name
<< " due to failure to read the crash description: "
<< description_status;
continue;
}
if (crash_description.empty()) {
FUZZTEST_LOG_IF(FATAL, fail_on_empty_crash_metadata)
<< "Empty crash description for " << crashing_input_file_name;
FUZZTEST_LOG(ERROR)
<< "Ignoring crashing input " << crashing_input_file_name
<< " due to empty crash description. This is an internal error; "
"please report it to the FuzzTest team!";
continue;
}
crashes.insert(
{std::move(crash_signature),
// Centipede uses the input signature (i.e., the hash of the input)
// for the crashing input's file name in the workdir.
CrashDetails{/*input_signature=*/std::move(crashing_input_file_name),
/*description=*/std::move(crash_description),
/*input_path=*/std::move(crashing_input_path)}});
}
}
return crashes;
}
void OrganizeCrashingInputs(
const std::filesystem::path& regression_dir,
const std::filesystem::path& crashing_dir, const Environment& env,
CentipedeCallbacksFactory& callbacks_factory,
const absl::flat_hash_map<std::string, CrashDetails>&
new_crashes_by_signature,
CrashSummary& crash_summary) {
FUZZTEST_CHECK_OK(RemoteMkdir(crashing_dir.c_str()));
FUZZTEST_CHECK_OK(RemoteMkdir(regression_dir.c_str()));
// The corpus database layout assumes the crash input files are located
// directly in the crashing directory, so we don't list recursively.
std::vector<std::string> old_input_files =
ValueOrDie(RemoteListFiles(crashing_dir.c_str(), /*recursively=*/false));
size_t crash_input_count = old_input_files.size();
ScopedCentipedeCallbacks scoped_callbacks(callbacks_factory, env);
BatchResult batch_result;
absl::flat_hash_map<std::string, CrashDetails> reproduced_crashes;
for (const std::string& old_input_file : old_input_files) {
ByteArray old_input;
FUZZTEST_CHECK_OK(RemoteFileGetContents(old_input_file, old_input));
const bool is_reproducible = !scoped_callbacks.callbacks()->Execute(
env.binary, {old_input}, batch_result) &&
batch_result.IsInputFailure();
auto input_file_components = ParseCrashingInputFilename(old_input_file);
FUZZTEST_LOG_IF(WARNING, !input_file_components.ok())
<< "Failed to get input file components for " << old_input_file
<< ". Status: " << input_file_components.status();
if (is_reproducible) {
if (input_file_components.ok()) {
// Overwrite the old crash signature with the new one.
input_file_components->crash_signature =
batch_result.failure_signature();
} else {
// We'll rename the input file to the new format using the input
// signature as the bug ID.
const std::string input_signature = Hash(old_input);
input_file_components = InputFileComponents{
/*bug_id=*/input_signature,
/*crash_signature=*/batch_result.failure_signature(),
/*input_signature=*/input_signature,
};
}
std::string new_input_file_name = GetInputFileName(
input_file_components->bug_id, input_file_components->crash_signature,
input_file_components->input_signature);
std::string new_input_file = crashing_dir / new_input_file_name;
if (old_input_file == new_input_file) {
const auto status = RemotePathTouchExistingFile(new_input_file);
FUZZTEST_LOG_IF(ERROR, !status.ok())
<< "Failed to touch file " << new_input_file
<< ". Status: " << status;
} else {
const auto status = RemoteFileRename(old_input_file, new_input_file);
if (!status.ok()) {
FUZZTEST_LOG(ERROR)
<< "Failed to rename file " << old_input_file << " to "
<< new_input_file << ". Status: " << status;
new_input_file_name =
std::filesystem::path(old_input_file).filename();
new_input_file = old_input_file;
}
}
// In crash reports we report the full file name as the crash ID. This is
// what the user can use to replay or export the crash.
crash_summary.AddCrash({/*id=*/new_input_file_name,
/*category=*/batch_result.failure_description(),
batch_result.failure_signature(),
batch_result.failure_description()});
reproduced_crashes.try_emplace(
batch_result.failure_signature(),
CrashDetails{
/*input_signature=*/input_file_components->input_signature,
/*description=*/batch_result.failure_description(),
/*input_path=*/new_input_file,
});
continue;
}
FUZZTEST_CHECK(!is_reproducible);
if (!input_file_components.ok()) {
// Irreproducible, no bug ID, and no crash signature. Nothing to do with
// this input but move it to the regression directory.
const std::string regression_input_file =
regression_dir / Hash(old_input);
const auto status =
RemoteFileRename(old_input_file, regression_input_file);
if (status.ok()) {
--crash_input_count;
} else {
FUZZTEST_LOG(ERROR)
<< "Failed to rename file " << old_input_file << " to "
<< regression_input_file << ". Status: " << status;
}
continue;
}
auto crash_it =
reproduced_crashes.find(input_file_components->crash_signature);
auto new_crash_it = crash_it == reproduced_crashes.end()
? new_crashes_by_signature.find(
input_file_components->crash_signature)
: new_crashes_by_signature.end();
if (crash_it != reproduced_crashes.end() ||
new_crash_it == new_crashes_by_signature.end()) {
const std::string regression_input_file =
regression_dir / input_file_components->input_signature;
const auto status = RemoteFileCopy(old_input_file, regression_input_file);
FUZZTEST_LOG_IF(ERROR, !status.ok())
<< "Failed to copy file " << old_input_file << " to "
<< regression_input_file << ". Status: " << status;
continue;
}
crash_it = reproduced_crashes.insert(*new_crash_it).first;
const std::string new_input_file_name = GetInputFileName(
input_file_components->bug_id, input_file_components->crash_signature,
crash_it->second.input_signature);
const std::string new_input_file = crashing_dir / new_input_file_name;
absl::Status replace_status;
if (new_input_file == old_input_file) {
// For some reason, the old input couldn't reproduce the crash during
// reproduction, but it was re-discovered during fuzzing, so it is
// flaky. We keep the input and don't store it as a regression.
replace_status = RemotePathTouchExistingFile(new_input_file);
FUZZTEST_LOG_IF(ERROR, !replace_status.ok())
<< "Failed to touch file " << new_input_file
<< ". Status: " << replace_status;
} else {
const std::string regression_input_file =
regression_dir / input_file_components->input_signature;
replace_status = RemoteFileRename(old_input_file, regression_input_file);
if (replace_status.ok()) {
--crash_input_count;
replace_status =
RemoteFileCopy(crash_it->second.input_path, new_input_file);
if (replace_status.ok()) {
++crash_input_count;
} else {
FUZZTEST_LOG(ERROR)
<< "Failed to copy file " << crash_it->second.input_path << " to "
<< new_input_file << ". Status: " << replace_status;
}
} else {
FUZZTEST_LOG(ERROR)
<< "Failed to rename file " << old_input_file << " to "
<< regression_input_file << ". Status: " << replace_status;
}
}
if (replace_status.ok()) {
crash_summary.AddCrash({/*id=*/new_input_file_name,
/*category=*/crash_it->second.description,
input_file_components->crash_signature,
crash_it->second.description});
} else {
reproduced_crashes.erase(crash_it);
}
}
static constexpr int kMaxCrashInputCount = 10;
for (auto& [crash_signature, details] : new_crashes_by_signature) {
if (reproduced_crashes.contains(crash_signature)) continue;
if (crash_input_count >= kMaxCrashInputCount) {
FUZZTEST_LOG(WARNING)
<< "Reached the maximum number of crash inputs: "
<< kMaxCrashInputCount << ". Not storing any new crashes.";
break;
}
const std::string bug_id = Hash(
absl::StrCat(absl::FormatTime(absl::Now()), details.input_signature));
const std::string new_input_file_name =
GetInputFileName(bug_id, crash_signature, details.input_signature);
const std::string new_input_file = crashing_dir / new_input_file_name;
const auto status = RemoteFileCopy(details.input_path, new_input_file);
if (!status.ok()) {
FUZZTEST_LOG(ERROR) << "Failed to copy file " << details.input_path
<< " to " << new_input_file << ". Status: " << status;
continue;
}
crash_summary.AddCrash({/*id=*/new_input_file_name,
/*category=*/details.description, crash_signature,
details.description});
reproduced_crashes.insert({std::move(crash_signature), std::move(details)});
++crash_input_count;
}
}
} // namespace fuzztest::internal