// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/threading/hang_watcher.h"
#include <algorithm>
#include <atomic>
#include <utility>
#include "base/bind.h"
#include "base/callback_helpers.h"
#include "base/debug/alias.h"
#include "base/debug/crash_logging.h"
#include "base/debug/dump_without_crashing.h"
#include "base/feature_list.h"
#include "base/no_destructor.h"
#include "base/strings/string_number_conversions.h"
#include "base/synchronization/lock.h"
#include "base/synchronization/waitable_event.h"
#include "base/threading/platform_thread.h"
#include "base/threading/thread_checker.h"
#include "base/threading/thread_restrictions.h"
#include "base/time/default_tick_clock.h"
#include "base/time/time.h"
#include "build/build_config.h"
namespace base {
// static
constexpr base::Feature HangWatcher::kEnableHangWatcher{
"EnableHangWatcher", base::FEATURE_DISABLED_BY_DEFAULT};
constexpr base::TimeDelta HangWatchScope::kDefaultHangWatchTime =
base::TimeDelta::FromSeconds(10);
namespace {
HangWatcher* g_instance = nullptr;
}  // namespace
constexpr const char* kThreadName = "HangWatcher";
// The time that the HangWatcher thread will sleep for between calls to
// Monitor(). Changing this period does not modify the type of hangs that can
// be detected. It instead changes the probability that a call to Monitor()
// will happen at the right time to catch a hang. This has to be balanced with
// power/cpu use concerns, as busy looping would catch almost all hangs but
// present unacceptable overhead.
const base::TimeDelta kMonitoringPeriod = base::TimeDelta::FromSeconds(10);
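// Illustrative usage sketch, not part of this file's logic. DoExpensiveWork()
// and the callsite are hypothetical; RegisterThread() and HangWatchScope are
// the real entry points defined in this file.
//
//   // Once, on a thread that should be watched for hangs:
//   base::ScopedClosureRunner unregister_closure =
//       base::HangWatcher::GetInstance()->RegisterThread();
//
//   // Around any unit of work that is expected to complete promptly:
//   {
//     base::HangWatchScope scope(base::HangWatchScope::kDefaultHangWatchTime);
//     DoExpensiveWork();
//   }
//
// If the scope is still live once its deadline passes, a Monitor() pass on
// the HangWatcher thread can flag this thread as hung.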
HangWatchScope::HangWatchScope(TimeDelta timeout) {
internal::HangWatchState* current_hang_watch_state =
internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();
DCHECK(timeout >= base::TimeDelta()) << "Negative timeouts are invalid.";
  // TODO(crbug.com/1034046): Remove this early return once all threads using
  // HangWatchScope are monitored. A scope only takes effect on a thread that
  // registered for hang watching with HangWatcher::RegisterThread(); on an
  // unregistered thread it is a no-op.
  if (!current_hang_watch_state) {
    return;
  }
#if DCHECK_IS_ON()
previous_scope_ = current_hang_watch_state->GetCurrentHangWatchScope();
current_hang_watch_state->SetCurrentHangWatchScope(this);
#endif
  // TODO(crbug.com/1034046): Check whether we are already over the deadline
  // for the previous scope here by issuing only one TimeTicks::Now() and
  // reusing the value.
previous_deadline_ = current_hang_watch_state->GetDeadline();
TimeTicks deadline = TimeTicks::Now() + timeout;
current_hang_watch_state->SetDeadline(deadline);
}
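// Nesting sketch (illustrative): because the constructor saves
// |previous_deadline_| and the destructor restores it, scopes can nest and an
// inner scope temporarily overrides the outer deadline.
//
//   base::HangWatchScope outer(base::TimeDelta::FromSeconds(10));
//   {
//     base::HangWatchScope inner(base::TimeDelta::FromSeconds(1));
//     // The thread's deadline is now roughly Now() + 1 second.
//   }
//   // The deadline set by |outer| is back in effect.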
HangWatchScope::~HangWatchScope() {
DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
internal::HangWatchState* current_hang_watch_state =
internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();
// TODO(crbug.com/1034046): Remove when all threads using HangWatchScope are
// monitored. Thread is not monitored, noop.
if (!current_hang_watch_state) {
return;
}
// If a hang is currently being captured we should block here so execution
// stops and the relevant stack frames are recorded.
base::HangWatcher::GetInstance()->BlockIfCaptureInProgress();
#if DCHECK_IS_ON()
  // Verify that no HangWatchScope was destroyed out of order.
DCHECK_EQ(this, current_hang_watch_state->GetCurrentHangWatchScope());
current_hang_watch_state->SetCurrentHangWatchScope(previous_scope_);
#endif
// Reset the deadline to the value it had before entering this scope.
current_hang_watch_state->SetDeadline(previous_deadline_);
// TODO(crbug.com/1034046): Log when a HangWatchScope exits after its deadline
// and that went undetected by the HangWatcher.
}
HangWatcher::HangWatcher()
: monitor_period_(kMonitoringPeriod),
should_monitor_(WaitableEvent::ResetPolicy::AUTOMATIC),
thread_(this, kThreadName),
tick_clock_(base::DefaultTickClock::GetInstance()) {
  // |hang_watcher_thread_checker_| should not be bound to the constructing
  // thread.
DETACH_FROM_THREAD(hang_watcher_thread_checker_);
should_monitor_.declare_only_used_while_idle();
DCHECK(!g_instance);
g_instance = this;
}
HangWatcher::~HangWatcher() {
DCHECK_EQ(g_instance, this);
DCHECK(watch_states_.empty());
g_instance = nullptr;
Stop();
}
void HangWatcher::Start() {
thread_.Start();
}
void HangWatcher::Stop() {
keep_monitoring_.store(false, std::memory_order_relaxed);
should_monitor_.Signal();
thread_.Join();
}
bool HangWatcher::IsWatchListEmpty() {
AutoLock auto_lock(watch_state_lock_);
return watch_states_.empty();
}
void HangWatcher::Wait() {
while (true) {
// Amount by which the actual time spent sleeping can deviate from
// the target time and still be considered timely.
constexpr base::TimeDelta kWaitDriftTolerance =
base::TimeDelta::FromMilliseconds(100);
const base::TimeTicks time_before_wait = tick_clock_->NowTicks();
// Sleep until next scheduled monitoring or until signaled.
const bool was_signaled = should_monitor_.TimedWait(monitor_period_);
if (after_wait_callback_)
after_wait_callback_.Run(time_before_wait);
const base::TimeTicks time_after_wait = tick_clock_->NowTicks();
const base::TimeDelta wait_time = time_after_wait - time_before_wait;
const bool wait_was_normal =
wait_time <= (monitor_period_ + kWaitDriftTolerance);
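    // e.g. with the default 10 second |monitor_period_| and the 100 ms drift
    // tolerance above, a wait is considered normal if it lasted 10.1 seconds
    // or less.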
if (!wait_was_normal) {
      // If the time spent waiting was too long it might indicate that the
      // machine is very slow or that it went to sleep. In either case we
      // can't trust the hang watch scopes that are currently live. Update the
      // ignore threshold to make sure they don't trigger a hang on subsequent
      // monitors, then keep waiting.
base::AutoLock auto_lock(watch_state_lock_);
      // Find the latest deadline among the live watch states. Deadlines might
      // change atomically while iterating but that's fine: a change means the
      // new HangWatchScope was constructed very soon after the abnormal sleep
      // happened and might still be affected by its root cause. Ignoring it
      // is cautious and harmless.
base::TimeTicks latest_deadline;
for (const auto& state : watch_states_) {
base::TimeTicks deadline = state->GetDeadline();
if (deadline > latest_deadline) {
latest_deadline = deadline;
}
}
deadline_ignore_threshold_ = latest_deadline;
}
// Stop waiting.
if (wait_was_normal || was_signaled)
return;
}
}
void HangWatcher::Run() {
  // Monitor() should only run on |thread_|. Bind
  // |hang_watcher_thread_checker_| here to make sure of that.
DCHECK_CALLED_ON_VALID_THREAD(hang_watcher_thread_checker_);
while (keep_monitoring_.load(std::memory_order_relaxed)) {
Wait();
if (!IsWatchListEmpty() &&
keep_monitoring_.load(std::memory_order_relaxed)) {
Monitor();
if (after_monitor_closure_for_testing_) {
after_monitor_closure_for_testing_.Run();
}
}
}
}
// static
HangWatcher* HangWatcher::GetInstance() {
return g_instance;
}
// static
void HangWatcher::RecordHang() {
base::debug::DumpWithoutCrashing();
// Inhibit code folding.
const int line_number = __LINE__;
base::debug::Alias(&line_number);
}
ScopedClosureRunner HangWatcher::RegisterThread() {
AutoLock auto_lock(watch_state_lock_);
watch_states_.push_back(
internal::HangWatchState::CreateHangWatchStateForCurrentThread());
return ScopedClosureRunner(BindOnce(&HangWatcher::UnregisterThread,
Unretained(HangWatcher::GetInstance())));
}
base::TimeTicks HangWatcher::WatchStateSnapShot::GetHighestDeadline() const {
DCHECK(!hung_watch_state_copies_.empty());
// Since entries are sorted in increasing order the last entry is the largest
// one.
return hung_watch_state_copies_.back().deadline;
}
HangWatcher::WatchStateSnapShot::WatchStateSnapShot(
const HangWatchStates& watch_states,
base::TimeTicks snapshot_time,
base::TimeTicks deadline_ignore_threshold)
: snapshot_time_(snapshot_time) {
// Initial copy of the values.
for (const auto& watch_state : watch_states) {
    base::TimeTicks deadline = watch_state->GetDeadline();
if (deadline <= deadline_ignore_threshold) {
hung_watch_state_copies_.clear();
return;
}
// Only copy hung threads.
if (deadline <= snapshot_time) {
      hung_watch_state_copies_.push_back(
          WatchStateCopy{deadline, watch_state->GetThreadID()});
}
}
// Sort |hung_watch_state_copies_| by order of decreasing hang severity so the
// most severe hang is first in the list.
std::sort(hung_watch_state_copies_.begin(), hung_watch_state_copies_.end(),
[](const WatchStateCopy& lhs, const WatchStateCopy& rhs) {
return lhs.deadline < rhs.deadline;
});
}
HangWatcher::WatchStateSnapShot::WatchStateSnapShot(
const WatchStateSnapShot& other) = default;
HangWatcher::WatchStateSnapShot::~WatchStateSnapShot() = default;
std::string HangWatcher::WatchStateSnapShot::PrepareHungThreadListCrashKey()
const {
// Build a crash key string that contains the ids of the hung threads.
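  // e.g. hung thread ids 34 and 78, ordered most severe first, produce the
  // value "34|78|".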
constexpr char kSeparator{'|'};
std::string list_of_hung_thread_ids;
// Add as many thread ids to the crash key as possible.
for (const WatchStateCopy& copy : hung_watch_state_copies_) {
std::string fragment = base::NumberToString(copy.thread_id) + kSeparator;
if (list_of_hung_thread_ids.size() + fragment.size() <
static_cast<std::size_t>(debug::CrashKeySize::Size256)) {
list_of_hung_thread_ids += fragment;
} else {
      // Respect the by-priority ordering of thread ids in the crash key by
      // stopping construction as soon as one does not fit. This avoids
      // including lower priority ids while omitting more important ones.
break;
}
}
return list_of_hung_thread_ids;
}
HangWatcher::WatchStateSnapShot HangWatcher::GrabWatchStateSnapshotForTesting()
const {
WatchStateSnapShot snapshot(watch_states_, base::TimeTicks::Now(),
deadline_ignore_threshold_);
return snapshot;
}
void HangWatcher::Monitor() {
DCHECK_CALLED_ON_VALID_THREAD(hang_watcher_thread_checker_);
AutoLock auto_lock(watch_state_lock_);
// If all threads unregistered since this function was invoked there's
// nothing to do anymore.
if (watch_states_.empty())
return;
const base::TimeTicks now = base::TimeTicks::Now();
// See if any thread hung. We're holding |watch_state_lock_| so threads
// can't register or unregister but their deadline still can change
// atomically. This is fine. Detecting a hang is generally best effort and
// if a thread resumes from hang in the time it takes to move on to
// capturing then its ID will be absent from the crash keys.
bool any_thread_hung = std::any_of(
watch_states_.cbegin(), watch_states_.cend(),
[this, now](const std::unique_ptr<internal::HangWatchState>& state) {
base::TimeTicks deadline = state->GetDeadline();
return deadline > deadline_ignore_threshold_ && deadline < now;
});
  // If at least one thread is hung we need to capture.
if (any_thread_hung)
CaptureHang(now);
}
void HangWatcher::CaptureHang(base::TimeTicks capture_time) {
capture_in_progress.store(true, std::memory_order_relaxed);
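  // Relaxed ordering is sufficient for |capture_in_progress|: it is only a
  // hint consumed by BlockIfCaptureInProgress(), where blocking is
  // best-effort. |capture_lock_| below provides the actual synchronization.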
base::AutoLock scope_lock(capture_lock_);
WatchStateSnapShot watch_state_snapshot(watch_states_, capture_time,
deadline_ignore_threshold_);
  // The hung thread(s) detected at the start of Monitor() could have moved on
  // from their scopes. If that happened and there are no more hung threads
  // then abort the capture.
std::string list_of_hung_thread_ids =
watch_state_snapshot.PrepareHungThreadListCrashKey();
if (list_of_hung_thread_ids.empty())
return;
#if !defined(OS_NACL)
static debug::CrashKeyString* crash_key = AllocateCrashKeyString(
"list-of-hung-threads", debug::CrashKeySize::Size256);
debug::ScopedCrashKeyString list_of_hung_threads_crash_key_string(
crash_key, list_of_hung_thread_ids);
#endif
  // To avoid capturing more than one hang that blames a subset of the same
  // threads it's necessary to keep track of the furthest deadline that
  // contributed to declaring a hang. Only once all threads have deadlines
  // past this point can we be sure that a newly discovered hang is not
  // directly related.
// Example:
// **********************************************************************
// Timeline A : L------1-------2----------3-------4----------N-----------
// Timeline B : -------2----------3-------4----------L----5------N-------
// Timeline C : L----------------------------5------6----7---8------9---N
// **********************************************************************
  // In the example, when a Monitor() happens during timeline A,
  // |deadline_ignore_threshold_| (L) is at time zero and deadlines (1-4) are
  // before Now() (N). A hang is captured and L is updated. During the next
  // Monitor() (timeline B) a new deadline is over but we can't capture a hang
  // because deadlines 2-4 are still live and already counted toward a hang.
  // During a third Monitor() (timeline C) all live deadlines are now after L
  // and a second hang can be recorded.
base::TimeTicks latest_expired_deadline =
watch_state_snapshot.GetHighestDeadline();
if (on_hang_closure_for_testing_)
on_hang_closure_for_testing_.Run();
else
RecordHang();
// Update after running the actual capture.
deadline_ignore_threshold_ = latest_expired_deadline;
capture_in_progress.store(false, std::memory_order_relaxed);
}
void HangWatcher::SetAfterMonitorClosureForTesting(
base::RepeatingClosure closure) {
DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
after_monitor_closure_for_testing_ = std::move(closure);
}
void HangWatcher::SetOnHangClosureForTesting(base::RepeatingClosure closure) {
DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
on_hang_closure_for_testing_ = std::move(closure);
}
void HangWatcher::SetMonitoringPeriodForTesting(base::TimeDelta period) {
DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
monitor_period_ = period;
}
void HangWatcher::SetAfterWaitCallbackForTesting(
RepeatingCallback<void(TimeTicks)> callback) {
DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
after_wait_callback_ = callback;
}
void HangWatcher::SignalMonitorEventForTesting() {
DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_);
should_monitor_.Signal();
}
void HangWatcher::StopMonitoringForTesting() {
keep_monitoring_.store(false, std::memory_order_relaxed);
}
void HangWatcher::SetTickClockForTesting(const base::TickClock* tick_clock) {
tick_clock_ = tick_clock;
}
void HangWatcher::BlockIfCaptureInProgress() {
  // Makes a best-effort attempt to block execution if a hang is currently
  // being captured. Only block on |capture_lock_| if |capture_in_progress|
  // hints that it's already held, to avoid serializing all threads on this
  // function when no hang capture is in progress.
if (capture_in_progress.load(std::memory_order_relaxed)) {
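    // The AutoLock lives in an otherwise empty block on purpose: the goal is
    // only to pause this thread until CaptureHang() releases |capture_lock_|,
    // not to run any code while holding the lock.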
base::AutoLock hang_lock(capture_lock_);
}
}
void HangWatcher::UnregisterThread() {
AutoLock auto_lock(watch_state_lock_);
internal::HangWatchState* current_hang_watch_state =
internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();
auto it =
std::find_if(watch_states_.cbegin(), watch_states_.cend(),
[current_hang_watch_state](
const std::unique_ptr<internal::HangWatchState>& state) {
return state.get() == current_hang_watch_state;
});
// Thread should be registered to get unregistered.
DCHECK(it != watch_states_.end());
watch_states_.erase(it);
}
namespace internal {
// |deadline_| starts at Max() to avoid validation problems
// when setting the first legitimate value.
HangWatchState::HangWatchState() : thread_id_(PlatformThread::CurrentId()) {
// There should not exist a state object for this thread already.
DCHECK(!GetHangWatchStateForCurrentThread()->Get());
// Bind the new instance to this thread.
GetHangWatchStateForCurrentThread()->Set(this);
}
HangWatchState::~HangWatchState() {
DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
DCHECK_EQ(GetHangWatchStateForCurrentThread()->Get(), this);
GetHangWatchStateForCurrentThread()->Set(nullptr);
#if DCHECK_IS_ON()
  // The HangWatchState should never be destroyed while there are still live
  // HangWatchScopes.
DCHECK(!current_hang_watch_scope_);
#endif
}
// static
std::unique_ptr<HangWatchState>
HangWatchState::CreateHangWatchStateForCurrentThread() {
// Allocate a watch state object for this thread.
std::unique_ptr<HangWatchState> hang_state =
std::make_unique<HangWatchState>();
// Setting the thread local worked.
DCHECK_EQ(GetHangWatchStateForCurrentThread()->Get(), hang_state.get());
// Transfer ownership to caller.
return hang_state;
}
TimeTicks HangWatchState::GetDeadline() const {
return deadline_.load(std::memory_order_relaxed);
}
void HangWatchState::SetDeadline(TimeTicks deadline) {
DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
deadline_.store(deadline, std::memory_order_relaxed);
}
bool HangWatchState::IsOverDeadline() const {
return TimeTicks::Now() > deadline_.load(std::memory_order_relaxed);
}
#if DCHECK_IS_ON()
void HangWatchState::SetCurrentHangWatchScope(HangWatchScope* scope) {
DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
current_hang_watch_scope_ = scope;
}
HangWatchScope* HangWatchState::GetCurrentHangWatchScope() {
DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
return current_hang_watch_scope_;
}
#endif
// static
ThreadLocalPointer<HangWatchState>*
HangWatchState::GetHangWatchStateForCurrentThread() {
static NoDestructor<ThreadLocalPointer<HangWatchState>> hang_watch_state;
return hang_watch_state.get();
}
PlatformThreadId HangWatchState::GetThreadID() const {
return thread_id_;
}
} // namespace internal
} // namespace base