| // Copyright 2012 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifdef UNSAFE_BUFFERS_BUILD |
| // TODO(crbug.com/390223051): Remove C-library calls to fix the errors. |
| #pragma allow_unsafe_libc_calls |
| #endif |
| |
| #include "gpu/ipc/service/gpu_watchdog_thread.h" |
| |
| #include <memory> |
| #include <string> |
| #include <utility> |
| |
| #include "base/bit_cast.h" |
| #include "base/command_line.h" |
| #include "base/debug/alias.h" |
| #include "base/debug/dump_without_crashing.h" |
| #include "base/files/file_path.h" |
| #include "base/files/file_util.h" |
| #include "base/functional/bind.h" |
| #include "base/functional/callback_helpers.h" |
| #include "base/logging.h" |
| #include "base/memory/ptr_util.h" |
| #include "base/metrics/histogram_functions.h" |
| #include "base/metrics/histogram_macros.h" |
| #include "base/native_library.h" |
| #include "base/numerics/safe_conversions.h" |
| #include "base/power_monitor/power_monitor.h" |
| #include "base/process/process.h" |
| #include "base/strings/string_number_conversions.h" |
| #include "base/system/sys_info.h" |
| #include "base/task/current_thread.h" |
| #include "base/threading/platform_thread.h" |
| #include "base/time/time.h" |
| #include "build/build_config.h" |
| #include "gpu/config/gpu_crash_keys.h" |
| #include "gpu/config/gpu_switches.h" |
| #include "gpu/ipc/common/result_codes.h" |
| |
| #if BUILDFLAG(IS_WIN) |
| #include <windows.h> |
| #endif |
| |
| namespace gpu { |
| |
| base::TimeDelta GetGpuWatchdogTimeout(bool software_rendering) { |
| std::string timeout_str = |
| base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII( |
| switches::kGpuWatchdogTimeoutSeconds); |
| if (!timeout_str.empty()) { |
| size_t timeout_seconds; |
| if (base::StringToSizeT(timeout_str, &timeout_seconds)) |
| return base::Seconds(timeout_seconds); |
| |
| LOG(WARNING) << "Invalid --" << switches::kGpuWatchdogTimeoutSeconds << ": " |
| << timeout_str; |
| } |
| |
| base::TimeDelta timeout = kGpuWatchdogTimeout; |
| #if BUILDFLAG(IS_WIN) |
| int num_of_processors = base::SysInfo::NumberOfProcessors(); |
| if (num_of_processors > 8) { |
| timeout -= base::Seconds(10); |
| } else if (num_of_processors <= 4) { |
| timeout += base::Seconds(5); |
| } |
| #endif |
| |
| if (software_rendering) { |
| timeout *= kSoftwareRenderingFactor; |
| } |
| return timeout; |
| } |
| |
| GpuWatchdogThread::GpuWatchdogThread(base::TimeDelta timeout, |
| int restart_factor, |
| bool is_test_mode, |
| const std::string& thread_name) |
| : base::Thread(thread_name), |
| watchdog_timeout_(timeout), |
| watchdog_restart_factor_(restart_factor), |
| is_test_mode_(is_test_mode) { |
| base::CurrentThread::Get()->AddTaskObserver(this); |
| |
| // DO NOT CHANGE |watched_thread_name_str_uma_|. It's used for UMA and crash |
| // report. |
| if (thread_name == "GpuWatchdog_Compositor") |
| watched_thread_name_str_uma_ = ".compositor"; |
| else |
| watched_thread_name_str_uma_ = ".main"; |
| |
| watched_thread_id_str_ = |
| base::NumberToString(base::PlatformThread::CurrentId().raw()); |
| |
| #if BUILDFLAG(IS_WIN) |
| // GetCurrentThread returns a pseudo-handle that cannot be used by one thread |
| // to identify another. DuplicateHandle creates a "real" handle that can be |
| // used for this purpose. |
| if (!::DuplicateHandle(::GetCurrentProcess(), ::GetCurrentThread(), |
| ::GetCurrentProcess(), &watched_thread_handle_, |
| THREAD_QUERY_INFORMATION, FALSE, 0)) { |
| watched_thread_handle_ = nullptr; |
| } |
| #endif |
| |
| #if BUILDFLAG(IS_LINUX) && !BUILDFLAG(IS_CASTOS) |
| tty_file_.reset(base::OpenFile( |
| base::FilePath(FILE_PATH_LITERAL("/sys/class/tty/tty0/active")), "r")); |
| UpdateActiveTTY(); |
| host_tty_ = active_tty_; |
| #endif |
| |
| Arm(); |
| } |
| |
| GpuWatchdogThread::~GpuWatchdogThread() { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(watched_thread_sequence_checker_); |
| // Stop() might take too long and the watchdog timeout is triggered. |
| // Disarm first before calling Stop() to avoid a crash. |
| if (IsArmed()) |
| Disarm(); |
| PauseWatchdog(); |
| |
| Stop(); // stop the watchdog thread |
| |
| base::CurrentThread::Get()->RemoveTaskObserver(this); |
| base::PowerMonitor::GetInstance()->RemovePowerSuspendObserver(this); |
| GpuWatchdogThreadEventHistogram(GpuWatchdogThreadEvent::kGpuWatchdogEnd); |
| #if BUILDFLAG(IS_WIN) |
| if (watched_thread_handle_) |
| CloseHandle(watched_thread_handle_); |
| #endif |
| } |
| |
| // static |
| std::unique_ptr<GpuWatchdogThread> GpuWatchdogThread::Create( |
| bool start_backgrounded, |
| base::TimeDelta timeout, |
| int restart_factor, |
| bool is_test_mode, |
| const std::string& thread_name) { |
| auto watchdog_thread = base::WrapUnique(new GpuWatchdogThread( |
| timeout, restart_factor, is_test_mode, thread_name)); |
| watchdog_thread->Start(); |
| if (start_backgrounded) |
| watchdog_thread->OnBackgrounded(); |
| return watchdog_thread; |
| } |
| |
| // static |
| std::unique_ptr<GpuWatchdogThread> GpuWatchdogThread::Create( |
| bool start_backgrounded, |
| bool software_rendering, |
| const std::string& thread_name) { |
| return Create(start_backgrounded, GetGpuWatchdogTimeout(software_rendering), |
| kRestartFactor, /*test_mode=*/false, thread_name); |
| } |
| |
| // static |
| std::unique_ptr<GpuWatchdogThread> GpuWatchdogThread::Create( |
| bool start_backgrounded, |
| const GpuWatchdogThread* existing_watchdog, |
| const std::string& thread_name) { |
| DCHECK(existing_watchdog); |
| return Create(start_backgrounded, existing_watchdog->watchdog_timeout_, |
| existing_watchdog->watchdog_restart_factor_, |
| /*test_mode=*/false, thread_name); |
| } |
| |
| // Android Chrome goes to the background. Called from the gpu io thread. |
| void GpuWatchdogThread::OnBackgrounded() { |
| // Report progress first in case the Watchdog timeout task in the watchdog |
| // thread is not invalidated soon enough. |
| InProgress(); |
| |
| task_runner()->PostTask( |
| FROM_HERE, |
| base::BindOnce(&GpuWatchdogThread::StopWatchdogTimeoutTask, |
| base::Unretained(this), kAndroidBackgroundForeground)); |
| } |
| |
| // Android Chrome goes to the foreground. Called from the gpu io thread. |
| void GpuWatchdogThread::OnForegrounded() { |
| task_runner()->PostTask( |
| FROM_HERE, |
| base::BindOnce(&GpuWatchdogThread::RestartWatchdogTimeoutTask, |
| base::Unretained(this), kAndroidBackgroundForeground)); |
| } |
| |
| // Called from the gpu thread when gpu init has completed. |
| void GpuWatchdogThread::OnInitComplete() { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(watched_thread_sequence_checker_); |
| |
| task_runner()->PostTask( |
| FROM_HERE, base::BindOnce(&GpuWatchdogThread::UpdateInitializationFlag, |
| base::Unretained(this))); |
| Disarm(); |
| |
| // The PowerMonitorObserver needs to be register on the watchdog thread so the |
| // notifications are delivered on that thread. |
| task_runner()->PostTask(FROM_HERE, |
| base::BindOnce(&GpuWatchdogThread::AddPowerObserver, |
| base::Unretained(this))); |
| } |
| |
| // Called from the gpu thread in viz::GpuServiceImpl::~GpuServiceImpl(). |
| // After this, no Disarm() will be called before the watchdog thread is |
| // destroyed. If this destruction takes too long, the watchdog timeout |
| // will be triggered. |
| void GpuWatchdogThread::OnGpuProcessTearDown() { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(watched_thread_sequence_checker_); |
| |
| in_gpu_process_teardown_ = true; |
| if (!IsArmed()) |
| Arm(); |
| } |
| |
| // Called from the watched gpu thread. |
| void GpuWatchdogThread::PauseWatchdog() { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(watched_thread_sequence_checker_); |
| // Report progress first in case the Watchdog timeout task in the watchdog |
| // thread is not invalidated soon enough. |
| InProgress(); |
| |
| task_runner()->PostTask( |
| FROM_HERE, base::BindOnce(&GpuWatchdogThread::StopWatchdogTimeoutTask, |
| base::Unretained(this), kGeneralGpuFlow)); |
| } |
| |
| // Called from the watched gpu thread. |
| void GpuWatchdogThread::ResumeWatchdog() { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(watched_thread_sequence_checker_); |
| |
| task_runner()->PostTask( |
| FROM_HERE, base::BindOnce(&GpuWatchdogThread::RestartWatchdogTimeoutTask, |
| base::Unretained(this), kGeneralGpuFlow)); |
| } |
| |
| // Running on the watchdog thread. |
| // On Linux, Init() will be called twice for Sandbox Initialization. The |
| // watchdog is stopped and then restarted in StartSandboxLinux(). Everything |
| // should be the same and continue after the second init(). |
| void GpuWatchdogThread::Init() { |
| // Get and Invalidate weak_ptr should be done on the watchdog thread only. |
| weak_ptr_ = weak_factory_.GetWeakPtr(); |
| task_runner()->PostDelayedTask( |
| FROM_HERE, |
| base::BindOnce(&GpuWatchdogThread::OnWatchdogTimeout, weak_ptr_), |
| watchdog_timeout_); |
| |
| last_arm_disarm_counter_ = ReadArmDisarmCounter(); |
| watchdog_start_timeticks_ = base::TimeTicks::Now(); |
| last_on_watchdog_timeout_timeticks_ = watchdog_start_timeticks_; |
| next_on_watchdog_timeout_time_ = base::Time::Now() + watchdog_timeout_; |
| in_gpu_initialization_ = true; |
| |
| #if BUILDFLAG(IS_WIN) |
| if (watched_thread_handle_) { |
| if (base::ThreadTicks::IsSupported()) |
| base::ThreadTicks::WaitUntilInitialized(); |
| last_on_watchdog_timeout_thread_ticks_ = GetWatchedThreadTime(); |
| remaining_watched_thread_ticks_ = watchdog_timeout_; |
| } |
| #endif |
| } |
| |
| // Running on the watchdog thread. |
| void GpuWatchdogThread::CleanUp() { |
| DCHECK(task_runner()->RunsTasksInCurrentSequence()); |
| weak_factory_.InvalidateWeakPtrs(); |
| } |
| |
| void GpuWatchdogThread::ReportProgress() { |
| InProgress(); |
| } |
| |
| void GpuWatchdogThread::WillProcessTask(const base::PendingTask& pending_task, |
| bool was_blocked_or_low_priority) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(watched_thread_sequence_checker_); |
| |
| // The watchdog is armed at the beginning of the gpu process teardown. |
| // Do not call Arm() during teardown. |
| if (in_gpu_process_teardown_) |
| DCHECK(IsArmed()); |
| else |
| Arm(); |
| } |
| |
| void GpuWatchdogThread::DidProcessTask(const base::PendingTask& pending_task) { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(watched_thread_sequence_checker_); |
| // Keep the watchdog armed during tear down. |
| if (in_gpu_process_teardown_) |
| InProgress(); |
| else |
| Disarm(); |
| } |
| |
| // Power Suspends. Running on the watchdog thread. |
| void GpuWatchdogThread::OnSuspend() { |
| DCHECK(task_runner()->RunsTasksInCurrentSequence()); |
| InProgress(); |
| StopWatchdogTimeoutTask(kPowerSuspendResume); |
| } |
| |
| // Power Resumes. Running on the watchdog thread. |
| void GpuWatchdogThread::OnResume() { |
| DCHECK(task_runner()->RunsTasksInCurrentSequence()); |
| RestartWatchdogTimeoutTask(kPowerSuspendResume); |
| } |
| |
| // Running on the watchdog thread. |
| // Call AddPowerSuspendObserver on the watchdog thread so that OnSuspend() and |
| // OnResume() will be called on this thread. |
| void GpuWatchdogThread::AddPowerObserver() { |
| DCHECK(task_runner()->RunsTasksInCurrentSequence()); |
| |
| // Adding the Observer to the power monitor is safe even if power monitor is |
| // not yet initialized. |
| bool is_system_suspended = |
| base::PowerMonitor::GetInstance() |
| ->AddPowerSuspendObserverAndReturnSuspendedState(this); |
| if (is_system_suspended) |
| StopWatchdogTimeoutTask(kPowerSuspendResume); |
| } |
| |
| // Running on the watchdog thread. |
| void GpuWatchdogThread::RestartWatchdogTimeoutTask( |
| PauseResumeSource source_of_request) { |
| DCHECK(task_runner()->RunsTasksInCurrentSequence()); |
| base::TimeDelta timeout; |
| |
| switch (source_of_request) { |
| case kAndroidBackgroundForeground: |
| if (!is_backgrounded_) |
| return; |
| is_backgrounded_ = false; |
| timeout = watchdog_timeout_ * watchdog_restart_factor_; |
| foregrounded_timeticks_ = base::TimeTicks::Now(); |
| foregrounded_event_ = true; |
| num_of_timeout_after_foregrounded_ = 0; |
| break; |
| case kPowerSuspendResume: |
| if (!in_power_suspension_) |
| return; |
| in_power_suspension_ = false; |
| timeout = watchdog_timeout_ * watchdog_restart_factor_; |
| power_resume_timeticks_ = base::TimeTicks::Now(); |
| power_resumed_event_ = true; |
| num_of_timeout_after_power_resume_ = 0; |
| break; |
| case kGeneralGpuFlow: |
| if (!is_paused_) |
| return; |
| is_paused_ = false; |
| timeout = watchdog_timeout_; |
| watchdog_resume_timeticks_ = base::TimeTicks::Now(); |
| break; |
| } |
| |
| if (!is_backgrounded_ && !in_power_suspension_ && !is_paused_) { |
| weak_ptr_ = weak_factory_.GetWeakPtr(); |
| task_runner()->PostDelayedTask( |
| FROM_HERE, |
| base::BindOnce(&GpuWatchdogThread::OnWatchdogTimeout, weak_ptr_), |
| timeout); |
| last_on_watchdog_timeout_timeticks_ = base::TimeTicks::Now(); |
| next_on_watchdog_timeout_time_ = base::Time::Now() + timeout; |
| last_arm_disarm_counter_ = ReadArmDisarmCounter(); |
| #if BUILDFLAG(IS_WIN) |
| if (watched_thread_handle_) { |
| last_on_watchdog_timeout_thread_ticks_ = GetWatchedThreadTime(); |
| remaining_watched_thread_ticks_ = timeout; |
| } |
| #endif |
| } |
| } |
| |
| void GpuWatchdogThread::StopWatchdogTimeoutTask( |
| PauseResumeSource source_of_request) { |
| DCHECK(task_runner()->RunsTasksInCurrentSequence()); |
| |
| switch (source_of_request) { |
| case kAndroidBackgroundForeground: |
| if (is_backgrounded_) |
| return; |
| is_backgrounded_ = true; |
| backgrounded_timeticks_ = base::TimeTicks::Now(); |
| foregrounded_event_ = false; |
| break; |
| case kPowerSuspendResume: |
| if (in_power_suspension_) |
| return; |
| in_power_suspension_ = true; |
| power_suspend_timeticks_ = base::TimeTicks::Now(); |
| power_resumed_event_ = false; |
| break; |
| case kGeneralGpuFlow: |
| if (is_paused_) |
| return; |
| is_paused_ = true; |
| watchdog_pause_timeticks_ = base::TimeTicks::Now(); |
| break; |
| } |
| |
| // Revoke any pending watchdog timeout task |
| weak_factory_.InvalidateWeakPtrs(); |
| } |
| |
| // On the watchdog thread only. |
| void GpuWatchdogThread::UpdateInitializationFlag() { |
| DCHECK(task_runner()->RunsTasksInCurrentSequence()); |
| in_gpu_initialization_ = false; |
| } |
| |
| // Note on the atomic operations on `arm_disarm_counter_`: |
| // We use `std::memory_order_relaxed` for the atomic operations. This is safe |
| // because for the increments we only care about atomicity - this is similar to |
| // the usual atomic ref counting patterns. And for reads we only care about |
| // consistency since we only use it for detecting hangs - it's not critical if |
| // there's a race between arming/disarming and reading. |
| // |
| // Arm() and Disarm() are called from the watched gpu thread only. |
| // The watchdog is armed only in these three functions - |
| // GpuWatchdogThread(), WillProcessTask(), and OnGpuProcessTearDown() |
| void GpuWatchdogThread::Arm() { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(watched_thread_sequence_checker_); |
| |
| arm_disarm_counter_.fetch_add(1, std::memory_order_relaxed); |
| |
| // Arm/Disarm are always called in sequence. Now it's an odd number. |
| DCHECK(IsArmed()); |
| } |
| |
| void GpuWatchdogThread::Disarm() { |
| DCHECK_CALLED_ON_VALID_SEQUENCE(watched_thread_sequence_checker_); |
| |
| arm_disarm_counter_.fetch_add(1, std::memory_order_relaxed); |
| |
| // Arm/Disarm are always called in sequence. Now it's an even number. |
| DCHECK(!IsArmed()); |
| } |
| |
| // It's ok to call this function on any thread since it doesn't change the |
| // IsArmed() state by itself. |
| void GpuWatchdogThread::InProgress() { |
| // Increment by 2. This is equivalent to Disarm() + Arm(). |
| // If Watchdog is already disarmed, it stays in the same disarmed status. |
| arm_disarm_counter_.fetch_add(2, std::memory_order_relaxed); |
| } |
| |
| // The watchdog is considered armed if the `arm_disarm_counter_` is odd. |
| bool GpuWatchdogThread::IsArmed() { |
| return arm_disarm_counter_.load(std::memory_order_relaxed) & 1; |
| } |
| |
| // This is used for reading the `arm_disarm_counter_` value to be compared with |
| // the `last_arm_disarm_counter_` value. |
| int GpuWatchdogThread::ReadArmDisarmCounter() { |
| return arm_disarm_counter_.load(std::memory_order_relaxed); |
| } |
| |
| // Running on the watchdog thread. |
| void GpuWatchdogThread::OnWatchdogTimeout() { |
| DCHECK(task_runner()->RunsTasksInCurrentSequence()); |
| DCHECK(!is_backgrounded_); |
| DCHECK(!in_power_suspension_); |
| DCHECK(!is_paused_); |
| |
| // If this metric is added too early (eg. watchdog creation time), it cannot |
| // be persistent. The histogram data will be lost after crash or browser exit. |
| // Delay the recording of kGpuWatchdogStart until the firs |
| // OnWatchdogTimeout() to ensure this metric is created in the persistent |
| // memory. |
| if (!is_watchdog_start_histogram_recorded_) { |
| is_watchdog_start_histogram_recorded_ = true; |
| GpuWatchdogThreadEventHistogram(GpuWatchdogThreadEvent::kGpuWatchdogStart); |
| } |
| |
| GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kTimeout); |
| if (power_resumed_event_) |
| num_of_timeout_after_power_resume_++; |
| if (foregrounded_event_) |
| num_of_timeout_after_foregrounded_++; |
| |
| #if BUILDFLAG(IS_LINUX) && !BUILDFLAG(IS_CASTOS) |
| UpdateActiveTTY(); |
| #endif |
| |
| // Collect all needed info for gpu hang detection. |
| int arm_disarm_counter = ReadArmDisarmCounter(); |
| bool disarmed = arm_disarm_counter % 2 == 0; // even number |
| bool gpu_makes_progress = arm_disarm_counter != last_arm_disarm_counter_; |
| bool no_gpu_hang = disarmed || gpu_makes_progress || SlowWatchdogThread(); |
| |
| bool watched_thread_needs_more_time = |
| WatchedThreadNeedsMoreThreadTime(no_gpu_hang); |
| no_gpu_hang = no_gpu_hang || watched_thread_needs_more_time || |
| ContinueOnNonHostX11ServerTty(); |
| |
| // No gpu hang. Continue with another OnWatchdogTimeout task. |
| if (no_gpu_hang) { |
| ContinueWithNextWatchdogTimeoutTask(); |
| return; |
| } |
| |
| // A GPU hang is detected. |
| TRACE_EVENT1("gpu,startup", "OnWatchdogTimeout", "timeoutMs", |
| watchdog_timeout_.InMilliseconds()); |
| |
| // If the watched thread makes a progress after crash dump, the GPU process |
| // will not be killed and every thing continues after this function. |
| // Otherwise, this is the end of the GPU process. |
| DeliberatelyTerminateToRecoverFromHang(); |
| } |
| |
| void GpuWatchdogThread::ContinueWithNextWatchdogTimeoutTask() { |
| last_on_watchdog_timeout_timeticks_ = base::TimeTicks::Now(); |
| next_on_watchdog_timeout_time_ = base::Time::Now() + watchdog_timeout_; |
| last_arm_disarm_counter_ = ReadArmDisarmCounter(); |
| |
| task_runner()->PostDelayedTask( |
| FROM_HERE, |
| base::BindOnce(&GpuWatchdogThread::OnWatchdogTimeout, weak_ptr_), |
| watchdog_timeout_); |
| } |
| |
| bool GpuWatchdogThread::SlowWatchdogThread() { |
| // If it takes 15 more seconds than the expected time between two |
| // OnWatchdogTimeout() calls, the system is considered slow and it's not a GPU |
| // hang. |
| bool slow_watchdog_thread = |
| (base::Time::Now() - next_on_watchdog_timeout_time_) >= |
| kUnreasonableTimeoutDelay; |
| |
| // Record this case only when a GPU hang is detected and the thread is slow. |
| if (slow_watchdog_thread) |
| GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kSlowWatchdogThread); |
| |
| return slow_watchdog_thread; |
| } |
| |
| bool GpuWatchdogThread::WatchedThreadNeedsMoreThreadTime( |
| bool no_gpu_hang_detected) { |
| #if BUILDFLAG(IS_WIN) |
| if (!watched_thread_handle_) |
| return false; |
| |
| WatchedThreadNeedsMoreThreadTimeHistogram( |
| no_gpu_hang_detected, |
| /*start_of_more_thread_time*/ false); |
| |
| if (!no_gpu_hang_detected && count_of_more_gpu_thread_time_allowed_ >= |
| kMaxCountOfMoreGpuThreadTimeAllowed) { |
| less_than_full_thread_time_after_capped_ = true; |
| } else { |
| less_than_full_thread_time_after_capped_ = false; |
| } |
| |
| // Calculate how many thread ticks the watched thread spent doing the work. |
| base::ThreadTicks now = GetWatchedThreadTime(); |
| base::TimeDelta thread_time_elapsed = |
| now - last_on_watchdog_timeout_thread_ticks_; |
| last_on_watchdog_timeout_thread_ticks_ = now; |
| remaining_watched_thread_ticks_ -= thread_time_elapsed; |
| |
| if (no_gpu_hang_detected || |
| count_of_more_gpu_thread_time_allowed_ >= |
| kMaxCountOfMoreGpuThreadTimeAllowed || |
| thread_time_elapsed.is_negative() /* bogus data */ || |
| remaining_watched_thread_ticks_ <= base::TimeDelta()) { |
| // Reset the remaining thread ticks. |
| remaining_watched_thread_ticks_ = watchdog_timeout_; |
| count_of_more_gpu_thread_time_allowed_ = 0; |
| |
| return false; |
| } else { |
| // This is the start of allowing more thread time. |
| if (count_of_more_gpu_thread_time_allowed_ == 0) { |
| WatchedThreadNeedsMoreThreadTimeHistogram( |
| no_gpu_hang_detected, /*start_of_more_thread_time*/ true); |
| } |
| count_of_more_gpu_thread_time_allowed_++; |
| |
| return true; |
| } |
| #else |
| return false; |
| #endif |
| } |
| |
| #if BUILDFLAG(IS_WIN) |
| base::ThreadTicks GpuWatchdogThread::GetWatchedThreadTime() { |
| DCHECK(watched_thread_handle_); |
| |
| if (base::ThreadTicks::IsSupported()) { |
| // Note: GetForThread() might return bogus results if running on different |
| // CPUs between two calls. |
| return base::ThreadTicks::GetForThread( |
| base::PlatformThreadHandle(watched_thread_handle_)); |
| } else { |
| FILETIME creation_time; |
| FILETIME exit_time; |
| FILETIME kernel_time; |
| FILETIME user_time; |
| BOOL result = GetThreadTimes(watched_thread_handle_, &creation_time, |
| &exit_time, &kernel_time, &user_time); |
| if (!result) |
| return base::ThreadTicks(); |
| |
| // Need to bit_cast to fix alignment, then divide by 10 to convert |
| // 100-nanoseconds to microseconds. |
| int64_t user_time_us = base::bit_cast<int64_t, FILETIME>(user_time) / 10; |
| int64_t kernel_time_us = |
| base::bit_cast<int64_t, FILETIME>(kernel_time) / 10; |
| |
| return base::ThreadTicks() + |
| base::Microseconds(user_time_us + kernel_time_us); |
| } |
| } |
| #endif |
| |
| void GpuWatchdogThread::DeliberatelyTerminateToRecoverFromHang() { |
| DCHECK(task_runner()->RunsTasksInCurrentSequence()); |
| |
| // If this is for gpu testing, do not terminate the gpu process. |
| // Just signal and quit. |
| if (is_test_mode_) { |
| test_result_timeout_and_gpu_hang_.Set(); |
| return; |
| } |
| |
| #if BUILDFLAG(IS_WIN) |
| if (IsDebuggerPresent()) |
| return; |
| #endif |
| |
| // Store variables so they're available in crash dumps to help determine the |
| // cause of any hang. |
| base::TimeTicks function_begin_timeticks = base::TimeTicks::Now(); |
| base::debug::Alias(&in_gpu_initialization_); |
| base::debug::Alias(&num_of_timeout_after_power_resume_); |
| base::debug::Alias(&num_of_timeout_after_foregrounded_); |
| base::debug::Alias(&function_begin_timeticks); |
| base::debug::Alias(&watchdog_start_timeticks_); |
| base::debug::Alias(&power_suspend_timeticks_); |
| base::debug::Alias(&power_resume_timeticks_); |
| base::debug::Alias(&backgrounded_timeticks_); |
| base::debug::Alias(&foregrounded_timeticks_); |
| base::debug::Alias(&watchdog_pause_timeticks_); |
| base::debug::Alias(&watchdog_resume_timeticks_); |
| base::debug::Alias(&in_power_suspension_); |
| base::debug::Alias(&in_gpu_process_teardown_); |
| base::debug::Alias(&is_backgrounded_); |
| base::debug::Alias(&last_on_watchdog_timeout_timeticks_); |
| base::TimeDelta timeticks_elapses = |
| function_begin_timeticks - last_on_watchdog_timeout_timeticks_; |
| base::debug::Alias(&timeticks_elapses); |
| #if BUILDFLAG(IS_WIN) |
| base::debug::Alias(&remaining_watched_thread_ticks_); |
| base::debug::Alias(&less_than_full_thread_time_after_capped_); |
| #endif |
| |
| // The watchdog currently doesn't watch multiple threads. If multiple threads |
| // are supported, use '|' to separate thread ids in "list_of_hung_threads". |
| crash_keys::list_of_hung_threads.Set(watched_thread_id_str_); |
| |
| crash_keys::gpu_watchdog_crashed_in_gpu_init.Set( |
| in_gpu_initialization_ ? "1" : "0"); |
| |
| crash_keys::gpu_watchdog_kill_after_power_resume.Set( |
| WithinOneMinFromPowerResumed() ? "1" : "0"); |
| |
| const int num_of_processors = base::SysInfo::NumberOfProcessors(); |
| crash_keys::num_of_processors.Set(base::NumberToString(num_of_processors)); |
| |
| crash_keys::gpu_thread.Set(watched_thread_name_str_uma_); |
| |
| // Check the arm_disarm_counter value one more time. |
| auto last_arm_disarm_counter = ReadArmDisarmCounter(); |
| base::debug::Alias(&last_arm_disarm_counter); |
| |
| // Create a crash dump first |
| base::debug::DumpWithoutCrashing(); |
| |
| // A kKill event is triggered and DumpWithoutCrashing() is called in the |
| // watchdog timeout routine OnWatchdogTimeout(). If it turns out |
| // gpu does not hang after the crash dump, another histogram |
| // kNoKillForGpuProgressDuringCrashDumping will be recorded later. |
| GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kKill); |
| |
| // Final check after the crash dump. If the watched thread makes a progress |
| // (disarmed) during generating crash dump, no need to crash the GPU process. |
| bool gpu_hang = IsArmed(); |
| if (gpu_hang) { |
| // Still armed without any progress. The GPU process is now killed. |
| GpuWatchdogThreadEventHistogram(GpuWatchdogThreadEvent::kGpuWatchdogKill); |
| #if BUILDFLAG(IS_WIN) |
| if (less_than_full_thread_time_after_capped_) |
| GpuWatchdogTimeoutHistogram( |
| GpuWatchdogTimeoutEvent::kKillOnLessThreadTime); |
| #endif |
| |
| // Use RESULT_CODE_HUNG so this crash is separated from other |
| // EXCEPTION_ACCESS_VIOLATION buckets for UMA analysis. |
| // TerminateCurrentProcessImmediately itself will not generate a dump. |
| base::Process::TerminateCurrentProcessImmediately(RESULT_CODE_HUNG); |
| // The end of the GPU process. |
| } else { |
| crash_keys::list_of_hung_threads.Clear(); |
| crash_keys::gpu_watchdog_crashed_in_gpu_init.Clear(); |
| crash_keys::gpu_watchdog_kill_after_power_resume.Clear(); |
| crash_keys::num_of_processors.Clear(); |
| crash_keys::gpu_thread.Clear(); |
| |
| GpuWatchdogTimeoutHistogram( |
| GpuWatchdogTimeoutEvent::kNoKillForGpuProgressDuringCrashDumping); |
| #if BUILDFLAG(IS_WIN) |
| // Reset the counters for WatchedThreadNeedsMoreThreadTime(). |
| remaining_watched_thread_ticks_ = watchdog_timeout_; |
| count_of_more_gpu_thread_time_allowed_ = 0; |
| #endif |
| |
| ContinueWithNextWatchdogTimeoutTask(); |
| } |
| } |
| |
| void GpuWatchdogThread::GpuWatchdogThreadEventHistogram( |
| GpuWatchdogThreadEvent thread_event) { |
| base::UmaHistogramEnumeration("GPU.WatchdogThread.Event", thread_event); |
| base::UmaHistogramEnumeration( |
| "GPU.WatchdogThread.Event" + watched_thread_name_str_uma_, thread_event); |
| } |
| |
| void GpuWatchdogThread::GpuWatchdogTimeoutHistogram( |
| GpuWatchdogTimeoutEvent timeout_event) { |
| base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout", timeout_event); |
| base::UmaHistogramEnumeration( |
| "GPU.WatchdogThread.Timeout" + watched_thread_name_str_uma_, |
| timeout_event); |
| |
| bool recorded = false; |
| if (in_gpu_initialization_) { |
| base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout.Init", |
| timeout_event); |
| base::UmaHistogramEnumeration( |
| "GPU.WatchdogThread.Timeout.Init" + watched_thread_name_str_uma_, |
| timeout_event); |
| recorded = true; |
| } |
| |
| if (WithinOneMinFromPowerResumed()) { |
| base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout.PowerResume", |
| timeout_event); |
| base::UmaHistogramEnumeration( |
| "GPU.WatchdogThread.Timeout.PowerResume" + watched_thread_name_str_uma_, |
| timeout_event); |
| recorded = true; |
| } |
| |
| if (WithinOneMinFromForegrounded()) { |
| base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout.Foregrounded", |
| timeout_event); |
| base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout.Foregrounded" + |
| watched_thread_name_str_uma_, |
| timeout_event); |
| recorded = true; |
| } |
| |
| if (!recorded) { |
| base::UmaHistogramEnumeration("GPU.WatchdogThread.Timeout.Normal", |
| timeout_event); |
| base::UmaHistogramEnumeration( |
| "GPU.WatchdogThread.Timeout.Normal" + watched_thread_name_str_uma_, |
| timeout_event); |
| } |
| } |
| |
| #if BUILDFLAG(IS_WIN) |
| void GpuWatchdogThread::WatchedThreadNeedsMoreThreadTimeHistogram( |
| bool no_gpu_hang_detected, |
| bool start_of_more_thread_time) { |
| if (start_of_more_thread_time) { |
| // This is the start of allowing more thread time. Only record it once for |
| // all following timeouts on the same detected gpu hang, so we know this |
| // is equivalent one crash in our crash reports. |
| GpuWatchdogTimeoutHistogram(GpuWatchdogTimeoutEvent::kMoreThreadTime); |
| } else { |
| if (count_of_more_gpu_thread_time_allowed_ > 0) { |
| if (no_gpu_hang_detected) { |
| // If count_of_more_gpu_thread_time_allowed_ > 0, we know extra time was |
| // extended in the previous OnWatchdogTimeout(). Now we find gpu makes |
| // progress. Record this case. |
| GpuWatchdogTimeoutHistogram( |
| GpuWatchdogTimeoutEvent::kProgressAfterMoreThreadTime); |
| } else if (count_of_more_gpu_thread_time_allowed_ >= |
| kMaxCountOfMoreGpuThreadTimeAllowed) { |
| GpuWatchdogTimeoutHistogram( |
| GpuWatchdogTimeoutEvent::kLessThanFullThreadTimeAfterCapped); |
| } |
| } |
| } |
| } |
| #endif |
| |
| bool GpuWatchdogThread::WithinOneMinFromPowerResumed() { |
| size_t count = base::ClampFloor<size_t>(base::Minutes(1) / watchdog_timeout_); |
| return power_resumed_event_ && num_of_timeout_after_power_resume_ <= count; |
| } |
| |
| bool GpuWatchdogThread::WithinOneMinFromForegrounded() { |
| size_t count = base::ClampFloor<size_t>(base::Minutes(1) / watchdog_timeout_); |
| return foregrounded_event_ && num_of_timeout_after_foregrounded_ <= count; |
| } |
| |
| #if BUILDFLAG(IS_LINUX) && !BUILDFLAG(IS_CASTOS) |
| void GpuWatchdogThread::UpdateActiveTTY() { |
| last_active_tty_ = active_tty_; |
| |
| active_tty_ = -1; |
| char tty_string[8] = {}; |
| if (tty_file_ && !fseek(tty_file_.get(), 0, SEEK_SET) && |
| fread(tty_string, 1, 7, tty_file_.get())) { |
| int tty_number; |
| if (sscanf(tty_string, "tty%d\n", &tty_number) == 1) { |
| active_tty_ = tty_number; |
| } |
| } |
| } |
| #endif |
| |
| bool GpuWatchdogThread::ContinueOnNonHostX11ServerTty() { |
| #if BUILDFLAG(IS_LINUX) && !BUILDFLAG(IS_CASTOS) |
| if (host_tty_ == -1 || active_tty_ == -1) |
| return false; |
| |
| // Don't crash if we're not on the TTY of our host X11 server. |
| if (active_tty_ != host_tty_) { |
| // Only record for the time there is a change on TTY |
| if (last_active_tty_ == active_tty_) { |
| GpuWatchdogTimeoutHistogram( |
| GpuWatchdogTimeoutEvent::kContinueOnNonHostServerTty); |
| } |
| return true; |
| } |
| #endif |
| return false; |
| } |
| |
| // For gpu testing only. Return whether a GPU hang was detected or not. |
| bool GpuWatchdogThread::IsGpuHangDetectedForTesting() { |
| DCHECK(is_test_mode_); |
| return test_result_timeout_and_gpu_hang_.IsSet(); |
| } |
| |
| } // namespace gpu |