| // Copyright 2016 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "chrome/chrome_watcher/kasko_util.h" |
| |
| #include <sddl.h> |
| |
| #include <memory> |
| #include <set> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include "base/base_paths.h" |
| #include "base/bind.h" |
| #include "base/callback_helpers.h" |
| #include "base/environment.h" |
| #include "base/files/file_path.h" |
| #include "base/format_macros.h" |
| #include "base/macros.h" |
| #include "base/path_service.h" |
| #include "base/strings/string_number_conversions.h" |
| #include "base/strings/string_util.h" |
| #include "base/strings/stringprintf.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "base/win/wait_chain.h" |
| #include "base/win/win_util.h" |
| |
| #include "chrome/chrome_watcher/chrome_watcher_main_api.h" |
| #include "chrome/chrome_watcher/system_load_estimator.h" |
| #include "components/crash/content/app/crashpad.h" |
| #include "components/memory_pressure/direct_memory_pressure_calculator_win.h" |
| #include "components/memory_pressure/memory_pressure_calculator.h" |
| #include "syzygy/kasko/api/reporter.h" |
| |
| namespace { |
| |
| using MemoryPressureLevel = |
| memory_pressure::MemoryPressureCalculator::MemoryPressureLevel; |
| |
| // Labels a crash report to the server as a hang report. |
| const wchar_t kHangReportCrashKey[] = L"hang-report"; |
| |
| // Helper function for determining the crash server to use. Defaults to the |
| // standard crash server, but can be overridden via an environment variable. |
| // Enables easy integration testing. |
| base::string16 GetKaskoCrashServerUrl() { |
| static const char kKaskoCrashServerUrl[] = "KASKO_CRASH_SERVER_URL"; |
| static const wchar_t kDefaultKaskoCrashServerUrl[] = |
| L"https://clients2.google.com/cr/report"; |
| |
| std::unique_ptr<base::Environment> env(base::Environment::Create()); |
| std::string env_var; |
| if (env->GetVar(kKaskoCrashServerUrl, &env_var)) { |
| return base::UTF8ToUTF16(env_var); |
| } |
| return kDefaultKaskoCrashServerUrl; |
| } |
| |
| // Helper function for determining the crash reports directory to use. Defaults |
| // to the browser data directory, but can be overridden via an environment |
| // variable. Enables easy integration testing. |
| base::FilePath GetKaskoCrashReportsBaseDir( |
| const base::char16* browser_data_directory) { |
| static const char kKaskoCrashReportBaseDir[] = "KASKO_CRASH_REPORTS_BASE_DIR"; |
| std::unique_ptr<base::Environment> env(base::Environment::Create()); |
| std::string env_var; |
| if (env->GetVar(kKaskoCrashReportBaseDir, &env_var)) { |
| return base::FilePath(base::UTF8ToUTF16(env_var)); |
| } |
| return base::FilePath(browser_data_directory); |
| } |
| |
| struct EventSourceDeregisterer { |
| using pointer = HANDLE; |
| void operator()(HANDLE event_source_handle) const { |
| if (!::DeregisterEventSource(event_source_handle)) |
| DPLOG(ERROR) << "DeregisterEventSource"; |
| } |
| }; |
| using ScopedEventSourceHandle = |
| std::unique_ptr<HANDLE, EventSourceDeregisterer>; |
| |
| struct SidDeleter { |
| using pointer = PSID; |
| void operator()(PSID sid) const { |
| if (::LocalFree(sid) != nullptr) |
| DPLOG(ERROR) << "LocalFree"; |
| } |
| }; |
| using ScopedSid = std::unique_ptr<PSID, SidDeleter>; |
| |
| void OnCrashReportUpload(void* context, |
| const base::char16* report_id, |
| const base::char16* minidump_path, |
| const base::char16* const* keys, |
| const base::char16* const* values) { |
| // Open the event source. |
| ScopedEventSourceHandle event_source_handle( |
| ::RegisterEventSource(nullptr, L"Chrome")); |
| if (!event_source_handle) { |
| PLOG(ERROR) << "RegisterEventSource"; |
| return; |
| } |
| |
| // Get the user's SID for the log record. |
| base::string16 sid_string; |
| PSID sid = nullptr; |
| if (base::win::GetUserSidString(&sid_string) && !sid_string.empty()) { |
| if (!::ConvertStringSidToSid(sid_string.c_str(), &sid)) |
| DPLOG(ERROR) << "ConvertStringSidToSid"; |
| DCHECK(sid); |
| } |
| // Ensure cleanup on scope exit. |
| ScopedSid scoped_sid; |
| if (sid) |
| scoped_sid.reset(sid); |
| |
| // Generate the message. |
| // Note that the format of this message must match the consumer in |
| // chrome/browser/crash_upload_list_win.cc. |
| base::string16 message = |
| L"Crash uploaded. Id=" + base::string16(report_id) + L"."; |
| |
| // Matches Omaha. |
| const int kCrashUploadEventId = 2; |
| |
| // Report the event. |
| const base::char16* strings[] = {message.c_str()}; |
| if (!::ReportEvent(event_source_handle.get(), EVENTLOG_INFORMATION_TYPE, |
| 0, // category |
| kCrashUploadEventId, sid, |
| 1, // count |
| 0, strings, nullptr)) { |
| DPLOG(ERROR); |
| } |
| } |
| |
| void AddCrashKey(const wchar_t *key, const wchar_t *value, |
| std::vector<kasko::api::CrashKey> *crash_keys) { |
| DCHECK(key); |
| DCHECK(value); |
| DCHECK(crash_keys); |
| |
| crash_keys->resize(crash_keys->size() + 1); |
| kasko::api::CrashKey& crash_key = crash_keys->back(); |
| base::wcslcpy(crash_key.name, key, kasko::api::CrashKey::kNameMaxLength); |
| base::wcslcpy(crash_key.value, value, kasko::api::CrashKey::kValueMaxLength); |
| } |
| |
| // Get the |process| and the |thread_id| of the node inside the |wait_chain| |
| // that is of type ThreadType and belongs to a process that is valid for the |
| // capture of a crash dump. Returns true if such a node was found. |
| bool GetLastValidNodeInfo(const base::win::WaitChainNodeVector& wait_chain, |
| base::Process* process, |
| DWORD* thread_id) { |
| // The last thread in the wait chain is nominated as the hung thread. |
| base::win::WaitChainNodeVector::const_reverse_iterator it; |
| for (it = wait_chain.rbegin(); it != wait_chain.rend(); ++it) { |
| if (it->ObjectType != WctThreadType) |
| continue; |
| |
| auto current_process = base::Process::Open(it->ThreadObject.ProcessId); |
| if (EnsureTargetProcessValidForCapture(current_process)) { |
| *process = std::move(current_process); |
| *thread_id = it->ThreadObject.ThreadId; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| // Adds the entire wait chain to |crash_keys|. |
| // |
| // As an example (key : value): |
| // hung-process-wait-chain-00 : Thread 10242 in process 4554 with status Blocked |
| // hung-process-wait-chain-01 : Lock of type ThreadWait with status Owned |
| // hung-process-wait-chain-02 : Thread 77221 in process 4554 with status Blocked |
| // |
| void AddWaitChainToCrashKeys(const base::win::WaitChainNodeVector& wait_chain, |
| std::vector<kasko::api::CrashKey>* crash_keys) { |
| for (size_t i = 0; i < wait_chain.size(); i++) { |
| AddCrashKey( |
| base::StringPrintf(L"hung-process-wait-chain-%02" PRIuS, i).c_str(), |
| base::win::WaitChainNodeToString(wait_chain[i]).c_str(), crash_keys); |
| } |
| } |
| |
| base::FilePath GetExeFilePathForProcess(const base::Process& process) { |
| wchar_t exe_name[MAX_PATH]; |
| DWORD exe_name_len = arraysize(exe_name); |
| // Note: requesting the Win32 path format. |
| if (::QueryFullProcessImageName(process.Handle(), 0, exe_name, |
| &exe_name_len) == 0) { |
| DPLOG(ERROR) << "Failed to get executable name for process"; |
| return base::FilePath(); |
| } |
| |
| // QueryFullProcessImageName's documentation does not specify behavior when |
| // the buffer is too small, but we know that GetModuleFileNameEx succeeds and |
| // truncates the returned name in such a case. Given that paths of arbitrary |
| // length may exist, the conservative approach is to reject names when |
| // the returned length is that of the buffer. |
| if (exe_name_len > 0 && exe_name_len < arraysize(exe_name)) |
| return base::FilePath(exe_name); |
| |
| return base::FilePath(); |
| } |
| |
| // Adds the executable base name for each unique pid found in the |wait_chain| |
| // to the |crash_keys|. |
| void AddProcessExeNameToCrashKeys( |
| const base::win::WaitChainNodeVector& wait_chain, |
| std::vector<kasko::api::CrashKey>* crash_keys) { |
| std::set<DWORD> unique_pids; |
| for (size_t i = 0; i < wait_chain.size(); i += 2) |
| unique_pids.insert(wait_chain[i].ThreadObject.ProcessId); |
| |
| for (DWORD pid : unique_pids) { |
| // This is racy on the pid but for the purposes of this function, some error |
| // threshold can be tolerated. Hopefully the race doesn't happen often. |
| base::Process process( |
| base::Process::OpenWithAccess(pid, PROCESS_QUERY_LIMITED_INFORMATION)); |
| |
| base::string16 exe_file_path = L"N/A"; |
| if (process.IsValid()) |
| exe_file_path = GetExeFilePathForProcess(process).BaseName().value(); |
| |
| AddCrashKey( |
| base::StringPrintf(L"hung-process-wait-chain-pid-%u", pid).c_str(), |
| exe_file_path.c_str(), crash_keys); |
| } |
| } |
| |
| void AddSystemLoadInformation(std::vector<kasko::api::CrashKey>* crash_keys) { |
| DCHECK(crash_keys); |
| |
| // Add memory pressure level. |
| memory_pressure::DirectMemoryPressureCalculator memory_calculator; |
| const wchar_t* memory_pressure_level = L""; |
| switch (memory_calculator.CalculateCurrentPressureLevel()) { |
| case MemoryPressureLevel::MEMORY_PRESSURE_LEVEL_NONE: |
| memory_pressure_level = L"none-or-unknown"; |
| break; |
| case MemoryPressureLevel::MEMORY_PRESSURE_LEVEL_MODERATE: |
| memory_pressure_level = L"moderate"; |
| break; |
| case MemoryPressureLevel::MEMORY_PRESSURE_LEVEL_CRITICAL: |
| memory_pressure_level = L"critical"; |
| break; |
| } |
| AddCrashKey(L"memory-pressure", memory_pressure_level, crash_keys); |
| |
| // Add measures of cpu and disk load. |
| chrome_watcher::SystemLoadEstimator::Estimate load_estimate = {}; |
| if (!chrome_watcher::SystemLoadEstimator::Measure(&load_estimate)) |
| return; |
| |
| AddCrashKey(L"cpu-load-percent", |
| base::IntToString16(load_estimate.cpu_load_pct).c_str(), |
| crash_keys); |
| AddCrashKey(L"disk-idle-percent", |
| base::IntToString16(load_estimate.disk_idle_pct).c_str(), |
| crash_keys); |
| AddCrashKey(L"disk-avg-queue-len", |
| base::IntToString16(load_estimate.avg_disk_queue_len).c_str(), |
| crash_keys); |
| } |
| |
| } // namespace |
| |
| bool InitializeKaskoReporter(const base::string16& endpoint, |
| const base::char16* browser_data_directory) { |
| base::string16 crash_server = GetKaskoCrashServerUrl(); |
| base::FilePath crash_reports_base_dir = |
| GetKaskoCrashReportsBaseDir(browser_data_directory); |
| |
| return kasko::api::InitializeReporter( |
| endpoint.c_str(), |
| crash_server.c_str(), |
| crash_reports_base_dir.Append(L"Crash Reports").value().c_str(), |
| crash_reports_base_dir.Append(kPermanentlyFailedReportsSubdir) |
| .value() |
| .c_str(), |
| &OnCrashReportUpload, |
| nullptr); |
| } |
| |
| void ShutdownKaskoReporter() { |
| kasko::api::ShutdownReporter(); |
| } |
| |
| bool EnsureTargetProcessValidForCapture(const base::Process& process) { |
| // Ensure the target process's executable is inside the current Chrome |
| // directory. |
| base::FilePath chrome_dir; |
| if (!PathService::Get(base::DIR_EXE, &chrome_dir)) |
| return false; |
| |
| return chrome_dir.IsParent(GetExeFilePathForProcess(process)); |
| } |
| |
| void DumpHungProcess(DWORD main_thread_id, const base::string16& channel, |
| const base::char16* hang_type, |
| const base::Process& process) { |
| // Read the Crashpad module annotations for the process. |
| std::vector<kasko::api::CrashKey> annotations; |
| crash_reporter::ReadMainModuleAnnotationsForKasko(process, &annotations); |
| |
| // Label the report as a hang report. |
| AddCrashKey(kHangReportCrashKey, hang_type, &annotations); |
| |
| // Note: system load is measured as early as possible, as it is potentially |
| // more volatile than wait chain information. |
| // TODO(manzagop): consider continuous load observation, instead of punctual |
| // observation, which may fail to observe load. |
| AddSystemLoadInformation(&annotations); |
| |
| // Use the Wait Chain Traversal API to determine the hung thread. Defaults to |
| // UI thread on error. The wait chain may point to a different thread in a |
| // different process for the hung thread. |
| DWORD hung_thread_id = main_thread_id; |
| base::Process hung_process = process.Duplicate(); |
| |
| base::win::WaitChainNodeVector wait_chain; |
| bool is_deadlock = false; |
| base::string16 thread_chain_failure_reason; |
| DWORD thread_chain_last_error = ERROR_SUCCESS; |
| if (base::win::GetThreadWaitChain(main_thread_id, &wait_chain, &is_deadlock, |
| &thread_chain_failure_reason, |
| &thread_chain_last_error)) { |
| bool found_valid_node = |
| GetLastValidNodeInfo(wait_chain, &hung_process, &hung_thread_id); |
| DCHECK(found_valid_node); |
| |
| // Add some interesting data about the wait chain to the crash keys. |
| AddCrashKey(L"hung-process-is-deadlock", is_deadlock ? L"true" : L"false", |
| &annotations); |
| AddWaitChainToCrashKeys(wait_chain, &annotations); |
| AddProcessExeNameToCrashKeys(wait_chain, &annotations); |
| } else { |
| // The call to GetThreadWaitChain() failed. Include the reason inside the |
| // report using crash keys. |
| // TODO(pmonette): Remove this when UMA is added to wait_chain.cc. |
| AddCrashKey(L"hung-process-wait-chain-failure-reason", |
| thread_chain_failure_reason.c_str(), &annotations); |
| AddCrashKey(L"hung-process-wait-chain-last-error", |
| base::UintToString16(thread_chain_last_error).c_str(), |
| &annotations); |
| } |
| |
| std::vector<const base::char16*> key_buffers; |
| std::vector<const base::char16*> value_buffers; |
| for (const auto& crash_key : annotations) { |
| key_buffers.push_back(crash_key.name); |
| value_buffers.push_back(crash_key.value); |
| } |
| key_buffers.push_back(nullptr); |
| value_buffers.push_back(nullptr); |
| |
| // Synthesize an exception for the hung thread. Populate the record with the |
| // current context of the thread to get the stack trace bucketed on the crash |
| // backend. |
| CONTEXT thread_context = {}; |
| EXCEPTION_RECORD exception_record = {}; |
| exception_record.ExceptionCode = EXCEPTION_ARRAY_BOUNDS_EXCEEDED; |
| EXCEPTION_POINTERS exception_pointers = {&exception_record, &thread_context}; |
| |
| base::win::ScopedHandle hung_thread(::OpenThread( |
| THREAD_SUSPEND_RESUME | THREAD_GET_CONTEXT | THREAD_QUERY_INFORMATION, |
| FALSE, hung_thread_id)); |
| |
| bool have_context = false; |
| if (hung_thread.IsValid()) { |
| DWORD suspend_count = ::SuspendThread(hung_thread.Get()); |
| const DWORD kSuspendFailed = static_cast<DWORD>(-1); |
| if (suspend_count != kSuspendFailed) { |
| // Best effort capture of the context. |
| thread_context.ContextFlags = CONTEXT_FLOATING_POINT | CONTEXT_SEGMENTS | |
| CONTEXT_INTEGER | CONTEXT_CONTROL; |
| if (::GetThreadContext(hung_thread.Get(), &thread_context) == TRUE) |
| have_context = true; |
| |
| ::ResumeThread(hung_thread.Get()); |
| } |
| } |
| |
| // TODO(manzagop): consider making the dump-type channel-dependent. |
| if (have_context) { |
| kasko::api::SendReportForProcess( |
| hung_process.Handle(), hung_thread_id, &exception_pointers, |
| kasko::api::LARGER_DUMP_TYPE, key_buffers.data(), value_buffers.data()); |
| } else { |
| kasko::api::SendReportForProcess(hung_process.Handle(), 0, nullptr, |
| kasko::api::LARGER_DUMP_TYPE, |
| key_buffers.data(), value_buffers.data()); |
| } |
| } |