| // Copyright (c) 2014 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "components/browser_watcher/watcher_metrics_provider_win.h" |
| |
| #include <stddef.h> |
| |
| #include <limits> |
| #include <memory> |
| #include <set> |
| #include <vector> |
| |
| #include "base/bind.h" |
| #include "base/feature_list.h" |
| #include "base/metrics/histogram.h" |
| #include "base/metrics/histogram_base.h" |
| #include "base/metrics/histogram_macros.h" |
| #include "base/metrics/sparse_histogram.h" |
| #include "base/process/process.h" |
| #include "base/strings/string_number_conversions.h" |
| #include "base/strings/string_piece.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "base/win/registry.h" |
| #include "components/browser_watcher/features.h" |
| #include "components/browser_watcher/postmortem_report_collector.h" |
| #include "components/browser_watcher/stability_debugging_win.h" |
| #include "third_party/crashpad/crashpad/client/crash_report_database.h" |
| |
| namespace browser_watcher { |
| |
| namespace { |
| |
| // Process ID APIs on Windows talk in DWORDs, whereas for string formatting |
| // and parsing, this code uses int. In practice there are no process IDs with |
| // the high bit set on Windows, so there's no danger of overflow if this is |
| // done consistently. |
| static_assert(sizeof(DWORD) == sizeof(int), |
| "process ids are expected to be no larger than int"); |
| |
| // This function does soft matching on the PID recorded in the key only. |
| // Due to PID reuse, the possibility exists that the process that's now live |
| // with the given PID is not the same process the data was recorded for. |
| // This doesn't matter for the purpose, as eventually the data will be |
| // scavenged and reported. |
| bool IsDeadProcess(base::StringPiece16 key_or_value_name) { |
| // Truncate the input string to the first occurrence of '-', if one exists. |
| size_t num_end = key_or_value_name.find(L'-'); |
| if (num_end != base::StringPiece16::npos) |
| key_or_value_name = key_or_value_name.substr(0, num_end); |
| |
| // Convert to the numeric PID. |
| int pid = 0; |
| if (!base::StringToInt(key_or_value_name, &pid) || pid == 0) |
| return true; |
| |
| // This is a very inexpensive check for the common case of our own PID. |
| if (static_cast<base::ProcessId>(pid) == base::GetCurrentProcId()) |
| return false; |
| |
| // The process is not our own - see whether a process with this PID exists. |
| // This is more expensive than the above check, but should also be very rare, |
| // as this only happens more than once for a given PID if a user is running |
| // multiple Chrome instances concurrently. |
| base::Process process = |
| base::Process::Open(static_cast<base::ProcessId>(pid)); |
| if (process.IsValid()) { |
| // The fact that it was possible to open the process says it's live. |
| return false; |
| } |
| |
| return true; |
| } |
| |
| void RecordExitCodes(const base::string16& registry_path) { |
| base::win::RegKey regkey(HKEY_CURRENT_USER, |
| registry_path.c_str(), |
| KEY_QUERY_VALUE | KEY_SET_VALUE); |
| if (!regkey.Valid()) |
| return; |
| |
| size_t num = regkey.GetValueCount(); |
| if (num == 0) |
| return; |
| std::vector<base::string16> to_delete; |
| |
| // Record the exit codes in a sparse stability histogram, as the range of |
| // values used to report failures is large. |
| base::HistogramBase* exit_code_histogram = |
| base::SparseHistogram::FactoryGet( |
| WatcherMetricsProviderWin::kBrowserExitCodeHistogramName, |
| base::HistogramBase::kUmaStabilityHistogramFlag); |
| |
| for (size_t i = 0; i < num; ++i) { |
| base::string16 name; |
| if (regkey.GetValueNameAt(static_cast<int>(i), &name) == ERROR_SUCCESS) { |
| DWORD exit_code = 0; |
| if (regkey.ReadValueDW(name.c_str(), &exit_code) == ERROR_SUCCESS) { |
| // Do not report exit codes for processes that are still live, |
| // notably for our own process. |
| if (exit_code != STILL_ACTIVE || IsDeadProcess(name)) { |
| to_delete.push_back(name); |
| exit_code_histogram->Add(exit_code); |
| } |
| } |
| } |
| } |
| |
| // Delete the values reported above. |
| for (size_t i = 0; i < to_delete.size(); ++i) |
| regkey.DeleteValue(to_delete[i].c_str()); |
| } |
| |
| void DeleteAllValues(base::win::RegKey* key) { |
| DCHECK(key); |
| |
| while (key->GetValueCount() != 0) { |
| base::string16 value_name; |
| LONG res = key->GetValueNameAt(0, &value_name); |
| if (res != ERROR_SUCCESS) { |
| DVLOG(1) << "Failed to get value name " << res; |
| return; |
| } |
| |
| res = key->DeleteValue(value_name.c_str()); |
| if (res != ERROR_SUCCESS) { |
| DVLOG(1) << "Failed to delete value " << value_name; |
| return; |
| } |
| } |
| } |
| |
| void DeleteExitFunnels(const base::string16& registry_path) { |
| base::win::RegistryKeyIterator it(HKEY_CURRENT_USER, registry_path.c_str()); |
| if (!it.Valid()) |
| return; |
| |
| // Exit early if no work to do. |
| if (it.SubkeyCount() == 0) |
| return; |
| |
| // Open the key we use for deletion preemptively to prevent reporting |
| // multiple times on permission problems. |
| base::win::RegKey key(HKEY_CURRENT_USER, |
| registry_path.c_str(), |
| KEY_QUERY_VALUE); |
| if (!key.Valid()) { |
| DVLOG(1) << "Failed to open " << registry_path << " for writing."; |
| return; |
| } |
| |
| // Key names to delete. |
| std::vector<base::string16> keys_to_delete; |
| // Constrain the cleanup to 100 exit funnels at a time, as otherwise this may |
| // take a long time to finish where a lot of data has accrued. This will be |
| // the case in particular for non-UMA users, as the exit funnel data will |
| // accrue without bounds for those users. |
| const size_t kMaxCleanup = 100; |
| for (; it.Valid() && keys_to_delete.size() < kMaxCleanup; ++it) { |
| base::win::RegKey sub_key; |
| LONG res = |
| sub_key.Open(key.Handle(), it.Name(), KEY_QUERY_VALUE | KEY_SET_VALUE); |
| if (res != ERROR_SUCCESS) { |
| DVLOG(1) << "Failed to open subkey " << it.Name(); |
| return; |
| } |
| DeleteAllValues(&sub_key); |
| |
| // Schedule the subkey for deletion. |
| keys_to_delete.push_back(it.Name()); |
| } |
| |
| for (const base::string16& key_name : keys_to_delete) { |
| LONG res = key.DeleteEmptyKey(key_name.c_str()); |
| if (res != ERROR_SUCCESS) |
| DVLOG(1) << "Failed to delete key " << key_name; |
| } |
| } |
| |
| // Called from the blocking pool when metrics reporting is disabled, as there |
| // may be a sizable stash of data to delete. |
| void DeleteExitCodeRegistryKey(const base::string16& registry_path) { |
| CHECK_NE(L"", registry_path); |
| |
| DeleteExitFunnels(registry_path); |
| |
| base::win::RegKey key; |
| LONG res = key.Open(HKEY_CURRENT_USER, registry_path.c_str(), |
| KEY_QUERY_VALUE | KEY_SET_VALUE); |
| if (res == ERROR_SUCCESS) { |
| DeleteAllValues(&key); |
| res = key.DeleteEmptyKey(L""); |
| } |
| if (res != ERROR_FILE_NOT_FOUND && res != ERROR_SUCCESS) |
| DVLOG(1) << "Failed to delete exit code key " << registry_path; |
| } |
| |
// Outcome of setting up postmortem report collection. The values are logged
// as samples of the "ActivityTracker.Collect.InitStatus" histogram.
// NOTE(review): since the numeric values are reported as histogram samples,
// they presumably must stay stable - confirm before renumbering.
enum CollectionInitializationStatus {
  // Every prerequisite for collection was established.
  INIT_SUCCESS = 0,
  // The user data directory or the crash directory was empty.
  UNKNOWN_DIR = 1,
  // The stability file path of the current process could not be determined.
  GET_STABILITY_FILE_PATH_FAILED = 2,
  // The Crashpad report database could not be initialized.
  CRASHPAD_DATABASE_INIT_FAILED = 3,
  // Boundary value passed to the histogram macro; not a real status.
  INIT_STATUS_MAX = 4,
};
| |
| void LogCollectionInitStatus(CollectionInitializationStatus status) { |
| UMA_HISTOGRAM_ENUMERATION("ActivityTracker.Collect.InitStatus", status, |
| INIT_STATUS_MAX); |
| } |
| |
| } // namespace |
| |
| const char WatcherMetricsProviderWin::kBrowserExitCodeHistogramName[] = |
| "Stability.BrowserExitCodes"; |
| |
| WatcherMetricsProviderWin::WatcherMetricsProviderWin( |
| const base::string16& registry_path, |
| const base::FilePath& user_data_dir, |
| const base::FilePath& crash_dir, |
| const GetExecutableDetailsCallback& exe_details_cb, |
| base::TaskRunner* io_task_runner) |
| : recording_enabled_(false), |
| cleanup_scheduled_(false), |
| registry_path_(registry_path), |
| user_data_dir_(user_data_dir), |
| crash_dir_(crash_dir), |
| exe_details_cb_(exe_details_cb), |
| io_task_runner_(io_task_runner), |
| weak_ptr_factory_(this) { |
| DCHECK(io_task_runner_); |
| } |
| |
| WatcherMetricsProviderWin::~WatcherMetricsProviderWin() { |
| } |
| |
| void WatcherMetricsProviderWin::OnRecordingEnabled() { |
| recording_enabled_ = true; |
| } |
| |
| void WatcherMetricsProviderWin::OnRecordingDisabled() { |
| if (!recording_enabled_ && !cleanup_scheduled_) { |
| // When metrics reporting is disabled, the providers get an |
| // OnRecordingDisabled notification at startup. Use that first notification |
| // to issue the cleanup task. |
| io_task_runner_->PostTask( |
| FROM_HERE, base::Bind(&DeleteExitCodeRegistryKey, registry_path_)); |
| |
| cleanup_scheduled_ = true; |
| } |
| } |
| |
| void WatcherMetricsProviderWin::ProvideStabilityMetrics( |
| metrics::SystemProfileProto* /* system_profile_proto */) { |
| // Note that if there are multiple instances of Chrome running in the same |
| // user account, there's a small race that will double-report the exit codes |
| // from both/multiple instances. This ought to be vanishingly rare and will |
| // only manifest as low-level "random" noise. To work around this it would be |
| // necessary to implement some form of global locking, which is not worth it |
| // here. |
| RecordExitCodes(registry_path_); |
| DeleteExitFunnels(registry_path_); |
| } |
| |
| void WatcherMetricsProviderWin::CollectPostmortemReports( |
| const base::Closure& done_callback) { |
| io_task_runner_->PostTaskAndReply( |
| FROM_HERE, |
| base::Bind( |
| &WatcherMetricsProviderWin::CollectPostmortemReportsOnBlockingPool, |
| weak_ptr_factory_.GetWeakPtr()), |
| done_callback); |
| } |
| |
| void WatcherMetricsProviderWin::CollectPostmortemReportsOnBlockingPool() { |
| // Note: the feature controls both instrumentation and collection. |
| bool is_stability_debugging_on = |
| base::FeatureList::IsEnabled(browser_watcher::kStabilityDebuggingFeature); |
| if (!is_stability_debugging_on) { |
| // TODO(manzagop): delete possible leftover data. |
| return; |
| } |
| |
| SCOPED_UMA_HISTOGRAM_TIMER("ActivityTracker.Collect.TotalTime"); |
| |
| if (user_data_dir_.empty() || crash_dir_.empty()) { |
| LOG(ERROR) << "User data directory or crash directory is unknown."; |
| LogCollectionInitStatus(UNKNOWN_DIR); |
| return; |
| } |
| |
| // Determine the stability directory and the stability file for the current |
| // process. |
| base::FilePath stability_dir = GetStabilityDir(user_data_dir_); |
| base::FilePath current_stability_file; |
| if (!GetStabilityFileForProcess(base::Process::Current(), user_data_dir_, |
| ¤t_stability_file)) { |
| LOG(ERROR) << "Failed to get the current stability file."; |
| LogCollectionInitStatus(GET_STABILITY_FILE_PATH_FAILED); |
| return; |
| } |
| const std::set<base::FilePath>& excluded_debug_files = { |
| current_stability_file}; |
| |
| // Create a database. Note: Chrome already has a g_database in crashpad.cc but |
| // it has internal linkage. Create a new one. |
| std::unique_ptr<crashpad::CrashReportDatabase> crashpad_database = |
| crashpad::CrashReportDatabase::InitializeWithoutCreating(crash_dir_); |
| if (!crashpad_database) { |
| LOG(ERROR) << "Failed to initialize a CrashPad database."; |
| LogCollectionInitStatus(CRASHPAD_DATABASE_INIT_FAILED); |
| return; |
| } |
| |
| LogCollectionInitStatus(INIT_SUCCESS); |
| |
| // TODO(manzagop): fix incorrect version attribution on update. |
| base::string16 product_name, version_number, channel_name; |
| exe_details_cb_.Run(&product_name, &version_number, &channel_name); |
| PostmortemReportCollector collector(base::UTF16ToUTF8(product_name), |
| base::UTF16ToUTF8(version_number), |
| base::UTF16ToUTF8(channel_name)); |
| collector.CollectAndSubmitForUpload(stability_dir, GetStabilityFilePattern(), |
| excluded_debug_files, |
| crashpad_database.get()); |
| } |
| |
| } // namespace browser_watcher |