|  | // Copyright (c) 2014 The Chromium Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | #include "components/browser_watcher/watcher_metrics_provider_win.h" | 
|  |  | 
|  | #include <stddef.h> | 
|  |  | 
|  | #include <limits> | 
|  | #include <memory> | 
|  | #include <set> | 
|  | #include <vector> | 
|  |  | 
|  | #include "base/bind.h" | 
|  | #include "base/feature_list.h" | 
|  | #include "base/metrics/histogram.h" | 
|  | #include "base/metrics/histogram_base.h" | 
|  | #include "base/metrics/histogram_macros.h" | 
|  | #include "base/metrics/sparse_histogram.h" | 
|  | #include "base/process/process.h" | 
|  | #include "base/strings/string_number_conversions.h" | 
|  | #include "base/strings/string_piece.h" | 
|  | #include "base/strings/utf_string_conversions.h" | 
|  | #include "base/win/registry.h" | 
|  | #include "components/browser_watcher/features.h" | 
|  | #include "components/browser_watcher/postmortem_report_collector.h" | 
|  | #include "components/browser_watcher/stability_debugging.h" | 
|  | #include "third_party/crashpad/crashpad/client/crash_report_database.h" | 
|  |  | 
|  | namespace browser_watcher { | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | // Process ID APIs on Windows talk in DWORDs, whereas for string formatting | 
|  | // and parsing, this code uses int. In practice there are no process IDs with | 
|  | // the high bit set on Windows, so there's no danger of overflow if this is | 
|  | // done consistently. | 
|  | static_assert(sizeof(DWORD) == sizeof(int), | 
|  | "process ids are expected to be no larger than int"); | 
|  |  | 
|  | // This function does soft matching on the PID recorded in the key only. | 
|  | // Due to PID reuse, the possibility exists that the process that's now live | 
|  | // with the given PID is not the same process the data was recorded for. | 
|  | // This doesn't matter for the purpose, as eventually the data will be | 
|  | // scavenged and reported. | 
|  | bool IsDeadProcess(base::StringPiece16 key_or_value_name) { | 
|  | // Truncate the input string to the first occurrence of '-', if one exists. | 
|  | size_t num_end = key_or_value_name.find(L'-'); | 
|  | if (num_end != base::StringPiece16::npos) | 
|  | key_or_value_name = key_or_value_name.substr(0, num_end); | 
|  |  | 
|  | // Convert to the numeric PID. | 
|  | int pid = 0; | 
|  | if (!base::StringToInt(key_or_value_name, &pid) || pid == 0) | 
|  | return true; | 
|  |  | 
|  | // This is a very inexpensive check for the common case of our own PID. | 
|  | if (static_cast<base::ProcessId>(pid) == base::GetCurrentProcId()) | 
|  | return false; | 
|  |  | 
|  | // The process is not our own - see whether a process with this PID exists. | 
|  | // This is more expensive than the above check, but should also be very rare, | 
|  | // as this only happens more than once for a given PID if a user is running | 
|  | // multiple Chrome instances concurrently. | 
|  | base::Process process = | 
|  | base::Process::Open(static_cast<base::ProcessId>(pid)); | 
|  | if (process.IsValid()) { | 
|  | // The fact that it was possible to open the process says it's live. | 
|  | return false; | 
|  | } | 
|  |  | 
|  | return true; | 
|  | } | 
|  |  | 
|  | void RecordExitCodes(const base::string16& registry_path) { | 
|  | base::win::RegKey regkey(HKEY_CURRENT_USER, | 
|  | registry_path.c_str(), | 
|  | KEY_QUERY_VALUE | KEY_SET_VALUE); | 
|  | if (!regkey.Valid()) | 
|  | return; | 
|  |  | 
|  | size_t num = regkey.GetValueCount(); | 
|  | if (num == 0) | 
|  | return; | 
|  | std::vector<base::string16> to_delete; | 
|  |  | 
|  | // Record the exit codes in a sparse stability histogram, as the range of | 
|  | // values used to report failures is large. | 
|  | base::HistogramBase* exit_code_histogram = | 
|  | base::SparseHistogram::FactoryGet( | 
|  | WatcherMetricsProviderWin::kBrowserExitCodeHistogramName, | 
|  | base::HistogramBase::kUmaStabilityHistogramFlag); | 
|  |  | 
|  | for (size_t i = 0; i < num; ++i) { | 
|  | base::string16 name; | 
|  | if (regkey.GetValueNameAt(static_cast<int>(i), &name) == ERROR_SUCCESS) { | 
|  | DWORD exit_code = 0; | 
|  | if (regkey.ReadValueDW(name.c_str(), &exit_code) == ERROR_SUCCESS) { | 
|  | // Do not report exit codes for processes that are still live, | 
|  | // notably for our own process. | 
|  | if (exit_code != STILL_ACTIVE || IsDeadProcess(name)) { | 
|  | to_delete.push_back(name); | 
|  | exit_code_histogram->Add(exit_code); | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // Delete the values reported above. | 
|  | for (size_t i = 0; i < to_delete.size(); ++i) | 
|  | regkey.DeleteValue(to_delete[i].c_str()); | 
|  | } | 
|  |  | 
|  | void DeleteAllValues(base::win::RegKey* key) { | 
|  | DCHECK(key); | 
|  |  | 
|  | while (key->GetValueCount() != 0) { | 
|  | base::string16 value_name; | 
|  | LONG res = key->GetValueNameAt(0, &value_name); | 
|  | if (res != ERROR_SUCCESS) { | 
|  | DVLOG(1) << "Failed to get value name " << res; | 
|  | return; | 
|  | } | 
|  |  | 
|  | res = key->DeleteValue(value_name.c_str()); | 
|  | if (res != ERROR_SUCCESS) { | 
|  | DVLOG(1) << "Failed to delete value " << value_name; | 
|  | return; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // Called from the blocking pool when metrics reporting is disabled, as there | 
|  | // may be a sizable stash of data to delete. | 
|  | void DeleteExitCodeRegistryKey(const base::string16& registry_path) { | 
|  | CHECK_NE(L"", registry_path); | 
|  |  | 
|  | base::win::RegKey key; | 
|  | LONG res = key.Open(HKEY_CURRENT_USER, registry_path.c_str(), | 
|  | KEY_QUERY_VALUE | KEY_SET_VALUE); | 
|  | if (res == ERROR_SUCCESS) { | 
|  | DeleteAllValues(&key); | 
|  | res = key.DeleteEmptyKey(L""); | 
|  | } | 
|  | if (res != ERROR_FILE_NOT_FOUND && res != ERROR_SUCCESS) | 
|  | DVLOG(1) << "Failed to delete exit code key " << registry_path; | 
|  | } | 
|  |  | 
// Outcome of preparing postmortem report collection. Values are recorded in
// the "ActivityTracker.Collect.InitStatus" enumeration histogram, so the
// numeric values are spelled out explicitly.
enum CollectionInitializationStatus {
  // Every precondition was met and collection proceeds.
  INIT_SUCCESS = 0,
  // The user data directory or the crash directory is empty.
  UNKNOWN_DIR = 1,
  // The stability file path for the current process could not be obtained.
  GET_STABILITY_FILE_PATH_FAILED = 2,
  // The crash report database could not be initialized.
  CRASHPAD_DATABASE_INIT_FAILED = 3,
  // Exclusive upper bound handed to the histogram macro; not a real status.
  INIT_STATUS_MAX = 4
};
|  |  | 
|  | void LogCollectionInitStatus(CollectionInitializationStatus status) { | 
|  | UMA_HISTOGRAM_ENUMERATION("ActivityTracker.Collect.InitStatus", status, | 
|  | INIT_STATUS_MAX); | 
|  | } | 
|  |  | 
|  | }  // namespace | 
|  |  | 
|  | const char WatcherMetricsProviderWin::kBrowserExitCodeHistogramName[] = | 
|  | "Stability.BrowserExitCodes"; | 
|  |  | 
|  | WatcherMetricsProviderWin::WatcherMetricsProviderWin( | 
|  | const base::string16& registry_path, | 
|  | const base::FilePath& user_data_dir, | 
|  | const base::FilePath& crash_dir, | 
|  | const GetExecutableDetailsCallback& exe_details_cb, | 
|  | base::TaskRunner* io_task_runner) | 
|  | : recording_enabled_(false), | 
|  | cleanup_scheduled_(false), | 
|  | registry_path_(registry_path), | 
|  | user_data_dir_(user_data_dir), | 
|  | crash_dir_(crash_dir), | 
|  | exe_details_cb_(exe_details_cb), | 
|  | io_task_runner_(io_task_runner), | 
|  | weak_ptr_factory_(this) { | 
|  | DCHECK(io_task_runner_); | 
|  | } | 
|  |  | 
|  | WatcherMetricsProviderWin::~WatcherMetricsProviderWin() { | 
|  | } | 
|  |  | 
|  | void WatcherMetricsProviderWin::OnRecordingEnabled() { | 
|  | recording_enabled_ = true; | 
|  | } | 
|  |  | 
|  | void WatcherMetricsProviderWin::OnRecordingDisabled() { | 
|  | if (!recording_enabled_ && !cleanup_scheduled_) { | 
|  | // When metrics reporting is disabled, the providers get an | 
|  | // OnRecordingDisabled notification at startup. Use that first notification | 
|  | // to issue the cleanup task. | 
|  | io_task_runner_->PostTask( | 
|  | FROM_HERE, base::Bind(&DeleteExitCodeRegistryKey, registry_path_)); | 
|  |  | 
|  | cleanup_scheduled_ = true; | 
|  | } | 
|  | } | 
|  |  | 
|  | void WatcherMetricsProviderWin::ProvideStabilityMetrics( | 
|  | metrics::SystemProfileProto* /* system_profile_proto */) { | 
|  | // Note that if there are multiple instances of Chrome running in the same | 
|  | // user account, there's a small race that will double-report the exit codes | 
|  | // from both/multiple instances. This ought to be vanishingly rare and will | 
|  | // only manifest as low-level "random" noise. To work around this it would be | 
|  | // necessary to implement some form of global locking, which is not worth it | 
|  | // here. | 
|  | RecordExitCodes(registry_path_); | 
|  | } | 
|  |  | 
|  | void WatcherMetricsProviderWin::CollectPostmortemReports( | 
|  | const base::Closure& done_callback) { | 
|  | io_task_runner_->PostTaskAndReply( | 
|  | FROM_HERE, | 
|  | base::Bind( | 
|  | &WatcherMetricsProviderWin::CollectPostmortemReportsOnBlockingPool, | 
|  | weak_ptr_factory_.GetWeakPtr()), | 
|  | done_callback); | 
|  | } | 
|  |  | 
|  | void WatcherMetricsProviderWin::CollectPostmortemReportsOnBlockingPool() { | 
|  | // Note: the feature controls both instrumentation and collection. | 
|  | bool is_stability_debugging_on = | 
|  | base::FeatureList::IsEnabled(browser_watcher::kStabilityDebuggingFeature); | 
|  | if (!is_stability_debugging_on) { | 
|  | // TODO(manzagop): delete possible leftover data. | 
|  | return; | 
|  | } | 
|  |  | 
|  | SCOPED_UMA_HISTOGRAM_TIMER("ActivityTracker.Collect.TotalTime"); | 
|  |  | 
|  | if (user_data_dir_.empty() || crash_dir_.empty()) { | 
|  | LOG(ERROR) << "User data directory or crash directory is unknown."; | 
|  | LogCollectionInitStatus(UNKNOWN_DIR); | 
|  | return; | 
|  | } | 
|  |  | 
|  | // Determine the stability directory and the stability file for the current | 
|  | // process. | 
|  | base::FilePath stability_dir = GetStabilityDir(user_data_dir_); | 
|  | base::FilePath current_stability_file; | 
|  | if (!GetStabilityFileForProcess(base::Process::Current(), user_data_dir_, | 
|  | ¤t_stability_file)) { | 
|  | LOG(ERROR) << "Failed to get the current stability file."; | 
|  | LogCollectionInitStatus(GET_STABILITY_FILE_PATH_FAILED); | 
|  | return; | 
|  | } | 
|  | const std::set<base::FilePath>& excluded_debug_files = { | 
|  | current_stability_file}; | 
|  |  | 
|  | // Create a database. Note: Chrome already has a g_database in crashpad.cc but | 
|  | // it has internal linkage. Create a new one. | 
|  | std::unique_ptr<crashpad::CrashReportDatabase> crashpad_database = | 
|  | crashpad::CrashReportDatabase::InitializeWithoutCreating(crash_dir_); | 
|  | if (!crashpad_database) { | 
|  | LOG(ERROR) << "Failed to initialize a CrashPad database."; | 
|  | LogCollectionInitStatus(CRASHPAD_DATABASE_INIT_FAILED); | 
|  | return; | 
|  | } | 
|  |  | 
|  | LogCollectionInitStatus(INIT_SUCCESS); | 
|  |  | 
|  | // TODO(manzagop): fix incorrect version attribution on update. | 
|  | base::string16 product_name, version_number, channel_name; | 
|  | exe_details_cb_.Run(&product_name, &version_number, &channel_name); | 
|  | PostmortemReportCollector collector(base::UTF16ToUTF8(product_name), | 
|  | base::UTF16ToUTF8(version_number), | 
|  | base::UTF16ToUTF8(channel_name)); | 
|  | collector.CollectAndSubmitForUpload(stability_dir, GetStabilityFilePattern(), | 
|  | excluded_debug_files, | 
|  | crashpad_database.get()); | 
|  | } | 
|  |  | 
|  | }  // namespace browser_watcher |