// blob: e82dd98671376730366bd24e1bc8af88601438c3 [file] [log] [blame]
// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/variations/service/safe_seed_manager.h"
#include "base/base_switches.h"
#include "base/command_line.h"
#include "base/metrics/histogram_functions.h"
#include "base/metrics/histogram_macros.h"
#include "base/numerics/ranges.h"
#include "components/prefs/pref_registry_simple.h"
#include "components/prefs/pref_service.h"
#include "components/variations/client_filterable_state.h"
#include "components/variations/pref_names.h"
#include "components/variations/variations_seed_store.h"
namespace variations {
// As of the time of this writing, January 2018, users at the 99.5th percentile,
// across all platforms, tend to experience fewer than 3 consecutive crashes:
// [1], [2], [3], [4]. Note, however, that this is less true for the less-stable
// channels on some platforms.
// [1] All platforms, stable channel (consistently stable):
// https://uma.googleplex.com/timeline_v2?sid=90ac80f4573249fb341a8e49501bfcfd
// [2] Most platforms, all channels (consistently stable other than occasional
// spikes on Canary):
// https://uma.googleplex.com/timeline_v2?sid=7af5ba1969db76689a401f982a1db539
// [3] A less stable platform, all channels:
// https://uma.googleplex.com/timeline_v2?sid=07dbc8e4fa9f08e332fb609309a21882
// [4] Another less stable platform, all channels:
// https://uma.googleplex.com/timeline_v2?sid=a7b529ef5d52863fae2d216e963c4cbc
// Overall, the only {platform, channel} combinations that spike above 3
// consecutive crashes are ones with very few users, plus Canary. It's probably
// not realistic to avoid false positives for these less-stable configurations.
// Length of the consecutive-crash streak at which ShouldRunInSafeMode() treats
// crashes as grounds for safe mode (the streak is compared with >=).
constexpr int kCrashStreakThreshold{3};
// Consecutive seed fetch failures are, unfortunately, a bit more common. As of
// January 2018, users at the 99.5th percentile tend to see fewer than 4
// consecutive fetch failures on mobile platforms; and users at the 99th
// percentile tend to see fewer than 5 or 6 consecutive failures on desktop
// platforms. It makes sense that the characteristics differ on mobile
// vs. desktop platforms, given that the two use different scheduling algorithms
// for the fetches. Graphs:
// [1] Android, all channels (consistently connected):
// https://uma.googleplex.com/timeline_v2?sid=99d1d4c2490c60bcbde7afeb77c12a28
// [2] High-connectivity platforms, Stable and Beta channel (consistently
// connected):
// https://uma.googleplex.com/timeline_v2?sid=2db5b7278dad41cbf349f5f2cb30efd9
// [3] Other platforms, Stable and Beta channel (slightly less connected):
// https://uma.googleplex.com/timeline_v2?sid=d4ba2f3751d211898f8e69214147c2ec
// [4] All platforms, Dev (even less connected):
// https://uma.googleplex.com/timeline_v2?sid=5740fb22b17faa823822adfd8e00ec1a
// [5] All platforms, Canary (actually fairly well-connected!):
// https://uma.googleplex.com/timeline_v2?sid=3e14d3e4887792bb614db9f3f2c1d48c
// Note that all of the graphs show a spike on a particular day, presumably due
// to server-side instability. Moreover, the Dev channel on desktop is an
// outlier – users on the Dev channel can experience just shy of 9 consecutive
// failures on some platforms.
// Decision: There is not an obvious threshold that both achieves a low
// false-positive rate and provides good coverage for true positives. For now,
// set a threshold that should minimize false-positives.
// TODO(isherman): Check in with the networking team about their thoughts on how
// to find a better balance here.
// Length of the consecutive fetch-failure streak at which
// ShouldRunInSafeMode() treats failed fetches as grounds for safe mode (the
// streak is compared with >=).
constexpr int kFetchFailureStreakThreshold{25};
// Updates the persisted crash streak based on how the previous session ended
// and reports the current crash and fetch-failure streaks to UMA.
//
// |did_previous_session_exit_cleanly|: false if the previous session crashed.
// |local_state|: pref service holding the streak counters; the pointer is
//     retained in |local_state_| for later calls.
SafeSeedManager::SafeSeedManager(bool did_previous_session_exit_cleanly,
                                 PrefService* local_state)
    : local_state_(local_state) {
  // A crash in the previous session extends the crash streak. A clean exit
  // deliberately leaves the streak untouched: it is only reset by
  // RecordSuccessfulFetch(). That way a seed that destabilizes Chrome most, but
  // not all, of the time still ends up triggering the fallback to safe mode.
  int crash_streak = local_state_->GetInteger(prefs::kVariationsCrashStreak);
  const bool previous_session_crashed = !did_previous_session_exit_cleanly;
  if (previous_session_crashed) {
    crash_streak += 1;
    local_state_->SetInteger(prefs::kVariationsCrashStreak, crash_streak);
  }

  const int fetch_failure_streak =
      local_state_->GetInteger(prefs::kVariationsFailedToFetchSeedStreak);

  // Record both streaks, each clamped to the range [0, 100].
  base::UmaHistogramSparse("Variations.SafeMode.Streak.Crashes",
                           base::ClampToRange(crash_streak, 0, 100));
  base::UmaHistogramSparse("Variations.SafeMode.Streak.FetchFailures",
                           base::ClampToRange(fetch_failure_streak, 0, 100));
}
// Defaulted out of line; there is no cleanup beyond destroying the members.
SafeSeedManager::~SafeSeedManager() = default;
// static
// Registers the two integer prefs that track consecutive crashes and
// consecutive seed fetch failures.
void SafeSeedManager::RegisterPrefs(PrefRegistrySimple* registry) {
  // Both streaks count failures encountered on the way to fetching a seed, and
  // both default to "no failures so far".
  constexpr int kNoStreak = 0;
  registry->RegisterIntegerPref(prefs::kVariationsCrashStreak, kNoStreak);
  registry->RegisterIntegerPref(prefs::kVariationsFailedToFetchSeedStreak,
                                kNoStreak);
}
bool SafeSeedManager::ShouldRunInSafeMode() const {
// Ignore any number of failures if the --force-fieldtrials flag is set. This
// flag is only used by developers, and there's no need to make the
// development process flakier.
if (base::CommandLine::ForCurrentProcess()->HasSwitch(
::switches::kForceFieldTrials)) {
return false;
}
int num_crashes = local_state_->GetInteger(prefs::kVariationsCrashStreak);
int num_failed_fetches =
local_state_->GetInteger(prefs::kVariationsFailedToFetchSeedStreak);
return num_crashes >= kCrashStreakThreshold ||
num_failed_fetches >= kFetchFailureStreakThreshold;
}
// Remembers the seed and client state that are in use for this run, so that
// RecordSuccessfulFetch() can later hand them to the seed store. Takes
// ownership of |client_filterable_state|.
void SafeSeedManager::SetActiveSeedState(
    const std::string& seed_data,
    const std::string& base64_seed_signature,
    std::unique_ptr<ClientFilterableState> client_filterable_state,
    base::Time seed_fetch_time) {
  // The active seed state may be set at most once per instance.
  DCHECK(!has_set_active_seed_state_);

  active_seed_state_ = std::make_unique<ActiveSeedState>(
      seed_data, base64_seed_signature, std::move(client_filterable_state),
      seed_fetch_time);
  has_set_active_seed_state_ = true;
}
// Bumps the persisted fetch-failure streak by one at the start of a fetch.
void SafeSeedManager::RecordFetchStarted() {
  // Count this fetch as a failure up front; RecordSuccessfulFetch() resets the
  // streak to zero if the fetch turns out to succeed.
  const int previous_streak =
      local_state_->GetInteger(prefs::kVariationsFailedToFetchSeedStreak);
  const int pessimistic_streak = previous_streak + 1;
  local_state_->SetInteger(prefs::kVariationsFailedToFetchSeedStreak,
                           pessimistic_streak);
}
// Called when a seed fetch succeeds. Stores the active seed state (if still
// held) as the safe seed via |seed_store|, then resets both failure streaks.
void SafeSeedManager::RecordSuccessfulFetch(VariationsSeedStore* seed_store) {
  // Only the first successful fetch of a given run needs to save the active
  // seed+filter configuration as safe, because that configuration does not
  // change while Chrome is running. Doing so while in safe mode is harmless:
  // the seed being saved is then simply the existing safe seed.
  if (active_seed_state_) {
    const ActiveSeedState& state = *active_seed_state_;
    seed_store->StoreSafeSeed(state.seed_data, state.base64_seed_signature,
                              *state.client_filterable_state,
                              state.seed_fetch_time);

    // Nothing reads the active seed state after this point, so release its
    // memory now. This also keeps later calls from storing the seed again.
    active_seed_state_ = nullptr;
  }

  // The crash streak must be reset along with the fetch-failure streak:
  // crashes that happen after a successful fetch do not block updating to a
  // new seed, so they are no reason to fall back to a safe seed.
  local_state_->SetInteger(prefs::kVariationsCrashStreak, 0);
  local_state_->SetInteger(prefs::kVariationsFailedToFetchSeedStreak, 0);
}
// Stores the seed data, its base64-encoded signature, the client filterable
// state, and the seed fetch time. Ownership of |client_filterable_state| is
// moved into the new object.
SafeSeedManager::ActiveSeedState::ActiveSeedState(
const std::string& seed_data,
const std::string& base64_seed_signature,
std::unique_ptr<ClientFilterableState> client_filterable_state,
base::Time seed_fetch_time)
// Each parameter shares its name with the member it initializes; inside the
// parentheses of an initializer the name refers to the parameter.
: seed_data(seed_data),
base64_seed_signature(base64_seed_signature),
client_filterable_state(std::move(client_filterable_state)),
seed_fetch_time(seed_fetch_time) {}
// Defaulted out of line; there is no cleanup beyond destroying the members.
SafeSeedManager::ActiveSeedState::~ActiveSeedState() = default;
} // namespace variations