blob: ce31af375a2532f815abdbcbcc046d830dc92fcd [file] [log] [blame]
// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_BROWSER_PRIVACY_BUDGET_IDENTIFIABILITY_STUDY_STATE_H_
#define CHROME_BROWSER_PRIVACY_BUDGET_IDENTIFIABILITY_STUDY_STATE_H_
#include <stdint.h>
#include <cstddef>
#include <iosfwd>
#include <vector>
#include "base/containers/flat_map.h"
#include "base/containers/flat_set.h"
#include "base/memory/raw_ptr.h"
#include "base/sequence_checker.h"
#include "base/thread_annotations.h"
#include "chrome/browser/privacy_budget/encountered_surface_tracker.h"
#include "chrome/browser/privacy_budget/mesa_distribution.h"
#include "chrome/browser/privacy_budget/privacy_budget_prefs.h"
#include "chrome/browser/privacy_budget/representative_surface_set.h"
#include "chrome/browser/privacy_budget/surface_set_equivalence.h"
#include "chrome/browser/privacy_budget/surface_set_valuation.h"
#include "chrome/browser/privacy_budget/surface_set_with_valuation.h"
#include "chrome/common/privacy_budget/order_preserving_set.h"
#include "chrome/common/privacy_budget/types.h"
#include "components/prefs/pref_service.h"
#include "identifiability_study_group_settings.h"
#include "third_party/blink/public/common/privacy_budget/identifiability_study_settings.h"
#include "third_party/blink/public/common/privacy_budget/identifiable_surface.h"
class PrefService;
class SurfaceSetEquivalence;
namespace blink {
class IdentifiableSurface;
} // namespace blink
namespace content {
class RenderProcessHost;
} // namespace content
namespace test_utils {
class InspectableIdentifiabilityStudyState;
} // namespace test_utils
// Current state of the identifiability study.
//
// Persists mutable state in a `PrefService`. In normal operation the
// `PrefService` is `LocalState`. The persisted state corresponds to the prefs
// named in `privacy_budget_prefs.h`.
//
// * The list of "active" identifiable surfaces. I.e. the set of surfaces for
// which this client is reporting sampled values.
//
// * The list of "seen" identifiable surfaces. I.e. a list of surfaces that
// this client has seen in the order in which they were observed.
//
// In addition, this object also tracks per-session state which is not
// persisted. This state includes:
//
// * The list of "seen" surfaces that this client has reported to the server.
class IdentifiabilityStudyState {
public:
using OffsetType = unsigned int;
// Construct from a `PrefService`. `pref_service` is used to retrieve and
// store study state and MUST outlive this.
explicit IdentifiabilityStudyState(PrefService* pref_service);
IdentifiabilityStudyState(IdentifiabilityStudyState&) = delete;
IdentifiabilityStudyState& operator=(const IdentifiabilityStudyState&) =
delete;
~IdentifiabilityStudyState();
// Returns the active experiment generation as defined by the server-side
// configuration.
//
// See kIdentifiabilityStudyGeneration.
int generation() const;
// Returns true if metrics collection is enabled for `surface`.
//
// Calling this method may alter the state of the study settings.
bool ShouldRecordSurface(blink::IdentifiableSurface surface);
// Should be called from unit-tests if multiple IdentifiabilityStudyState
// instances are to be constructed.
static void ResetGlobalStudySettingsForTesting();
// Returns true if tracking metrics should be recorded for this
// source_id/surface combination.
bool ShouldReportEncounteredSurface(uint64_t source_id,
blink::IdentifiableSurface surface);
// Resets the state associated with a single report.
//
// It should be called each time the UKM service constructs a UKM client
// report.
void ResetPerReportState();
// Clears all persisted and ephemeral state.
//
// It should be called when the UKM client ID changes or if the experiment
// generation changes.
void ResetPersistedState();
void InitStateForAssignedBlockSampling();
void InitStateForRandomSurfaceSampling();
static int SelectMultinomialChoice(const std::vector<double>& weights);
// Initializes from fields persisted in `pref_service_`.
void InitFromPrefs();
// Initializes a new renderer process.
void InitializeRenderer(content::RenderProcessHost* render_process_host);
// The largest offset that we can select. At worst `seen_surfaces_` must keep
// track of this many (+1) surfaces. This value is approximately based on the
// 90ᵗʰ percentile surface encounter rate as measured in June 2021.
static constexpr OffsetType kMaxSelectedSurfaceOffset = 1999;
// A knob that we can use to split data sets from different versions of the
// implementation where the differences could have material effects on the
// data distribution.
//
// Increment this whenever a non-backwards-compatible change is made in the
// code. This value is independent of any server controlled study parameters.
static constexpr int kGeneratorVersion = 1;
// The ratio between the linear region of the Mesa distribution and the entire
// range. See `MesaDistribution` for details. The distribution is the source
// of random numbers for selecting identifiable surface for measurement.
static constexpr double kMesaDistributionRatio = 0.9;
// The parameter of the geometric distribution used for the tail of the Mesa
// distribution.
static constexpr double kMesaDistributionGeometricDistributionParam = 0.5;
private:
friend class test_utils::InspectableIdentifiabilityStudyState;
using SurfaceSelectionRateMap =
base::flat_map<blink::IdentifiableSurface, int>;
using TypeSelectionRateMap =
base::flat_map<blink::IdentifiableSurface::Type, int>;
// Initializes global study settings based on FeatureLists and FieldTrial
// lists.
void InitializeGlobalStudySettings();
// Determines if the meta experiment must be activated for this client.
bool IsMetaExperimentActive();
// Checks that the invariants hold. When DCHECK_IS_ON() this call is
// expensive. Noop otherwise.
void CheckInvariants() const;
// Returns true if at least one more identifiable surface can be added to the
// active surface set. This is an estimate since each surface costs different
// amounts.
bool CanAddOneMoreActiveSurface() const;
// Attempts to add `surface` to `seen_surfaces_`.
//
// Returns false if `surface` was already included in `seen_surfaces_` or if
// the `seen_surfaces_` set has reached its cap. Returns true otherwise.
bool TryAddNewlySeenSurface(blink::IdentifiableSurface surface);
// Writes individual fields to prefs.
void WriteSeenSurfacesToPrefs() const;
void WriteSelectedOffsetsToPrefs() const;
// Contains all the logic for determining whether a newly observed surface
// should be added to the active list or not. Should only be called if
// `active_surfaces_` does not contain `surface`.
bool DecideInclusionForNewSurface(blink::IdentifiableSurface surface);
// On exit, ensures that `selected_offsets_` is non-empty and satisfies our
// invariants.
void MaybeUpdateSelectedOffsets();
void UpdateSelectedOffsets(unsigned expected_offset_count);
// Resets all in-memory state, but doesn't touch any persisted state. This
// operation invalidates the relationship between persistent and in-memory
// states. A call to this function should be immediately followed by either
// reading from or clearing associated preferences.
void ResetInMemoryState();
// Determines the number of extra offsets that should be a part of the study
// state in order to guide surface selection.
//
// It attempts to answer the following question:
//
// Given that `active_surfaces_.Cost()` of `active_surface_budget_` has
// been consumed, what's the expected number of surfaces we'd need to
// select in order to saturate the budget?
//
unsigned GetCountOfOffsetsToSelect() const;
// Verifies that the offset `o` is within the range that's considered valid.
// The valid range may change between versions.
static bool IsValidOffset(OffsetType o);
// Removes disallowed surfaces from `container` and returns the offsets of
// removed elements relative to the original order of elements.
//
// Modifies `container` in-place. Appends removed offsets to `dropped_offsets`
// in ascending order. (Note that existing offsets are not removed from
// `container`.)
//
// On input, `container` should have no duplicate items nor internal
// meta-surfaces (i.e. surfaces of type kReservedInternal). Returns `false` if
// these conditions are violated.
//
// E.g.:
// Before:
// container == {1,2,3,4}
// dropped_offsets == {}
//
// Surface #3 (at offset 2) is blocked, and should therefore be removed.
//
// After:
// container == {1,2,4}
// dropped_offsets == {2}
static bool StripDisallowedSurfaces(IdentifiableSurfaceList& container,
std::vector<OffsetType>& dropped_offsets);
// Given a list of offsets and a list of offsets to remove, returns the list
// of offsets adjusted to reflect now missing offsets.
//
// So, for example:
//
// Before:
// offsets = {1, 2, 3}
// dropped_offsets = {1}
// After:
// offsets = {1, 2} # Formerly offsets 2, and 3, but are now shifted one
// # position.
//
// ~ or ~
//
// Before:
// offsets = {1,2,4,6}
// dropped_offsets = {2,3,5}
// After:
// offsets = {1,2,3}
//
static std::vector<OffsetType> AdjustForDroppedOffsets(
std::vector<OffsetType> dropped_offsets,
std::vector<OffsetType> offsets);
// Wrapper around some of the experiment field trial params.
IdentifiabilityStudyGroupSettings settings_;
// `pref_service_` pointee must outlive `this`. Used for persistent state.
raw_ptr<PrefService> pref_service_ = nullptr;
// Offset of selected block. Only used when using assigned block sampling.
//
// Persisted in kPrivacyBudgetSelectedBlock within a single study generation.
int selected_block_offset_ = -1;
// `equivalence_` contains a model that determines the equivalence of
// identifiable information for two or more surfaces. See
// SurfaceSetEquivalence for more details.
const SurfaceSetEquivalence equivalence_;
// `valuation_` contains a model that determines an identifiability measure (a
// cost or valuation, in budget parlance) for a set of identifiable surfaces.
const SurfaceSetValuation valuation_;
// Set of identifiable surfaces for which we will collect metrics. This set is
// extended as we go unless it is already saturated.
//
// The set is considered saturated when the cost has reached
// `active_surface_budget_`. It can also be saturated when the cost is near
// `active_surface_budget_` but the remaining budget doesn't accommodate any
// surface.
//
// Invariants:
//
// * active_surfaces_ ∩ kSettings.blocked_surfaces() = Ø.
//
// * s ∈ active_surfaces_ ⇒ s.GetType() ∉ kSettings.blocked_types().
//
// * i ∈ selected_offsets_ ∧ i < seen_surfaces_.size()
// ⇒ seen_surfaces_[i] ∈ active_surfaces_.
//
// * Cost(active_surfaces_) ≤ active_surface_budget_.
//
// Where kSettings is the PrivacyBudgetSettingsProvider singleton.
SurfaceSetWithValuation active_surfaces_;
// Surfaces that the client has encountered in the order in which they were
// encountered. The set is for fast lookup, and the list is for preserving the
// order.
//
// Invariants:
//
// * seen_surfaces_.CheckModel() passes.
//
// * seen_surfaces_ ∩ kSettings.blocked_surfaces() = Ø.
//
// * s ∈ seen_surfaces_ ⇒ s.GetType() ∉ kSettings.blocked_types().
//
// * seen_surfaces_.size() <= kMaxSelectedSurfaceOffset + 1.
//
// Where kSettings is the PrivacyBudgetSettingsProvider singleton.
OrderPreservingSet<blink::IdentifiableSurface> seen_surfaces_;
// Incremental serialization of `seen_surfaces_`. Profiling indicates that as
// the size of the list grows, the serialization consumes a non-negligible
// amount of time during tight loops.
//
// Invariants:
//
// * seen_surface_sequence_string_ = SerializationOf(seen_surfaces_)
std::string seen_surface_sequence_string_;
// Indices into `seen_surfaces_` for surfaces that are *active*.
//
// Only offsets that are less than |seen_surfaces_.size()| are in use. Others
// are kept around until we have sufficient surfaces.
//
// Invariants:
//
// * i ∈ selected_offsets_ ⇒ i <= kMaxSelectedSurfaceOffset.
base::flat_set<OffsetType> selected_offsets_;
// Count of offsets `i` in `selected_offsets_` which satisfy
// `seen_surfaces_[i] ∈ active_surfaces_`.
//
// Invariants:
//
// * active_offset_count_ = O.size() where
// O = { i | i ∈ selected_offsets_ ∧
// seen_surfaces_[i] ∈ active_surfaces_}
int active_offset_count_ = 0;
// Contains kIdentifiabilityStudyGeneration as defined by the server-side
// experiment.
//
// All valid `generation_` values are positive and non-zero. A value of zero
// implies that the study is not active.
const int generation_;
// Hard cap on the number of identifiable surfaces we will sample per client.
// The limit is specified based on the surface valuation as known to
// SurfaceSetValuation.
//
// This setting can be tweaked experimentally via
// `kIdentifiabilityStudyActiveSurfaceBudget`.
//
// Invariants:
//
// * active_surface_budget_ ≤ kMaxIdentifiabilityStudyActiveSurfaceBudget.
//
const int active_surface_budget_;
// Source of random offsets for selection. The returned offsets are in the
// range [0, UINT_MAX]. See mesa_distribution.h for details on the random
// distribution.
//
// This distribution is initialized with the expected number of surfaces as
// the distribution's pivot point. I.e.
// `random_offset_generator_.pivot_point()` is
// `features::kIdentifiabilityStudyExpectedSurfaceCount`.
MesaDistribution<OffsetType> random_offset_generator_;
// Keeps track of which identifiable surfaces have been exposed to which UKM
// sources. Each document and worker context within a document tree has
// a unique source. Hence this field keeps track of identifiable surfaces
// exposed to all execution contexts in all the document trees.
//
// This field resets each time a new UKM report is generated. Hence the
// tracked value is essentially "which surfaces have been exposed to which
// sources since the last UKM report."
//
// Invariants:
//
// * surface_encounters_ ∩ kSettings.blocked_surfaces() = Ø.
//
// * ∀ s ∈ surface_encounters_[i], s.GetType() ∉ kSettings.blocked_types().
//
// Where kSettings is the PrivacyBudgetSettingsProvider singleton.
EncounteredSurfaceTracker surface_encounters_;
// Whether the meta experiment (i.e. reporting the meta surfaces, which
// include information only about usage of APIs) is active or not. Note that
// this setting is independent from the rest of the Identifiability Study, and
// can be enabled / disabled separately.
const bool meta_experiment_active_;
SEQUENCE_CHECKER(sequence_checker_);
};
#endif // CHROME_BROWSER_PRIVACY_BUDGET_IDENTIFIABILITY_STUDY_STATE_H_