blob: a978cf36a3a9e6f3d1ff0b5a602f24e19d2ac090 [file] [log] [blame]
// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef THIRD_PARTY_BLINK_COMMON_PRIVACY_BUDGET_AGGREGATING_SAMPLE_COLLECTOR_H_
#define THIRD_PARTY_BLINK_COMMON_PRIVACY_BUDGET_AGGREGATING_SAMPLE_COLLECTOR_H_
#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "base/containers/flat_map.h"
#include "base/containers/flat_set.h"
#include "base/synchronization/lock.h"
#include "base/thread_annotations.h"
#include "base/time/time.h"
#include "services/metrics/public/cpp/ukm_recorder.h"
#include "services/metrics/public/mojom/ukm_interface.mojom.h"
#include "third_party/blink/public/common/common_export.h"
#include "third_party/blink/public/common/privacy_budget/identifiability_sample_collector.h"
#include "third_party/blink/public/common/privacy_budget/identifiable_surface.h"
namespace blink {
// An `IdentifiabilitySampleCollector` that does the following:
//
//   * De-duplicates recorded samples so that the same
//     〈IdentifiableSurface,IdentifiableToken〉 tuple doesn't get sent to the
//     UkmRecorder more than once per `ukm::SourceId`.
//
//   * Caps the number of samples that can be recorded against the same surface
//     per `ukm::SourceId`. Drops samples in excess of
//     kMaxTrackedSamplesPerSurfacePerSourceId.
//
//   * Caps the total number of surfaces that can be tracked for a single
//     process. Drops samples in excess of kMaxTrackedSurfaces.
//
//   * Buffers metrics instead of invoking `UkmRecorder::Record` each time
//     a sample arrives.
//
//   * The number of metrics so buffered is capped at kMaxUnsentSamples. If
//     more than this many are to be buffered, then flushes all unsent metrics.
//
//   * The age of metrics so buffered is capped at kMaxUnsentSampleAge. If
//     samples have been sitting in the unsent buffer for longer than that,
//     flushes all unsent metrics.
//
//   * In so buffering, organizes observed metrics into the fewest number of
//     `UkmEntry` instances that are required to record them via
//     `UkmRecorder`.
//
// The goal is to prevent the identifiability study from DoSing the browser
// process and the UKM subsystem, since there can be lots of metrics being
// recorded.
//
// All mutable state is guarded by `lock_`; the public entry points are
// annotated LOCKS_EXCLUDED(lock_) and therefore must be called without
// `lock_` held.
class BLINK_COMMON_EXPORT_PRIVATE AggregatingSampleCollector
    : public IdentifiabilitySampleCollector {
 public:
  // Maximum number of surfaces that this class can track. Prevents unbounded
  // memory growth.
  static constexpr unsigned kMaxTrackedSurfaces = 10000;

  // Maximum number of sources that this class can track. Prevents unbounded
  // memory growth.
  static constexpr unsigned kMaxTrackedSources = 10000;

  // Surfaces may return different values. To account for those, this class
  // tracks the last several distinct samples that were seen for each surface.
  // This is the maximum number of such samples that can be tracked. Again
  // meant as a precaution against unbounded memory growth.
  //
  // If a surface is generating much more than this many distinct samples, it
  // is considered "noisy" and may be considered for removal from the study.
  static constexpr unsigned kMaxTrackedSamplesPerSurfacePerSourceId = 3;

  // Maximum number of unsent samples. This class will automatically flush all
  // samples if this limit overflows.
  static constexpr unsigned kMaxUnsentSamples = 200;

  // Maximum number of sources that may have unsent samples buffered. Flushes
  // automatically if this limit overflows.
  static constexpr unsigned kMaxUnsentSources = 100;

  // Maximum age of the oldest sample in the unsent collection. Again, the
  // class will flush all samples if this limit overflows.
  static constexpr base::TimeDelta kMaxUnsentSampleAge = base::Seconds(5);

  // Same type as `ukm::mojom::UkmEntry::metrics`.
  using UkmMetricsContainerType = decltype(ukm::mojom::UkmEntry::metrics);

  AggregatingSampleCollector();

  // Not copyable: the object owns `lock_` and the state it guards. Spelled
  // out explicitly so the intent is visible in the public interface.
  AggregatingSampleCollector(const AggregatingSampleCollector&) = delete;
  AggregatingSampleCollector& operator=(const AggregatingSampleCollector&) =
      delete;

  ~AggregatingSampleCollector() override;

  // IdentifiabilitySampleCollector:

  // Records `metrics` observed for `source`. `recorder` is the `UkmRecorder`
  // handed to the collector for this call.
  void Record(ukm::UkmRecorder* recorder,
              ukm::SourceId source,
              std::vector<IdentifiableSample> metrics) override
      LOCKS_EXCLUDED(lock_);

  // Flushes unsent metrics via `recorder`.
  void Flush(ukm::UkmRecorder* recorder) override LOCKS_EXCLUDED(lock_);

  // FlushSource flushes the metrics per source. This will also reset all
  // limits relative to this source.
  void FlushSource(ukm::UkmRecorder* recorder, ukm::SourceId source) override
      LOCKS_EXCLUDED(lock_);

  // Only for testing.
  void ResetForTesting() LOCKS_EXCLUDED(lock_);

 private:
  // Each tracked `IdentifiableSurface` has a corresponding `Samples` instance.
  struct Samples {
    // `samples.size() <= kMaxTrackedSamplesPerSurfacePerSourceId`. Typically
    // we only want to keep a very small number of these around. Note that we
    // don't do any fancy reservoir sampling or approximate counting here due
    // to the size of the required sketches.
    base::flat_set<IdentifiableToken> samples;

    // true if unique sample count exceeds
    // kMaxTrackedSamplesPerSurfacePerSourceId.
    bool overflowed = false;

    // Total count of samples that we've observed for this surface. Includes
    // duplicates.
    int total_value_count = 0;
  };

  // Attempts to record the samples in `samples`. Returns true if unsent
  // metrics should be flushed based on the resulting state of
  // `unsent_metrics_`.
  bool TryAcceptSamples(ukm::SourceId source,
                        std::vector<IdentifiableSample> samples)
      LOCKS_EXCLUDED(lock_);

  // Accepts `sample` if it meets certain criteria. Typically duplicates are
  // dropped. So are samples that, if accepted, would cause some limit to be
  // exceeded.
  void TryAcceptSingleSample(ukm::SourceId source,
                             const IdentifiableSample& sample)
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Populates `unsent_metrics_` based on a single `IdentifiableSample`.
  void AddNewUnsentSample(ukm::SourceId source,
                          const IdentifiableSample& sample)
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // If there is a `UkmMetricsContainerType` for `source`, the method of adding
  // another sample is very different from the case where `source` is new. In
  // the former case, the existing `UkmMetricsContainerType` objects need to be
  // checked if any of them can be used to store the 〈`key`, `value`〉 pair.
  bool AddNewUnsentSampleToKnownSource(ukm::SourceId source,
                                       uint64_t key,
                                       int64_t value)
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // We are heavily dependent on the property that the reference to a value in
  // the map isn't invalidated due to mutations other than erase().
  //
  // NOTE(review): the outer container is a `base::flat_map`, which does not
  // generally offer that guarantee on insertion; presumably the property
  // relied upon is that of the inner `std::unordered_map` -- confirm against
  // the implementation.
  base::flat_map<
      ukm::SourceId,
      std::unordered_map<IdentifiableSurface, Samples, IdentifiableSurfaceHash>>
      per_source_per_surface_samples_ GUARDED_BY(lock_);

  // Seen surfaces across all sources.
  std::unordered_set<IdentifiableSurface, IdentifiableSurfaceHash>
      seen_surfaces_ GUARDED_BY(lock_);

  // An unordered multi-map of metrics that haven't yet been recorded via
  // a `UkmRecorder`.
  //
  // `unsent_metrics_.size() <= kMaxUnsentSources`.
  //
  // `UkmEntry`'s `metrics` member is a map, and hence cannot be used to store
  // multiple values for a single surface. If more than one value needs to be
  // recorded, they need to be in different entries.
  //
  // At worst, all values recorded for a single source can be recorded in
  // kMaxTrackedSamplesPerSurfacePerSourceId maps. So that's what we do.
  //
  // Each source maps to one or more UkmMetricsContainerType instances, which
  // happens to be the minimum required to represent all observed and accepted
  // values for all surfaces.
  std::unordered_multimap<ukm::SourceId, UkmMetricsContainerType>
      unsent_metrics_ GUARDED_BY(lock_);

  // Only valid if `unsent_sample_count_ > 0`.
  base::TimeTicks time_of_first_unsent_arrival_ GUARDED_BY(lock_);

  // Counted separately from `unsent_metrics_.size()` because each metric can
  // hold multiple samples.
  // `unsent_sample_count_ <= kMaxUnsentSamples`.
  size_t unsent_sample_count_ GUARDED_BY(lock_) = 0;

  // Guards every member annotated GUARDED_BY(lock_) above.
  mutable base::Lock lock_;
};
namespace internal {
// Returns a pointer to the global `AggregatingSampleCollector` instance.
//
// On non-test targets, or on test targets with no
// `ScopedSwitchSampleCollector` override, this is also what's returned by
// `IdentifiabilitySampleCollector::Get()`.
//
// NOTE(review): ownership and lifetime of the returned object are not visible
// from this header; presumably callers must not delete it -- confirm against
// the definition.
BLINK_COMMON_EXPORT_PRIVATE AggregatingSampleCollector* GetCollectorInstance();
} // namespace internal
} // namespace blink
#endif // THIRD_PARTY_BLINK_COMMON_PRIVACY_BUDGET_AGGREGATING_SAMPLE_COLLECTOR_H_