blob: ff1368dc570b32d5d95651820230714abce4237e [file] [log] [blame]
// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/browser/attribution_reporting/privacy_math.h"
#include <stdint.h>
#include <algorithm>
#include <cmath>
#include <functional>
#include <iterator>
#include <map>
#include <optional>
#include <utility>
#include <vector>
#include "base/check_op.h"
#include "base/notreached.h"
#include "base/numerics/byte_conversions.h"
#include "base/numerics/checked_math.h"
#include "base/rand_util.h"
#include "base/ranges/algorithm.h"
#include "components/attribution_reporting/event_report_windows.h"
#include "components/attribution_reporting/max_event_level_reports.h"
#include "components/attribution_reporting/trigger_config.h"
#include "third_party/abseil-cpp/absl/numeric/int128.h"
namespace content {
namespace {
// The max possible number of state combinations given a valid input.
// This comes from 20 maximum total reports, 20 reports per type, 5 windows per
// type, and 32 distinct trigger data values.
constexpr absl::uint128 kMaxNumCombinations =
absl::MakeUint128(/*high=*/9494472u, /*low=*/10758590974061625903u);
absl::uint128 RandGenerator(absl::uint128 range) {
DCHECK_GT(range, 0u);
uint64_t high = absl::Uint128High64(range);
return absl::MakeUint128(
/*high=*/high == 0u ? 0u : base::RandGenerator(high),
/*low=*/base::RandGenerator(absl::Uint128Low64(range)));
}
// Let B be the trigger data cardinality.
// For every trigger data i, there are wi windows and ci maximum reports.
// Let A[C, w1, ..., wB, c1, ..., cB] be the function which counts the number
// of output states.
//
// The following helper function memoizes the recurrence relation which computes
// this:
//
// 1. A[C,w1,...,wB,c1,...,cB] = 1 if B = 0
// If there are no trigger data types to consider, there is only one possible
// output, the null output.
//
// 2. A[C,w1,...,wB,c1,...,cB] = A[C,w1,...,w{B-1},c1,...,c{B-1}] if wB = 0
// If there are no windows to consider for a particular trigger data type, then
// consider only the remaining trigger data types.
//
// 3. A[C,w1,...,wB,c1,...,cB] = sum(A[C - j,w1,...,wB - 1,c1,...,cB - j],
// for j from 0 to min(c_B, C))
// Otherwise, we look at the number of possible outputs assuming we emit some
// number of reports (up to the max) for the current trigger data type under
// consideration. Given that each choice produces a distinct output, we sum
// these up.
absl::uint128 GetNumStatesRecursive(
attribution_reporting::TriggerSpecs::Iterator it,
int max_reports,
int window_val,
int max_reports_per_type,
internal::StateMap& map) {
// Case 1: "B = 0" there is nothing left to assign for the last data index.
// Also consider the trivial Case 2 -> Case 1 case without touching the cache
// or recursive calls.
auto cur = it++;
if (!cur || (window_val == 0 && !it)) {
return 1;
}
// Store these as 8 bit to optimize storage.
const uint8_t key[4] = {
base::checked_cast<uint8_t>(max_reports), //
it.index(), //
base::checked_cast<uint8_t>(window_val), //
base::checked_cast<uint8_t>(max_reports_per_type), //
};
absl::uint128& cached = map[base::numerics::U32FromNativeEndian(key)];
if (cached != 0) {
return cached;
}
// Case 2: wB = 0.
//
// TODO(csharrison): Use the actual spec's max reports when that is
// implemented. Currently we set `max_reports_per_type` to be equal to
// `max_reports` for every type, but in the future it will be specified on the
// `TriggerSpec` as part of the `summary_buckets` field.
if (window_val == 0) {
cached = GetNumStatesRecursive(
it, max_reports, (*it).second.event_report_windows().end_times().size(),
max_reports, map);
return cached;
}
// Case 3.
for (int i = 0; i <= std::min(max_reports_per_type, max_reports); i++) {
cached += GetNumStatesRecursive(cur, max_reports - i, window_val - 1,
max_reports_per_type - i, map);
}
return cached;
}
// A variant of the above algorithm which samples a report given an index.
// This follows a similarly structured algorithm.
void GetReportsFromIndexRecursive(
attribution_reporting::TriggerSpecs::Iterator it,
int max_reports,
int window_val,
int max_reports_per_type,
absl::uint128 index,
std::vector<FakeEventLevelReport>& reports,
internal::StateMap& map) {
// Case 1 and Case 2 -> 1. There are no more valid trigger data value, so
// generate nothing.
auto cur = it++;
if (!cur || (window_val == 0 && !it)) {
return;
}
// Case 2: there are no more windows to consider for the current trigger data,
// so generate based on the remaining trigger data types.
//
// TODO(csharrison): Use the actual spec's max reports when that is
// implemented. Currently we set `max_reports_per_type` to be equal to
// `max_reports` for every type, but in the future it will be specified on the
// `TriggerSpec` as part of the `summary_buckets` field.
if (window_val == 0) {
GetReportsFromIndexRecursive(
it, max_reports, (*it).second.event_report_windows().end_times().size(),
max_reports, index, reports, map);
return;
}
// Case 3: For the current window and trigger data under consideration, we
// need to choose how many reports we emit. Think of the index as pointing to
// a particular output, where outputs are partitioned by the # of reports to
// emit. E.g. think of each dash below as a possible output.
//
// 0 report 1 reports 2 reports
// |----------------------|---------------------|-----------|
// ^ ^
// prev_sum index
//
// The first thing we need to do is figure out how many reports to emit, this
// is as simple as just computing the # of states with 0 reports, 1 report,
// and so on until we find where the index slots in.
//
// Next, we "zoom in" to that partition of outputs in the recursive step to
// figure out what other reports we need to emit (if any). We consider a new
// index which just looks at the "dashes" before `index`, i.e. index' = index
// - prev_sum.
absl::uint128 prev_sum = 0;
for (int i = 0; i <= std::min(max_reports_per_type, max_reports); i++) {
absl::uint128 num_states = GetNumStatesRecursive(
cur, max_reports - i, window_val - 1, max_reports_per_type - i, map);
// The index is associated with emitting `i` reports
if (num_states + prev_sum > index) {
for (int k = 0; k < i; k++) {
reports.push_back(FakeEventLevelReport{.trigger_data = (*cur).first,
.window_index = window_val - 1});
}
DCHECK_GE(index - prev_sum, 0);
// Zoom into all other outputs that are associated with picking `i`
// reports for this config.
GetReportsFromIndexRecursive(cur, max_reports - i, window_val - 1,
max_reports_per_type - i, index - prev_sum,
reports, map);
return;
}
prev_sum += num_states;
}
NOTREACHED();
}
absl::uint128 GetNumStatesCached(
const attribution_reporting::TriggerSpecs& specs,
int max_reports,
internal::StateMap& map) {
if (specs.empty() || max_reports == 0) {
return 1;
}
auto it = specs.begin();
size_t num_windows = (*it).second.event_report_windows().end_times().size();
// Optimized fast-path.
if (specs.SingleSharedSpec()) {
int64_t states = internal::GetNumberOfStarsAndBarsSequences(
/*num_stars=*/max_reports,
/*num_bars=*/specs.size() * num_windows);
DCHECK_EQ(states, GetNumStatesRecursive(it, max_reports, num_windows,
max_reports, map));
DCHECK_GE(states, 0);
return states;
}
return GetNumStatesRecursive(it, max_reports, num_windows, max_reports, map);
}
} // namespace
RandomizedResponseData::RandomizedResponseData(double rate,
double channel_capacity,
RandomizedResponse response)
: rate_(rate),
channel_capacity_(channel_capacity),
response_(std::move(response)) {
DCHECK_GE(rate_, 0);
DCHECK_LE(rate_, 1);
DCHECK_GE(channel_capacity_, 0);
}
RandomizedResponseData::~RandomizedResponseData() = default;
RandomizedResponseData::RandomizedResponseData(const RandomizedResponseData&) =
default;
RandomizedResponseData& RandomizedResponseData::operator=(
const RandomizedResponseData&) = default;
RandomizedResponseData::RandomizedResponseData(RandomizedResponseData&&) =
default;
RandomizedResponseData& RandomizedResponseData::operator=(
RandomizedResponseData&&) = default;
bool GenerateWithRate(double r) {
DCHECK_GE(r, 0);
DCHECK_LE(r, 1);
return base::RandDouble() < r;
}
double GetRandomizedResponseRate(absl::uint128 num_states, double epsilon) {
DCHECK_GT(num_states, 0);
double num_states_double = static_cast<double>(num_states);
return num_states_double / (num_states_double - 1 + std::exp(epsilon));
}
absl::uint128 GetNumStates(
const attribution_reporting::TriggerSpecs& specs,
attribution_reporting::MaxEventLevelReports max_reports) {
internal::StateMap map;
return GetNumStatesCached(specs, max_reports, map);
}
RandomizedResponseData DoRandomizedResponse(
const attribution_reporting::TriggerSpecs& specs,
attribution_reporting::MaxEventLevelReports max_reports,
double epsilon) {
internal::StateMap map;
return internal::DoRandomizedResponseWithCache(specs, max_reports, epsilon,
map);
}
namespace internal {
int64_t BinomialCoefficient(int n, int k) {
DCHECK_GE(n, 0);
DCHECK_GE(k, 0);
if (k > n) {
return 0;
}
// Speed up some trivial cases.
if (k == n || n == 0) {
return 1;
}
// BinomialCoefficient(n, k) == BinomialCoefficient(n, n - k),
// So simplify if possible.
if (k > n - k) {
k = n - k;
}
// (n choose k) = n (n -1) ... (n - (k - 1)) / k!
// = mul((n + i - i) / i), i from 1 -> k.
//
// You might be surprised that this algorithm works just fine with integer
// division (i.e. division occurs cleanly with no remainder). However, this is
// true for a very simple reason. Imagine a value of `i` causes division with
// remainder in the below algorithm. This immediately implies that
// (n choose i) is fractional, which we know is not the case.
int64_t result = 1;
for (int i = 1; i <= k; i++) {
result = base::CheckMul(result, n + 1 - i).ValueOrDie();
DCHECK_EQ(0, result % i);
result = result / i;
}
return result;
}
// Computes the `combination_index`-th lexicographically smallest k-combination.
// https://en.wikipedia.org/wiki/Combinatorial_number_system
// A k-combination is a sequence of k non-negative integers in decreasing order.
// a_k > a_{k-1} > ... > a_2 > a_1 >= 0.
// k-combinations can be ordered lexicographically, with the smallest
// k-combination being a_k=k-1, a_{k-1}=k-2, .., a_1=0. Given an index
// `combination_index`>=0, and an order k, this method returns the
// `combination_index`-th smallest k-combination.
//
// Given an index `combination_index`, the `combination_index`-th k-combination
// is the unique set of k non-negative integers
// a_k > a_{k-1} > ... > a_2 > a_1 >= 0
// such that `combination_index` = \sum_{i=1}^k {a_i}\choose{i}
//
// We find this set via a simple greedy algorithm.
// http://math0.wvstateu.edu/~baker/cs405/code/Combinadics.html
std::vector<int> GetKCombinationAtIndex(int64_t combination_index, int k) {
DCHECK_GE(combination_index, 0);
DCHECK_GE(k, 0);
// `k` can be no more than max number of event level reports per source (20).
DCHECK_LE(k, 20);
std::vector<int> output_k_combination;
output_k_combination.reserve(k);
if (k == 0) {
return output_k_combination;
}
// To find a_k, iterate candidates upwards from 0 until we've found the
// maximum a such that (a choose k) <= `combination_index`. Let a_k = a. Use
// the previous binomial coefficient to compute the next one. Note: possible
// to speed this up via something other than incremental search.
int64_t target = combination_index;
int candidate = k - 1;
int64_t binomial_coefficient = 0; // BinomialCoefficient(candidate, k)
int64_t next_binomial_coefficient = 1; // BinomialCoefficient(candidate+1, k)
while (next_binomial_coefficient <= target) {
candidate++;
binomial_coefficient = next_binomial_coefficient;
DCHECK_EQ(binomial_coefficient, BinomialCoefficient(candidate, k));
// (n + 1 choose k) = (n choose k) * (n + 1) / (n + 1 - k)
next_binomial_coefficient =
base::CheckMul(binomial_coefficient, candidate + 1).ValueOrDie();
next_binomial_coefficient /= candidate + 1 - k;
}
// We know from the k-combination definition, all subsequent values will be
// strictly decreasing. Find them all by decrementing `candidate`.
// Use the previous binomial coefficient to compute the next one.
int current_k = k;
while (true) {
// The optimized code below maintains this loop invariant.
DCHECK_EQ(binomial_coefficient, BinomialCoefficient(candidate, current_k));
if (binomial_coefficient <= target) {
output_k_combination.push_back(candidate);
target -= binomial_coefficient;
if (static_cast<int>(output_k_combination.size()) == k) {
DCHECK_EQ(target, 0);
return output_k_combination;
}
// (n - 1 choose k - 1) = (n choose k) * k / n
binomial_coefficient = binomial_coefficient * (current_k) / candidate;
current_k--;
candidate--;
} else {
// (n - 1 choose k) = (n choose k) * (n - k) / n
binomial_coefficient =
binomial_coefficient * (candidate - current_k) / candidate;
candidate--;
}
}
}
std::vector<FakeEventLevelReport> GetFakeReportsForSequenceIndex(
const attribution_reporting::TriggerSpecs& specs,
int max_reports,
absl::uint128 index,
StateMap& map) {
std::vector<FakeEventLevelReport> reports;
if (specs.empty() || max_reports == 0) {
return reports;
}
auto it = specs.begin();
GetReportsFromIndexRecursive(
it, max_reports, (*it).second.event_report_windows().end_times().size(),
max_reports, index, reports, map);
return reports;
}
int64_t GetNumberOfStarsAndBarsSequences(int num_stars, int num_bars) {
return BinomialCoefficient(num_stars + num_bars, num_stars);
}
std::vector<int> GetStarIndices(int num_stars,
int num_bars,
int64_t sequence_index) {
DCHECK_LT(sequence_index,
GetNumberOfStarsAndBarsSequences(num_stars, num_bars));
return GetKCombinationAtIndex(sequence_index, num_stars);
}
std::vector<int> GetBarsPrecedingEachStar(std::vector<int> out) {
DCHECK(base::ranges::is_sorted(out, std::greater{}));
for (size_t i = 0u; i < out.size(); i++) {
int star_index = out[i];
// There are `star_index` prior positions in the sequence, and `i` prior
// stars, so there are `star_index` - `i` prior bars.
out[i] = star_index - (out.size() - 1 - i);
}
return out;
}
double BinaryEntropy(double p) {
if (p == 0 || p == 1) {
return 0;
}
return -p * log2(p) - (1 - p) * log2(1 - p);
}
double ComputeChannelCapacity(absl::uint128 num_states,
double randomized_response_rate) {
DCHECK_GT(num_states, 0u);
DCHECK_GE(randomized_response_rate, 0);
DCHECK_LE(randomized_response_rate, 1);
// The capacity of a unary channel is 0. This follows from the definition
// of mutual information.
if (num_states == 1u || randomized_response_rate == 1) {
return 0;
}
double num_states_double = static_cast<double>(num_states);
double p =
randomized_response_rate * (num_states_double - 1) / num_states_double;
return log2(num_states_double) - BinaryEntropy(p) -
p * log2(num_states_double - 1);
}
std::vector<FakeEventLevelReport> GetFakeReportsForSequenceIndex(
const attribution_reporting::TriggerSpecs& specs,
int max_reports,
int64_t random_stars_and_bars_sequence_index) {
const attribution_reporting::TriggerSpec* single_spec =
specs.SingleSharedSpec();
CHECK(single_spec);
const int trigger_data_cardinality = specs.size();
const std::vector<int> bars_preceding_each_star =
GetBarsPrecedingEachStar(GetStarIndices(
/*num_stars=*/max_reports,
/*num_bars=*/trigger_data_cardinality *
single_spec->event_report_windows().end_times().size(),
/*sequence_index=*/random_stars_and_bars_sequence_index));
std::vector<FakeEventLevelReport> fake_reports;
// an output state is uniquely determined by an ordering of c stars and w*d
// bars, where:
// w = the number of reporting windows
// c = the maximum number of reports for a source
// d = the trigger data cardinality for a source
for (int num_bars : bars_preceding_each_star) {
if (num_bars == 0) {
continue;
}
auto result = std::div(num_bars - 1, trigger_data_cardinality);
const int trigger_data_index = result.rem;
DCHECK_GE(trigger_data_index, 0);
DCHECK_LT(trigger_data_index, trigger_data_cardinality);
fake_reports.push_back({
.trigger_data =
std::next(specs.trigger_data_indices().begin(), trigger_data_index)
->first,
.window_index = result.quot,
});
}
DCHECK_LE(fake_reports.size(), static_cast<size_t>(max_reports));
return fake_reports;
}
RandomizedResponseData DoRandomizedResponseWithCache(
const attribution_reporting::TriggerSpecs& specs,
int max_reports,
double epsilon,
StateMap& map) {
const absl::uint128 num_states = GetNumStatesCached(specs, max_reports, map);
double rate = GetRandomizedResponseRate(num_states, epsilon);
std::optional<std::vector<FakeEventLevelReport>> fake_reports;
if (GenerateWithRate(rate)) {
// TODO(csharrison): Justify the fast path with `single_spec` with
// profiling.
//
// Note: we can implement the fast path in more cases than a single shared
// spec if all of the specs have the same # of windows and reports. We can
// consider further optimizing if it's useful. The existing code will cover
// the default specs for navigation / event sources.
const absl::uint128 sequence_index = RandGenerator(num_states);
DCHECK_GE(sequence_index, 0);
DCHECK_LT(sequence_index, kMaxNumCombinations);
fake_reports = specs.SingleSharedSpec()
? internal::GetFakeReportsForSequenceIndex(
specs, max_reports,
base::checked_cast<int64_t>(
absl::Uint128Low64(sequence_index)))
: internal::GetFakeReportsForSequenceIndex(
specs, max_reports, sequence_index, map);
}
return RandomizedResponseData(
rate, internal::ComputeChannelCapacity(num_states, rate),
std::move(fake_reports));
}
} // namespace internal
} // namespace content