// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/federated_learning/sim_hash.h"

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <set>

#include "base/check_op.h"
#include "base/hash/legacy_hash.h"
#include "base/rand_util.h"
#include "testing/gtest/include/gtest/gtest.h"
| |
| namespace federated_learning { |
| |
| namespace { |
| |
// Sparse representation of a 2^64 bit vector. Each number in the set represents
// the position of a bit that is being set; absent positions are 0 bits.
using LargeBitVector = std::set<uint64_t>;
| |
| // Roll our own random uint64_t generator as we want deterministic outcome |
| // across different test runs. |
| uint64_t RandUint64() { |
| static uint64_t seed = 0; |
| ++seed; |
| uint64_t arr[2] = {1234ULL, 4321ULL}; |
| return base::legacy::CityHash64WithSeed( |
| base::as_bytes( |
| base::make_span(reinterpret_cast<const char*>(arr), sizeof(arr))), |
| seed); |
| } |
| |
| // Inclusive [min, max] |
| uint64_t RandUint64InRange(uint64_t min, uint64_t max) { |
| DCHECK_LE(min, max); |
| if (min == 0ULL && max == std::numeric_limits<uint64_t>::max()) |
| return RandUint64(); |
| |
| uint64_t range = max - min + 1; |
| DCHECK_LE(2ULL, range); |
| |
| // We must discard random results above this number, as they would |
| // make the random generator non-uniform. |
| uint64_t max_acceptable_value = |
| (std::numeric_limits<uint64_t>::max() / range) * range - 1; |
| |
| uint64_t rand_value; |
| do { |
| rand_value = RandUint64(); |
| } while (rand_value > max_acceptable_value); |
| |
| uint64_t result = (rand_value % range) + min; |
| |
| DCHECK_GE(result, min); |
| DCHECK_LE(result, max); |
| return result; |
| } |
| |
| LargeBitVector RandLargeBitVector( |
| size_t number_of_bits_set, |
| uint64_t max_bit_position = std::numeric_limits<uint64_t>::max()) { |
| LargeBitVector result; |
| while (result.size() < number_of_bits_set) { |
| result.insert(RandUint64InRange(0ULL, max_bit_position)); |
| } |
| return result; |
| } |
| |
| float DotProduct(const LargeBitVector& v1, const LargeBitVector& v2) { |
| float result = 0; |
| for (uint64_t pos : v1) { |
| if (v2.count(pos)) |
| result += 1; |
| } |
| return result; |
| } |
| |
| float Norm(const LargeBitVector& v) { |
| return std::sqrt(DotProduct(v, v)); |
| } |
| |
| WeightedFeatures ToWeightedFeatures(const LargeBitVector& v) { |
| WeightedFeatures result; |
| for (FeatureEncoding feature : v) { |
| result.emplace(feature, FeatureWeight(1)); |
| } |
| return result; |
| } |
| |
| size_t MultipleSimHashGetNumOutputBitsEqual(size_t repeat_times, |
| uint8_t dimensions, |
| const WeightedFeatures& f1, |
| const WeightedFeatures& f2) { |
| uint64_t seed1 = 1; |
| uint64_t seed2 = 100000; |
| |
| uint64_t num_output_bits_equal = 0; |
| for (size_t i = 0; i < repeat_times; ++i) { |
| SetSeedsForTesting(seed1++, seed2++); |
| |
| uint64_t o1 = SimHashWeightedFeatures(f1, dimensions); |
| uint64_t o2 = SimHashWeightedFeatures(f2, dimensions); |
| for (uint8_t j = 0; j < dimensions; ++j) { |
| if ((o1 & 1) == (o2 & 1)) |
| ++num_output_bits_equal; |
| o1 >>= 1; |
| o2 >>= 1; |
| } |
| } |
| return num_output_bits_equal; |
| } |
| |
| TEST(SimHashTest, HashValue) { |
| WeightedFeatures empty; |
| EXPECT_EQ(SimHashWeightedFeatures(empty, 1u), 0ULL); |
| EXPECT_EQ(SimHashWeightedFeatures(empty, 16u), 0ULL); |
| |
| WeightedFeatures f0; |
| f0.emplace(FeatureEncoding(0), FeatureWeight(1)); |
| EXPECT_EQ(SimHashWeightedFeatures(f0, 1u), 0ULL); |
| EXPECT_EQ(SimHashWeightedFeatures(f0, 16u), 8632ULL); |
| |
| WeightedFeatures f1; |
| f1.emplace(FeatureEncoding(0), FeatureWeight(123)); |
| EXPECT_EQ(SimHashWeightedFeatures(f1, 1u), 0ULL); |
| EXPECT_EQ(SimHashWeightedFeatures(f1, 16u), 8632ULL); |
| |
| WeightedFeatures f2; |
| f2.emplace(FeatureEncoding(999999), FeatureWeight(1)); |
| EXPECT_EQ(SimHashWeightedFeatures(f2, 1u), 1ULL); |
| EXPECT_EQ(SimHashWeightedFeatures(f2, 16u), 28603ULL); |
| |
| WeightedFeatures f3; |
| f3.emplace(FeatureEncoding(0), FeatureWeight(1)); |
| f3.emplace(FeatureEncoding(1), FeatureWeight(1)); |
| EXPECT_EQ(SimHashWeightedFeatures(f3, 1u), 0ULL); |
| EXPECT_EQ(SimHashWeightedFeatures(f3, 16u), 10682ULL); |
| |
| WeightedFeatures f4; |
| f4.emplace(FeatureEncoding(0), FeatureWeight(2)); |
| f4.emplace(FeatureEncoding(1), FeatureWeight(3)); |
| EXPECT_EQ(SimHashWeightedFeatures(f4, 1u), 0ULL); |
| EXPECT_EQ(SimHashWeightedFeatures(f4, 16u), 2490ULL); |
| } |
| |
| // Test that given random equally weighted input features, the chances of each |
| // bit in the SimHash result to be 0 and 1 are equally likely. |
| TEST(SimHashTest, ExpectationOnRandomUniformInput) { |
| const uint8_t dimensions = 16u; |
| const size_t repeat_times = 10000u; |
| |
| uint64_t totals[dimensions] = {0}; |
| |
| for (size_t i = 0; i < repeat_times; ++i) { |
| LargeBitVector v = RandLargeBitVector(RandUint64InRange(1u, 10u)); |
| uint64_t hash_result = |
| SimHashWeightedFeatures(ToWeightedFeatures(v), dimensions); |
| |
| for (uint8_t j = 0; j < dimensions; ++j) { |
| totals[j] += (hash_result & 1); |
| hash_result >>= 1; |
| } |
| } |
| |
| const double expectation = 0.5; |
| const double err_tolerance = 0.03; |
| |
| for (uint8_t j = 0; j < dimensions; ++j) { |
| double avg = 1.0 * totals[j] / repeat_times; |
| EXPECT_LT(avg, expectation + err_tolerance); |
| EXPECT_GT(avg, expectation - err_tolerance); |
| } |
| } |
| |
| // Test that the cosine similarity is preserved. |
| TEST(SimHashTest, CosineSimilarity_NonOrthogonalInput) { |
| const float kPi = 3.141592653589793; |
| const uint8_t dimensions = 50u; |
| const size_t repeat_times = 100u; |
| |
| // Generate v1 and v2 that are likely non-orthogonal. |
| LargeBitVector v1 = RandLargeBitVector(4000u, 10000); |
| LargeBitVector v2 = RandLargeBitVector(5000u, 10000); |
| |
| size_t num_output_bits_equal = MultipleSimHashGetNumOutputBitsEqual( |
| repeat_times, dimensions, ToWeightedFeatures(v1), ToWeightedFeatures(v2)); |
| float avg = 1.0 * num_output_bits_equal / dimensions / repeat_times; |
| float expectation = |
| 1.0 - std::acos(DotProduct(v1, v2) / (Norm(v1) * Norm(v2))) / kPi; |
| |
| // Verify that the expectation is different from 0.5 and 1 to make sure the |
| // test is non-trivial. |
| EXPECT_LT(expectation, 0.7); |
| EXPECT_GT(expectation, 0.6); |
| |
| // Verify that SimHash(v1) and SimHash(v2) have approximately |expectation| |
| // fraction of their bits in common. |
| const double err_tolerance = 0.03; |
| EXPECT_LT(avg, expectation + err_tolerance); |
| EXPECT_GT(avg, expectation - err_tolerance); |
| } |
| |
| // Test that when input v1 and v2 are orthogonal, SimHash(v1) and SimHash(v2) |
| // have approximately half their bits in common. |
| TEST(SimHashTest, CosineSimilarity_OrthogonalInput) { |
| const uint8_t dimensions = 50u; |
| const size_t repeat_times = 100u; |
| |
| // Generate v1 and v2 that are likely orthogonal |
| LargeBitVector v1 = RandLargeBitVector(1u); |
| LargeBitVector v2 = RandLargeBitVector(1u); |
| |
| size_t num_output_bits_equal = MultipleSimHashGetNumOutputBitsEqual( |
| repeat_times, dimensions, ToWeightedFeatures(v1), ToWeightedFeatures(v2)); |
| float avg = 1.0 * num_output_bits_equal / dimensions / repeat_times; |
| float expectation = 0.5; |
| |
| // Verify that SimHash(v1) and SimHash(v2) have approximately half their bits |
| // in common. |
| const double err_tolerance = 0.03; |
| EXPECT_LT(avg, expectation + err_tolerance); |
| EXPECT_GT(avg, expectation - err_tolerance); |
| } |
| |
| } // namespace |
| |
| } // namespace federated_learning |