blob: a91cfb2dd2af9eb002f06b451dd915215e2de0cb [file] [log] [blame]
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <stddef.h>
#include <stdint.h>
#include <functional>
#include <limits>
#include <type_traits>
#include <utility>
#include <vector>
#include "base/bits.h"
#include "base/logging.h"
#include "base/macros.h"
#include "base/numerics/safe_conversions.h"
namespace url_pattern_index {
template <typename KeyType_, typename Hasher_>
class SimpleQuadraticProber;
template <typename KeyType_, typename ValueType_, typename Prober_>
class ClosedHashMap;
// The default Prober used by the HashMap.
template <typename KeyType_, typename Hasher_>
using DefaultProber = SimpleQuadraticProber<KeyType_, Hasher_>;
// The default HashMap data structure meant to be used. For more fine-grained
// control one can use ClosedHashMap with their own Prober.
template <typename KeyType_,
typename ValueType_,
typename Hasher_ = std::hash<KeyType_>>
using HashMap =
ClosedHashMap<KeyType_, ValueType_, DefaultProber<KeyType_, Hasher_>>;
// An insert-only map implemented as a hash-table with open addressing (also
// called closed hashing). The table can contain up to 2^30 distinct keys (or up
// to 2^32 - 1 on 64bit systems).
// On normal operation load factor varies within range (1/4, 1/2]. The real load
// factor can be less or equal to 1/4 if rehashing is requested explicitly to
// allocate more memory for future insertions.
// The table discloses its internal structure in order to allow converting it to
// different formats. It consists of 2 vectors, |hash_table| and |entries|. The
// |entries| is a vector of pair<KeyType, ValueType>, whereas |hash_table|'s
// elements (also called slots) are indexes into the |entries| vector. If, for
// some i, hash_table[i] >= entries.size() then the i-th slot is empty. It is
// guaranteed that each of the entries is referenced by exactly one table slot.
// The Prober is a strategy class used to find a slot for a particular key by
// probing the key against a sequence of slots called a probe sequence. Often it
// stores one or more hashers - functors used to calculate hash values of the
// keys in order to parameterize the probe sequence. The Prober class should
// provide the following:
// Public type(s):
// KeyType - the type of the keys the table can store. Should be equal to the
// ClosedHashTable's KeyType.
// Public method(s):
// template <typename SlotCompare>
// size_t FindSlot(const KeyType& key,
// size_t table_size,
// const SlotCompare& compare) const;
// Walks the probe sequence for the given |key| starting from some initial
// slot calculated deterministically from the |key|, e.g. by computing its
// hash code. The walk continues until it finds a hash table slot such that
// compare(key, slot) returns true (which, for instance, can mean that the
// slot is empty or its key is equal to |key|). Returns the index of that slot
// in [0, table_size). The method is required to perform a finite number of
// probes until it finds a slot.
// The same Prober class can be used to ensure compatibility when performing
// lookups in two different representations of what is conceptually the same
// hash map.
// TODO(pkalinnikov): Add key comparator.
template <typename KeyType_, typename ValueType_, typename Prober_>
class ClosedHashMap {
using KeyType = KeyType_;
using ValueType = ValueType_;
using EntryType = std::pair<KeyType, ValueType>;
using Prober = Prober_;
static_assert(std::is_same<KeyType, typename Prober::KeyType>::value,
"The KeyType should be the same in the Prober.");
// Creates an empty hash-map with the default prober, and space allocated for
// 4 distinct keys.
ClosedHashMap() : ClosedHashMap(4) {}
// Creates an empty hash-map with the specified |capacity|, so that this many
// distinct keys can be inserted with no rehashing or reallocation taking
// place. The |prober| is a strategy used for finding slots for the keys.
explicit ClosedHashMap(size_t capacity, const Prober& prober = Prober())
: hash_table_(CalculateHashTableSizeFor(capacity), EmptySlot()),
prober_(prober) {
// Returns the number of distinct keys.
size_t size() const { return entries_.size(); }
// Returns the number of slots in the |hash_table|.
size_t table_size() const { return hash_table_.size(); }
const std::vector<uint32_t>& hash_table() const { return hash_table_; }
const std::vector<EntryType>& entries() const { return entries_; }
// Returns a pointer to the value stored for the given |key|, or nullptr if
// the |key| is not present in the map. The returned pointer is guaranteed to
// be valid as long as no new keys get added to the map (more strictly
// speaking, as long as no reallocations happen for the |entries| vector).
const ValueType* Get(const KeyType& key) const {
const size_t entry_index = hash_table_[FindSlotForKey(key)];
if (entry_index == EmptySlot())
return nullptr;
DCHECK_LT(entry_index, entries_.size());
return &entries_[entry_index].second;
// Associates |value| with the given |key| if the |key| is not yet present in
// the map, and returns true. Otherwise does nothing and returns false.
bool Insert(const KeyType& key, ValueType value) {
const size_t slot = FindSlotForKey(key);
if (hash_table_[slot] != EmptySlot())
return false;
EmplaceKeyValue(slot, key, std::move(value));
return true;
// Returns a reference to the value stored for the given |key|. If the |key|
// is not present in the map, it's inserted and value-initialized. The same
// guarantee on reference validity applies as for the result of Get(key).
ValueType& operator[](const KeyType& key) {
const size_t slot = FindSlotForKey(key);
size_t entry_index = hash_table_[slot];
if (entry_index == EmptySlot()) {
entry_index = EmplaceKeyValue(slot, key, ValueType());
DCHECK_LT(entry_index, entries_.size());
return entries_[entry_index].second;
// Resizes the |hash_table|, if necessary, so that at least |capacity|
// distinct keys can be stored with the load factor being no higher than 1/2.
void Rehash(size_t capacity) {
if (capacity <= hash_table_.size() / 2)
DCHECK_LE(entries_.size(), static_cast<size_t>(EmptySlot()));
hash_table_.assign(CalculateHashTableSizeFor(capacity), EmptySlot());
for (size_t index = 0; index != entries_.size(); ++index) {
const size_t slot = FindSlotForKey(entries_[index].first);
DCHECK_EQ(hash_table_[slot], EmptySlot());
hash_table_[slot] = static_cast<uint32_t>(index);
// Reserves enough space so that |capacity| distinct keys can be stored with
// the load factor being no higher than 1/2.
void Reserve(size_t capacity) {
// The indicator of absent entry for a certain slot in |hash_table|.
static constexpr uint32_t EmptySlot() {
return std::numeric_limits<uint32_t>::max();
// Returns the number of |hash_table| slots necessary to maintain a load
// factor between 1/4 and 1/2 for the number of distinct keys given by
// |capacity|.
static size_t CalculateHashTableSizeFor(size_t capacity) {
// TODO(pkalinnikov): Implement base::bits::Log2Ceiling for arbitrary types.
const uint32_t capacity_32 = base::checked_cast<uint32_t>(capacity);
const int power_of_two = base::bits::Log2Ceiling(capacity_32) + 1;
CHECK_LT(power_of_two, std::numeric_limits<size_t>::digits);
return static_cast<size_t>(1) << power_of_two;
// Adds a new |key|-|value| pair to the structure. Returns the index of the
// newly created entry. The table is rehashed if the newly created entry
// bursts the load factor above 1/2. Otherwise the new entry is associated
// with a specific |slot| of the table.
size_t EmplaceKeyValue(size_t slot, KeyType key, ValueType value) {
const size_t entry_index = entries_.size();
CHECK_LT(entry_index, static_cast<size_t>(EmptySlot()));
entries_.emplace_back(std::move(key), std::move(value));
if (entry_index >= hash_table_.size() / 2) {
Rehash(entry_index + 1);
} else {
DCHECK_LT(slot, hash_table_.size());
DCHECK_EQ(hash_table_[slot], EmptySlot());
hash_table_[slot] = static_cast<uint32_t>(entry_index);
return entry_index;
// Finds a slot such that it's either empty (indicating that the |key| is not
// stored) or contains the |key|.
size_t FindSlotForKey(const KeyType& key) const {
return prober_.FindSlot(
key, hash_table_.size(), [this](const KeyType& key, size_t slot_index) {
DCHECK_LT(slot_index, hash_table_.size());
const uint32_t entry_index = hash_table_[slot_index];
DCHECK(entry_index == EmptySlot() || entry_index < entries_.size());
return entry_index == EmptySlot() ||
entries_[entry_index].first == key;
// Contains indices into |entries_|, or EmptySlot() for free table slots.
// Always contains at least twice as many slots as there are elements in
// |entries_|, i.e. the load factor is always less than or equal to 1/2.
std::vector<uint32_t> hash_table_;
// Contains all the inserted key-value pairs. The keys are unique. The size of
// the vector is never greater than EmptySlot().
std::vector<EntryType> entries_;
// The strategy used to find a slot for a key.
Prober prober_;
// The implementation of Prober that uses a simple form of quadratic probing.
// That is, for a given |key| the sequence of probes is the following (modulo
// the hash table size):
// hash(key), hash(key) + 1, hash(key) + 3, ..., hash(key) + k * (k - 1) / 2
// To use this prober the hash table should maintain its size M equal to powers
// of two. It can be shown that in this case the aforementioned quadratic probe
// sequence for k <= M visits k distinct table slots.
// Template parameters:
// KeyType_ - the type of the table's keys.
// Hasher_ - the type of the functor used to calculate hashes of keys.
template <typename KeyType_, typename Hasher_>
class SimpleQuadraticProber {
using KeyType = KeyType_;
using Hasher = Hasher_;
// Constructs the prober that uses the |hasher| functor to hash the keys.
explicit SimpleQuadraticProber(const Hasher& hasher = Hasher())
: hasher_(hasher) {}
// Returns the slot index for the |key|. Requires that |compare| returns true
// for at least one slot index between 0 and |table_size| - 1.
template <typename SlotCompare>
size_t FindSlot(const KeyType& key,
size_t table_size,
const SlotCompare& compare) const {
DCHECK_GT(table_size, 0u);
DCHECK_EQ(table_size & (table_size - 1), 0u);
const size_t kMask = table_size - 1;
size_t slot_index = hasher_(key) & kMask;
// The loop will always be finite, since |compare| is guaranteed to return
// true for at least 1 slot index, and the probe sequence visits all slots.
for (size_t step_size = 1; !compare(key, slot_index); ++step_size) {
DCHECK_LT(step_size, table_size);
slot_index = (slot_index + step_size) & kMask;
DCHECK_LT(slot_index, table_size);
return slot_index;
// The functor used to calculate hashes of keys.
Hasher hasher_;
} // namespace url_pattern_index