blob: 42ca63ba32edc6ab2554b4ed06c31b5168e02481 [file]
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_
#define BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_
#include <algorithm>
#include <atomic>
#include "base/allocator/buildflags.h"
#include "base/allocator/partition_allocator/partition_alloc_check.h"
#include "base/allocator/partition_allocator/partition_alloc_config.h"
#include "base/allocator/partition_allocator/yield_processor.h"
#include "base/base_export.h"
#include "base/compiler_specific.h"
#include "base/thread_annotations.h"
#include "build/build_config.h"
#if BUILDFLAG(IS_WIN)
#include "base/win/windows_types.h"
#endif
#if BUILDFLAG(IS_POSIX)
#include <errno.h>
#include <pthread.h>
#endif
#if BUILDFLAG(IS_APPLE)
#include <os/lock.h>
// os_unfair_lock is available starting with OS X 10.12, and Chromium targets
// 10.11 at the minimum, so the symbols are not always available *at runtime*.
// But we build with a 11.x SDK, so it's always in the headers.
//
// However, since the majority of clients have at least 10.12 (released late
// 2016), we declare the symbols here, marking them weak. They will be nullptr
// on 10.11, and defined on more recent versions.
// Silence the compiler warning, here and below.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunguarded-availability"
#define PA_WEAK __attribute__((weak))
extern "C" {
PA_WEAK void os_unfair_lock_lock(os_unfair_lock_t lock);
PA_WEAK bool os_unfair_lock_trylock(os_unfair_lock_t lock);
PA_WEAK void os_unfair_lock_unlock(os_unfair_lock_t lock);
} // extern "C"
#pragma clang diagnostic pop
#endif // BUILDFLAG(IS_APPLE)
#if BUILDFLAG(IS_FUCHSIA)
#include <lib/sync/mutex.h>
#endif
namespace partition_alloc::internal {
// The behavior of this class depends on whether PA_HAS_FAST_MUTEX is defined.
// 1. When it is defined:
//
// Simple spinning lock. It will spin in user space a set number of times before
// going into the kernel to sleep.
//
// This is intended to give "the best of both worlds" between a SpinLock and
// base::Lock:
// - SpinLock: Inlined fast path, no external function calls, just
// compare-and-swap. Short waits do not go into the kernel. Good behavior in
// low contention cases.
// - base::Lock: Good behavior in case of contention.
//
// We don't rely on base::Lock which we could make spin (by calling Try() in a
// loop), as performance is below a custom spinlock as seen on high-level
// benchmarks. Instead this implements a simple non-recursive mutex on top of
// the futex() syscall on Linux, and SRWLock on Windows. The main difference
// between this and a libc implementation is that it only supports the simplest
// path: private (to a process), non-recursive mutexes with no priority
// inheritance, no timed waits.
//
// As an interesting side-effect to be used in the allocator, this code does not
// make any allocations, locks are small with a constexpr constructor and no
// destructor.
//
// 2. Otherwise: This is a simple SpinLock, in the sense that it does not have
// any awareness of other threads' behavior. One exception: x86 macOS uses
// os_unfair_lock() if available, which is the case for macOS >= 10.12, that is
// most clients.
class LOCKABLE BASE_EXPORT SpinningMutex {
public:
inline constexpr SpinningMutex();
ALWAYS_INLINE void Acquire() EXCLUSIVE_LOCK_FUNCTION();
ALWAYS_INLINE void Release() UNLOCK_FUNCTION();
ALWAYS_INLINE bool Try() EXCLUSIVE_TRYLOCK_FUNCTION(true);
void AssertAcquired() const {} // Not supported.
void Reinit() UNLOCK_FUNCTION();
private:
void LockSlow() EXCLUSIVE_LOCK_FUNCTION();
// See below, the latency of PA_YIELD_PROCESSOR can be as high as ~150
// cycles. Meanwhile, sleeping costs a few us. Spinning 64 times at 3GHz would
// cost 150 * 64 / 3e9 ~= 3.2us.
//
// This applies to Linux kernels, on x86_64. On ARM we might want to spin
// more.
static constexpr int kSpinCount = 64;
#if defined(PA_HAS_FAST_MUTEX)
#if defined(PA_HAS_LINUX_KERNEL)
void FutexWait();
void FutexWake();
static constexpr int kUnlocked = 0;
static constexpr int kLockedUncontended = 1;
static constexpr int kLockedContended = 2;
std::atomic<int32_t> state_{kUnlocked};
#elif BUILDFLAG(IS_WIN)
CHROME_SRWLOCK lock_ = SRWLOCK_INIT;
#elif BUILDFLAG(IS_POSIX)
pthread_mutex_t lock_ = PTHREAD_MUTEX_INITIALIZER;
#elif BUILDFLAG(IS_FUCHSIA)
sync_mutex lock_;
#endif
#else // defined(PA_HAS_FAST_MUTEX)
std::atomic<bool> lock_{false};
#if BUILDFLAG(IS_APPLE) && !defined(PA_NO_OS_UNFAIR_LOCK_CRBUG_1267256)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunguarded-availability"
os_unfair_lock unfair_lock_ = OS_UNFAIR_LOCK_INIT;
#pragma clang diagnostic pop
#endif // BUILDFLAG(IS_APPLE) && !defined(PA_NO_OS_UNFAIR_LOCK_CRBUG_1267256)
// Spinlock-like, fallback.
ALWAYS_INLINE bool TrySpinLock();
ALWAYS_INLINE void ReleaseSpinLock();
void LockSlowSpinLock();
#endif
};
ALWAYS_INLINE void SpinningMutex::Acquire() {
int tries = 0;
int backoff = 1;
// Busy-waiting is inlined, which is fine as long as we have few callers. This
// is only used for the partition lock, so this is the case.
do {
if (LIKELY(Try()))
return;
// Note: Per the intel optimization manual
// (https://software.intel.com/content/dam/develop/public/us/en/documents/64-ia-32-architectures-optimization-manual.pdf),
// the "pause" instruction is more costly on Skylake Client than on previous
// architectures. The latency is found to be 141 cycles
// there (from ~10 on previous ones, nice 14x).
//
// According to Agner Fog's instruction tables, the latency is still >100
// cycles on Ice Lake, and from other sources, seems to be high as well on
// Adler Lake. Separately, it is (from
// https://agner.org/optimize/instruction_tables.pdf) also high on AMD Zen 3
// (~65). So just assume that it's this way for most x86_64 architectures.
//
// Also, loop several times here, following the guidelines in section 2.3.4
// of the manual, "Pause latency in Skylake Client Microarchitecture".
for (int yields = 0; yields < backoff; yields++) {
PA_YIELD_PROCESSOR;
tries++;
}
constexpr int kMaxBackoff = 16;
backoff = std::min(kMaxBackoff, backoff << 1);
} while (tries < kSpinCount);
LockSlow();
}
inline constexpr SpinningMutex::SpinningMutex() = default;
#if defined(PA_HAS_FAST_MUTEX)
#if defined(PA_HAS_LINUX_KERNEL)
ALWAYS_INLINE bool SpinningMutex::Try() {
// Using the weak variant of compare_exchange(), which may fail spuriously. On
// some architectures such as ARM, CAS is typically performed as a LDREX/STREX
// pair, where the store may fail. In the strong version, there is a loop
// inserted by the compiler to retry in these cases.
//
// Since we are retrying in Lock() anyway, there is no point having two nested
// loops.
int expected = kUnlocked;
return (state_.load(std::memory_order_relaxed) == expected) &&
state_.compare_exchange_weak(expected, kLockedUncontended,
std::memory_order_acquire,
std::memory_order_relaxed);
}
ALWAYS_INLINE void SpinningMutex::Release() {
if (UNLIKELY(state_.exchange(kUnlocked, std::memory_order_release) ==
kLockedContended)) {
// |kLockedContended|: there is a waiter to wake up.
//
// Here there is a window where the lock is unlocked, since we just set it
// to |kUnlocked| above. Meaning that another thread can grab the lock
// in-between now and |FutexWake()| waking up a waiter. Aside from
// potentially fairness, this is not an issue, as the newly-awaken thread
// will check that the lock is still free.
//
// There is a small pessimization here though: if we have a single waiter,
// then when it wakes up, the lock will be set to |kLockedContended|, so
// when this waiter releases the lock, it will needlessly call
// |FutexWake()|, even though there are no waiters. This is supported by the
// kernel, and is what bionic (Android's libc) also does.
FutexWake();
}
}
#elif BUILDFLAG(IS_WIN)
ALWAYS_INLINE bool SpinningMutex::Try() {
return !!::TryAcquireSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
}
ALWAYS_INLINE void SpinningMutex::Release() {
::ReleaseSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
}
#elif BUILDFLAG(IS_POSIX)
ALWAYS_INLINE bool SpinningMutex::Try() {
int retval = pthread_mutex_trylock(&lock_);
PA_DCHECK(retval == 0 || retval == EBUSY);
return retval == 0;
}
ALWAYS_INLINE void SpinningMutex::Release() {
int retval = pthread_mutex_unlock(&lock_);
PA_DCHECK(retval == 0);
}
#elif BUILDFLAG(IS_FUCHSIA)
ALWAYS_INLINE bool SpinningMutex::Try() {
return sync_mutex_trylock(&lock_) == ZX_OK;
}
ALWAYS_INLINE void SpinningMutex::Release() {
sync_mutex_unlock(&lock_);
}
#endif
#else // defined(PA_HAS_FAST_MUTEX)
ALWAYS_INLINE bool SpinningMutex::TrySpinLock() {
// Possibly faster than CAS. The theory is that if the cacheline is shared,
// then it can stay shared, for the contended case.
return !lock_.load(std::memory_order_relaxed) &&
!lock_.exchange(true, std::memory_order_acquire);
}
ALWAYS_INLINE void SpinningMutex::ReleaseSpinLock() {
lock_.store(false, std::memory_order_release);
}
#if BUILDFLAG(IS_APPLE)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunguarded-availability"
ALWAYS_INLINE bool SpinningMutex::Try() {
// ARM64 macOS is macOS 11.x at least, guaranteed to have os_unfair_lock().
#if BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
return os_unfair_lock_trylock(&unfair_lock_);
#else
if (LIKELY(os_unfair_lock_trylock))
return os_unfair_lock_trylock(&unfair_lock_);
return TrySpinLock();
#endif // BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
}
ALWAYS_INLINE void SpinningMutex::Release() {
#if BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
return os_unfair_lock_unlock(&unfair_lock_);
#else
// Always testing trylock(), since the definitions are all or nothing.
if (LIKELY(os_unfair_lock_trylock))
return os_unfair_lock_unlock(&unfair_lock_);
return ReleaseSpinLock();
#endif // BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
}
ALWAYS_INLINE void SpinningMutex::LockSlow() {
#if BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
return os_unfair_lock_lock(&unfair_lock_);
#else
if (LIKELY(os_unfair_lock_trylock))
return os_unfair_lock_lock(&unfair_lock_);
return LockSlowSpinLock();
#endif // BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
}
#pragma clang diagnostic pop
#else
ALWAYS_INLINE bool SpinningMutex::Try() {
return TrySpinLock();
}
ALWAYS_INLINE void SpinningMutex::Release() {
return ReleaseSpinLock();
}
ALWAYS_INLINE void SpinningMutex::LockSlow() {
return LockSlowSpinLock();
}
#endif // BUILDFLAG(IS_APPLE)
#endif // defined(PA_HAS_FAST_MUTEX)
} // namespace partition_alloc::internal
#endif // BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_