base/allocator/partition_allocator/spinning_mutex.h - chromium/src - Git at Google

 // Copyright 2020 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_
 #define BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_

 #include <algorithm>
 #include <atomic>

 #include "base/allocator/buildflags.h"
 #include "base/allocator/partition_allocator/partition_alloc_check.h"
 #include "base/allocator/partition_allocator/partition_alloc_config.h"
 #include "base/allocator/partition_allocator/yield_processor.h"
 #include "base/base_export.h"
 #include "base/compiler_specific.h"
 #include "base/thread_annotations.h"
 #include "build/build_config.h"

 #if BUILDFLAG(IS_WIN)
 #include "base/win/windows_types.h"
 #endif

 #if BUILDFLAG(IS_POSIX)
 #include <errno.h>
 #include <pthread.h>
 #endif

 #if BUILDFLAG(IS_APPLE)

 #include <os/lock.h>

 // os_unfair_lock is available starting with OS X 10.12, and Chromium targets
 // 10.11 at the minimum, so the symbols are not always available *at runtime*.
 // But we build with a 11.x SDK, so it's always in the headers.
 //
 // However, since the majority of clients have at least 10.12 (released late
 // 2016), we declare the symbols here, marking them weak. They will be nullptr
 // on 10.11, and defined on more recent versions.

 // Silence the compiler warning, here and below.
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunguarded-availability"

 #define PA_WEAK __attribute__((weak))

 extern "C" {

 PA_WEAK void os_unfair_lock_lock(os_unfair_lock_t lock);
 PA_WEAK bool os_unfair_lock_trylock(os_unfair_lock_t lock);
 PA_WEAK void os_unfair_lock_unlock(os_unfair_lock_t lock);

 }  // extern "C"

 #pragma clang diagnostic pop

 #endif  // BUILDFLAG(IS_APPLE)

 #if BUILDFLAG(IS_FUCHSIA)
 #include <lib/sync/mutex.h>
 #endif

 namespace partition_alloc::internal {

 // The behavior of this class depends on whether PA_HAS_FAST_MUTEX is defined.
 // 1. When it is defined:
 //
 // Simple spinning lock. It will spin in user space a set number of times before
 // going into the kernel to sleep.
 //
 // This is intended to give "the best of both worlds" between a SpinLock and
 // base::Lock:
 // - SpinLock: Inlined fast path, no external function calls, just
 //   compare-and-swap. Short waits do not go into the kernel. Good behavior in
 //   low contention cases.
 // - base::Lock: Good behavior in case of contention.
 //
 // We don't rely on base::Lock which we could make spin (by calling Try() in a
 // loop), as performance is below a custom spinlock as seen on high-level
 // benchmarks. Instead this implements a simple non-recursive mutex on top of
 // the futex() syscall on Linux, and SRWLock on Windows. The main difference
 // between this and a libc implementation is that it only supports the simplest
 // path: private (to a process), non-recursive mutexes with no priority
 // inheritance, no timed waits.
 //
 // As an interesting side-effect to be used in the allocator, this code does not
 // make any allocations, locks are small with a constexpr constructor and no
 // destructor.
 //
 // 2. Otherwise: This is a simple SpinLock, in the sense that it does not have
 // any awareness of other threads' behavior. One exception: x86 macOS uses
 // os_unfair_lock() if available, which is the case for macOS >= 10.12, that is
 // most clients.
 class LOCKABLE BASE_EXPORT SpinningMutex {
  public:
   inline constexpr SpinningMutex();
   ALWAYS_INLINE void Acquire() EXCLUSIVE_LOCK_FUNCTION();
   ALWAYS_INLINE void Release() UNLOCK_FUNCTION();
   ALWAYS_INLINE bool Try() EXCLUSIVE_TRYLOCK_FUNCTION(true);
   void AssertAcquired() const {}  // Not supported.
   void Reinit() UNLOCK_FUNCTION();

  private:
   void LockSlow() EXCLUSIVE_LOCK_FUNCTION();

   // See below, the latency of PA_YIELD_PROCESSOR can be as high as ~150
   // cycles. Meanwhile, sleeping costs a few us. Spinning 64 times at 3GHz would
   // cost 150 * 64 / 3e9 ~= 3.2us.
   //
   // This applies to Linux kernels, on x86_64. On ARM we might want to spin
   // more.
   static constexpr int kSpinCount = 64;

 #if defined(PA_HAS_FAST_MUTEX)

 #if defined(PA_HAS_LINUX_KERNEL)
   void FutexWait();
   void FutexWake();

   static constexpr int kUnlocked = 0;
   static constexpr int kLockedUncontended = 1;
   static constexpr int kLockedContended = 2;

   std::atomic<int32_t> state_{kUnlocked};
 #elif BUILDFLAG(IS_WIN)
   CHROME_SRWLOCK lock_ = SRWLOCK_INIT;
 #elif BUILDFLAG(IS_POSIX)
   pthread_mutex_t lock_ = PTHREAD_MUTEX_INITIALIZER;
 #elif BUILDFLAG(IS_FUCHSIA)
   sync_mutex lock_;
 #endif

 #else  // defined(PA_HAS_FAST_MUTEX)
   std::atomic<bool> lock_{false};

 #if BUILDFLAG(IS_APPLE) && !defined(PA_NO_OS_UNFAIR_LOCK_CRBUG_1267256)

 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunguarded-availability"
   os_unfair_lock unfair_lock_ = OS_UNFAIR_LOCK_INIT;
 #pragma clang diagnostic pop

 #endif  // BUILDFLAG(IS_APPLE) && !defined(PA_NO_OS_UNFAIR_LOCK_CRBUG_1267256)

   // Spinlock-like, fallback.
   ALWAYS_INLINE bool TrySpinLock();
   ALWAYS_INLINE void ReleaseSpinLock();
   void LockSlowSpinLock();

 #endif
 };

 ALWAYS_INLINE void SpinningMutex::Acquire() {
   int tries = 0;
   int backoff = 1;
   // Busy-waiting is inlined, which is fine as long as we have few callers. This
   // is only used for the partition lock, so this is the case.
   do {
     if (LIKELY(Try()))
       return;
     // Note: Per the intel optimization manual
     // (https://software.intel.com/content/dam/develop/public/us/en/documents/64-ia-32-architectures-optimization-manual.pdf),
     // the "pause" instruction is more costly on Skylake Client than on previous
     // architectures. The latency is found to be 141 cycles
     // there (from ~10 on previous ones, nice 14x).
     //
     // According to Agner Fog's instruction tables, the latency is still >100
     // cycles on Ice Lake, and from other sources, seems to be high as well on
     // Adler Lake. Separately, it is (from
     // https://agner.org/optimize/instruction_tables.pdf) also high on AMD Zen 3
     // (~65). So just assume that it's this way for most x86_64 architectures.
     //
     // Also, loop several times here, following the guidelines in section 2.3.4
     // of the manual, "Pause latency in Skylake Client Microarchitecture".
     for (int yields = 0; yields < backoff; yields++) {
       PA_YIELD_PROCESSOR;
       tries++;
     }
     constexpr int kMaxBackoff = 16;
     backoff = std::min(kMaxBackoff, backoff << 1);
   } while (tries < kSpinCount);

   LockSlow();
 }

 inline constexpr SpinningMutex::SpinningMutex() = default;

 #if defined(PA_HAS_FAST_MUTEX)

 #if defined(PA_HAS_LINUX_KERNEL)

 ALWAYS_INLINE bool SpinningMutex::Try() {
   // Using the weak variant of compare_exchange(), which may fail spuriously. On
   // some architectures such as ARM, CAS is typically performed as a LDREX/STREX
   // pair, where the store may fail. In the strong version, there is a loop
   // inserted by the compiler to retry in these cases.
   //
   // Since we are retrying in Lock() anyway, there is no point having two nested
   // loops.
   int expected = kUnlocked;
   return (state_.load(std::memory_order_relaxed) == expected) &&
          state_.compare_exchange_weak(expected, kLockedUncontended,
                                       std::memory_order_acquire,
                                       std::memory_order_relaxed);
 }

 ALWAYS_INLINE void SpinningMutex::Release() {
   if (UNLIKELY(state_.exchange(kUnlocked, std::memory_order_release) ==
                kLockedContended)) {
     // |kLockedContended|: there is a waiter to wake up.
     //
     // Here there is a window where the lock is unlocked, since we just set it
     // to |kUnlocked| above. Meaning that another thread can grab the lock
     // in-between now and |FutexWake()| waking up a waiter. Aside from
     // potentially fairness, this is not an issue, as the newly-awaken thread
     // will check that the lock is still free.
     //
     // There is a small pessimization here though: if we have a single waiter,
     // then when it wakes up, the lock will be set to |kLockedContended|, so
     // when this waiter releases the lock, it will needlessly call
     // |FutexWake()|, even though there are no waiters. This is supported by the
     // kernel, and is what bionic (Android's libc) also does.
     FutexWake();
   }
 }

 #elif BUILDFLAG(IS_WIN)

 ALWAYS_INLINE bool SpinningMutex::Try() {
   return !!::TryAcquireSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
 }

 ALWAYS_INLINE void SpinningMutex::Release() {
   ::ReleaseSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
 }

 #elif BUILDFLAG(IS_POSIX)

 ALWAYS_INLINE bool SpinningMutex::Try() {
   int retval = pthread_mutex_trylock(&lock_);
   PA_DCHECK(retval == 0 || retval == EBUSY);
   return retval == 0;
 }

 ALWAYS_INLINE void SpinningMutex::Release() {
   int retval = pthread_mutex_unlock(&lock_);
   PA_DCHECK(retval == 0);
 }

 #elif BUILDFLAG(IS_FUCHSIA)

 ALWAYS_INLINE bool SpinningMutex::Try() {
   return sync_mutex_trylock(&lock_) == ZX_OK;
 }

 ALWAYS_INLINE void SpinningMutex::Release() {
   sync_mutex_unlock(&lock_);
 }

 #endif

 #else  // defined(PA_HAS_FAST_MUTEX)

 ALWAYS_INLINE bool SpinningMutex::TrySpinLock() {
   // Possibly faster than CAS. The theory is that if the cacheline is shared,
   // then it can stay shared, for the contended case.
   return !lock_.load(std::memory_order_relaxed) &&
          !lock_.exchange(true, std::memory_order_acquire);
 }

 ALWAYS_INLINE void SpinningMutex::ReleaseSpinLock() {
   lock_.store(false, std::memory_order_release);
 }

 #if BUILDFLAG(IS_APPLE)

 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wunguarded-availability"

 ALWAYS_INLINE bool SpinningMutex::Try() {
   // ARM64 macOS is macOS 11.x at least, guaranteed to have os_unfair_lock().
 #if BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
   return os_unfair_lock_trylock(&unfair_lock_);
 #else
   if (LIKELY(os_unfair_lock_trylock))
     return os_unfair_lock_trylock(&unfair_lock_);

   return TrySpinLock();
 #endif  // BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
 }

 ALWAYS_INLINE void SpinningMutex::Release() {
 #if BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
   return os_unfair_lock_unlock(&unfair_lock_);
 #else
   // Always testing trylock(), since the definitions are all or nothing.
   if (LIKELY(os_unfair_lock_trylock))
     return os_unfair_lock_unlock(&unfair_lock_);

   return ReleaseSpinLock();
 #endif  // BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
 }

 ALWAYS_INLINE void SpinningMutex::LockSlow() {
 #if BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
   return os_unfair_lock_lock(&unfair_lock_);
 #else
   if (LIKELY(os_unfair_lock_trylock))
     return os_unfair_lock_lock(&unfair_lock_);

   return LockSlowSpinLock();
 #endif  // BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
 }

 #pragma clang diagnostic pop

 #else
 ALWAYS_INLINE bool SpinningMutex::Try() {
   return TrySpinLock();
 }

 ALWAYS_INLINE void SpinningMutex::Release() {
   return ReleaseSpinLock();
 }

 ALWAYS_INLINE void SpinningMutex::LockSlow() {
   return LockSlowSpinLock();
 }

 #endif  // BUILDFLAG(IS_APPLE)

 #endif  // defined(PA_HAS_FAST_MUTEX)

 }  // namespace partition_alloc::internal

 #endif  // BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_
	// Copyright 2020 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_
	#define BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_

	#include <algorithm>
	#include <atomic>

	#include "base/allocator/buildflags.h"
	#include "base/allocator/partition_allocator/partition_alloc_check.h"
	#include "base/allocator/partition_allocator/partition_alloc_config.h"
	#include "base/allocator/partition_allocator/yield_processor.h"
	#include "base/base_export.h"
	#include "base/compiler_specific.h"
	#include "base/thread_annotations.h"
	#include "build/build_config.h"

	#if BUILDFLAG(IS_WIN)
	#include "base/win/windows_types.h"
	#endif

	#if BUILDFLAG(IS_POSIX)
	#include <errno.h>
	#include <pthread.h>
	#endif

	#if BUILDFLAG(IS_APPLE)

	#include <os/lock.h>

	// os_unfair_lock is available starting with OS X 10.12, and Chromium targets
	// 10.11 at the minimum, so the symbols are not always available at runtime.
	// But we build with a 11.x SDK, so it's always in the headers.
	//
	// However, since the majority of clients have at least 10.12 (released late
	// 2016), we declare the symbols here, marking them weak. They will be nullptr
	// on 10.11, and defined on more recent versions.

	// Silence the compiler warning, here and below.
	#pragma clang diagnostic push
	#pragma clang diagnostic ignored "-Wunguarded-availability"

	#define PA_WEAK __attribute__((weak))

	extern "C" {

	PA_WEAK void os_unfair_lock_lock(os_unfair_lock_t lock);
	PA_WEAK bool os_unfair_lock_trylock(os_unfair_lock_t lock);
	PA_WEAK void os_unfair_lock_unlock(os_unfair_lock_t lock);

	} // extern "C"

	#pragma clang diagnostic pop

	#endif // BUILDFLAG(IS_APPLE)

	#if BUILDFLAG(IS_FUCHSIA)
	#include <lib/sync/mutex.h>
	#endif

	namespace partition_alloc::internal {

	// The behavior of this class depends on whether PA_HAS_FAST_MUTEX is defined.
	// 1. When it is defined:
	//
	// Simple spinning lock. It will spin in user space a set number of times before
	// going into the kernel to sleep.
	//
	// This is intended to give "the best of both worlds" between a SpinLock and
	// base::Lock:
	// - SpinLock: Inlined fast path, no external function calls, just
	// compare-and-swap. Short waits do not go into the kernel. Good behavior in
	// low contention cases.
	// - base::Lock: Good behavior in case of contention.
	//
	// We don't rely on base::Lock which we could make spin (by calling Try() in a
	// loop), as performance is below a custom spinlock as seen on high-level
	// benchmarks. Instead this implements a simple non-recursive mutex on top of
	// the futex() syscall on Linux, and SRWLock on Windows. The main difference
	// between this and a libc implementation is that it only supports the simplest
	// path: private (to a process), non-recursive mutexes with no priority
	// inheritance, no timed waits.
	//
	// As an interesting side-effect to be used in the allocator, this code does not
	// make any allocations, locks are small with a constexpr constructor and no
	// destructor.
	//
	// 2. Otherwise: This is a simple SpinLock, in the sense that it does not have
	// any awareness of other threads' behavior. One exception: x86 macOS uses
	// os_unfair_lock() if available, which is the case for macOS >= 10.12, that is
	// most clients.
	class LOCKABLE BASE_EXPORT SpinningMutex {
	public:
	inline constexpr SpinningMutex();
	ALWAYS_INLINE void Acquire() EXCLUSIVE_LOCK_FUNCTION();
	ALWAYS_INLINE void Release() UNLOCK_FUNCTION();
	ALWAYS_INLINE bool Try() EXCLUSIVE_TRYLOCK_FUNCTION(true);
	void AssertAcquired() const {} // Not supported.
	void Reinit() UNLOCK_FUNCTION();

	private:
	void LockSlow() EXCLUSIVE_LOCK_FUNCTION();

	// See below, the latency of PA_YIELD_PROCESSOR can be as high as ~150
	// cycles. Meanwhile, sleeping costs a few us. Spinning 64 times at 3GHz would
	// cost 150 * 64 / 3e9 ~= 3.2us.
	//
	// This applies to Linux kernels, on x86_64. On ARM we might want to spin
	// more.
	static constexpr int kSpinCount = 64;

	#if defined(PA_HAS_FAST_MUTEX)

	#if defined(PA_HAS_LINUX_KERNEL)
	void FutexWait();
	void FutexWake();

	static constexpr int kUnlocked = 0;
	static constexpr int kLockedUncontended = 1;
	static constexpr int kLockedContended = 2;

	std::atomic<int32_t> state_{kUnlocked};
	#elif BUILDFLAG(IS_WIN)
	CHROME_SRWLOCK lock_ = SRWLOCK_INIT;
	#elif BUILDFLAG(IS_POSIX)
	pthread_mutex_t lock_ = PTHREAD_MUTEX_INITIALIZER;
	#elif BUILDFLAG(IS_FUCHSIA)
	sync_mutex lock_;
	#endif

	#else // defined(PA_HAS_FAST_MUTEX)
	std::atomic<bool> lock_{false};

	#if BUILDFLAG(IS_APPLE) && !defined(PA_NO_OS_UNFAIR_LOCK_CRBUG_1267256)

	#pragma clang diagnostic push
	#pragma clang diagnostic ignored "-Wunguarded-availability"
	os_unfair_lock unfair_lock_ = OS_UNFAIR_LOCK_INIT;
	#pragma clang diagnostic pop

	#endif // BUILDFLAG(IS_APPLE) && !defined(PA_NO_OS_UNFAIR_LOCK_CRBUG_1267256)

	// Spinlock-like, fallback.
	ALWAYS_INLINE bool TrySpinLock();
	ALWAYS_INLINE void ReleaseSpinLock();
	void LockSlowSpinLock();

	#endif
	};

	ALWAYS_INLINE void SpinningMutex::Acquire() {
	int tries = 0;
	int backoff = 1;
	// Busy-waiting is inlined, which is fine as long as we have few callers. This
	// is only used for the partition lock, so this is the case.
	do {
	if (LIKELY(Try()))
	return;
	// Note: Per the intel optimization manual
	// (https://software.intel.com/content/dam/develop/public/us/en/documents/64-ia-32-architectures-optimization-manual.pdf),
	// the "pause" instruction is more costly on Skylake Client than on previous
	// architectures. The latency is found to be 141 cycles
	// there (from ~10 on previous ones, nice 14x).
	//
	// According to Agner Fog's instruction tables, the latency is still >100
	// cycles on Ice Lake, and from other sources, seems to be high as well on
	// Adler Lake. Separately, it is (from
	// https://agner.org/optimize/instruction_tables.pdf) also high on AMD Zen 3
	// (~65). So just assume that it's this way for most x86_64 architectures.
	//
	// Also, loop several times here, following the guidelines in section 2.3.4
	// of the manual, "Pause latency in Skylake Client Microarchitecture".
	for (int yields = 0; yields < backoff; yields++) {
	PA_YIELD_PROCESSOR;
	tries++;
	}
	constexpr int kMaxBackoff = 16;
	backoff = std::min(kMaxBackoff, backoff << 1);
	} while (tries < kSpinCount);

	LockSlow();
	}

	inline constexpr SpinningMutex::SpinningMutex() = default;

	#if defined(PA_HAS_FAST_MUTEX)

	#if defined(PA_HAS_LINUX_KERNEL)

	ALWAYS_INLINE bool SpinningMutex::Try() {
	// Using the weak variant of compare_exchange(), which may fail spuriously. On
	// some architectures such as ARM, CAS is typically performed as a LDREX/STREX
	// pair, where the store may fail. In the strong version, there is a loop
	// inserted by the compiler to retry in these cases.
	//
	// Since we are retrying in Lock() anyway, there is no point having two nested
	// loops.
	int expected = kUnlocked;
	return (state_.load(std::memory_order_relaxed) == expected) &&
	state_.compare_exchange_weak(expected, kLockedUncontended,
	std::memory_order_acquire,
	std::memory_order_relaxed);
	}

	ALWAYS_INLINE void SpinningMutex::Release() {
	if (UNLIKELY(state_.exchange(kUnlocked, std::memory_order_release) ==
	kLockedContended)) {
	// \|kLockedContended\|: there is a waiter to wake up.
	//
	// Here there is a window where the lock is unlocked, since we just set it
	// to \|kUnlocked\| above. Meaning that another thread can grab the lock
	// in-between now and \|FutexWake()\| waking up a waiter. Aside from
	// potentially fairness, this is not an issue, as the newly-awaken thread
	// will check that the lock is still free.
	//
	// There is a small pessimization here though: if we have a single waiter,
	// then when it wakes up, the lock will be set to \|kLockedContended\|, so
	// when this waiter releases the lock, it will needlessly call
	// \|FutexWake()\|, even though there are no waiters. This is supported by the
	// kernel, and is what bionic (Android's libc) also does.
	FutexWake();
	}
	}

	#elif BUILDFLAG(IS_WIN)

	ALWAYS_INLINE bool SpinningMutex::Try() {
	return !!::TryAcquireSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
	}

	ALWAYS_INLINE void SpinningMutex::Release() {
	::ReleaseSRWLockExclusive(reinterpret_cast<PSRWLOCK>(&lock_));
	}

	#elif BUILDFLAG(IS_POSIX)

	ALWAYS_INLINE bool SpinningMutex::Try() {
	int retval = pthread_mutex_trylock(&lock_);
	PA_DCHECK(retval == 0 \|\| retval == EBUSY);
	return retval == 0;
	}

	ALWAYS_INLINE void SpinningMutex::Release() {
	int retval = pthread_mutex_unlock(&lock_);
	PA_DCHECK(retval == 0);
	}

	#elif BUILDFLAG(IS_FUCHSIA)

	ALWAYS_INLINE bool SpinningMutex::Try() {
	return sync_mutex_trylock(&lock_) == ZX_OK;
	}

	ALWAYS_INLINE void SpinningMutex::Release() {
	sync_mutex_unlock(&lock_);
	}

	#endif

	#else // defined(PA_HAS_FAST_MUTEX)

	ALWAYS_INLINE bool SpinningMutex::TrySpinLock() {
	// Possibly faster than CAS. The theory is that if the cacheline is shared,
	// then it can stay shared, for the contended case.
	return !lock_.load(std::memory_order_relaxed) &&
	!lock_.exchange(true, std::memory_order_acquire);
	}

	ALWAYS_INLINE void SpinningMutex::ReleaseSpinLock() {
	lock_.store(false, std::memory_order_release);
	}

	#if BUILDFLAG(IS_APPLE)

	#pragma clang diagnostic push
	#pragma clang diagnostic ignored "-Wunguarded-availability"

	ALWAYS_INLINE bool SpinningMutex::Try() {
	// ARM64 macOS is macOS 11.x at least, guaranteed to have os_unfair_lock().
	#if BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
	return os_unfair_lock_trylock(&unfair_lock_);
	#else
	if (LIKELY(os_unfair_lock_trylock))
	return os_unfair_lock_trylock(&unfair_lock_);

	return TrySpinLock();
	#endif // BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
	}

	ALWAYS_INLINE void SpinningMutex::Release() {
	#if BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
	return os_unfair_lock_unlock(&unfair_lock_);
	#else
	// Always testing trylock(), since the definitions are all or nothing.
	if (LIKELY(os_unfair_lock_trylock))
	return os_unfair_lock_unlock(&unfair_lock_);

	return ReleaseSpinLock();
	#endif // BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
	}

	ALWAYS_INLINE void SpinningMutex::LockSlow() {
	#if BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
	return os_unfair_lock_lock(&unfair_lock_);
	#else
	if (LIKELY(os_unfair_lock_trylock))
	return os_unfair_lock_lock(&unfair_lock_);

	return LockSlowSpinLock();
	#endif // BUILDFLAG(IS_MAC) && defined(ARCH_CPU_ARM64)
	}

	#pragma clang diagnostic pop

	#else
	ALWAYS_INLINE bool SpinningMutex::Try() {
	return TrySpinLock();
	}

	ALWAYS_INLINE void SpinningMutex::Release() {
	return ReleaseSpinLock();
	}

	ALWAYS_INLINE void SpinningMutex::LockSlow() {
	return LockSlowSpinLock();
	}

	#endif // BUILDFLAG(IS_APPLE)

	#endif // defined(PA_HAS_FAST_MUTEX)

	} // namespace partition_alloc::internal

	#endif // BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_MUTEX_H_