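Support ARM64EC ABI

When MSVC targets ARM64EC, _M_X64 is defined together with _M_ARM64EC, so the x86-64 code paths have to be excluded explicitly. This patch skips <xmmintrin.h> on ARM64EC, tests the ARM branches before the x86 ones, and makes pthreadpool_yield() and the FPU-state helpers (get_fpu_state, set_fpu_state, disable_fpu_denormals) take the ARM64 path (__yield(), FPCR) on ARM64EC.

NOTE(review): the relocated MSVC atomics branch in threadpool-atomics.h still tests only defined(_M_ARM64), so ARM64EC appears to fall through to the _M_X64 atomics, which call _mm_lfence()/_mm_sfence() - confirm whether "|| defined(_M_ARM64EC)" was intended there as well.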
diff --git a/src/threadpool-atomics.h b/src/threadpool-atomics.h index 44772e2..eaa0707 100644 --- a/src/threadpool-atomics.h +++ b/src/threadpool-atomics.h
@@ -5,7 +5,7 @@ #include <stdint.h> /* SSE-specific headers */ -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) +#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) && !defined(_M_ARM64EC) #include <xmmintrin.h> #endif @@ -369,365 +369,6 @@ static inline void pthreadpool_fence_release() { __sync_synchronize(); } -#elif defined(_MSC_VER) && defined(_M_X64) - typedef volatile uint32_t pthreadpool_atomic_uint32_t; - typedef volatile size_t pthreadpool_atomic_size_t; - typedef void *volatile pthreadpool_atomic_void_p; - - static inline uint32_t pthreadpool_load_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return *address; - } - - static inline size_t pthreadpool_load_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return *address; - } - - static inline void* pthreadpool_load_relaxed_void_p( - pthreadpool_atomic_void_p* address) - { - return *address; - } - - static inline uint32_t pthreadpool_load_acquire_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - /* x86-64 loads always have acquire semantics; use only a compiler barrier */ - const uint32_t value = *address; - _ReadBarrier(); - return value; - } - - static inline size_t pthreadpool_load_acquire_size_t( - pthreadpool_atomic_size_t* address) - { - /* x86-64 loads always have acquire semantics; use only a compiler barrier */ - const size_t value = *address; - _ReadBarrier(); - return value; - } - - static inline void pthreadpool_store_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - *address = value; - } - - static inline void pthreadpool_store_relaxed_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - *address = value; - } - - static inline void pthreadpool_store_relaxed_void_p( - pthreadpool_atomic_void_p* address, - void* value) - { - *address = value; - } - - static inline void pthreadpool_store_release_uint32_t( - 
pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - /* x86-64 stores always have release semantics; use only a compiler barrier */ - _WriteBarrier(); - *address = value; - } - - static inline void pthreadpool_store_release_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - /* x86-64 stores always have release semantics; use only a compiler barrier */ - _WriteBarrier(); - *address = value; - } - - static inline size_t pthreadpool_decrement_fetch_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement64((volatile __int64*) address); - } - - static inline size_t pthreadpool_decrement_fetch_release_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement64((volatile __int64*) address); - } - - static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement64((volatile __int64*) address); - } - - static inline bool pthreadpool_try_decrement_relaxed_size_t( - pthreadpool_atomic_size_t* value) - { - size_t actual_value = *value; - while (actual_value != 0) { - const size_t new_value = actual_value - 1; - const size_t expected_value = actual_value; - actual_value = _InterlockedCompareExchange64( - (volatile __int64*) value, (__int64) new_value, (__int64) expected_value); - if (actual_value == expected_value) { - return true; - } - } - return false; - } - - static inline void pthreadpool_fence_acquire() { - _mm_lfence(); - _ReadBarrier(); - } - - static inline void pthreadpool_fence_release() { - _WriteBarrier(); - _mm_sfence(); - } -#elif defined(_MSC_VER) && defined(_M_IX86) - typedef volatile uint32_t pthreadpool_atomic_uint32_t; - typedef volatile size_t pthreadpool_atomic_size_t; - typedef void *volatile pthreadpool_atomic_void_p; - - static inline uint32_t pthreadpool_load_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return *address; - } - - static inline 
size_t pthreadpool_load_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return *address; - } - - static inline void* pthreadpool_load_relaxed_void_p( - pthreadpool_atomic_void_p* address) - { - return *address; - } - - static inline uint32_t pthreadpool_load_acquire_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - /* x86 loads always have acquire semantics; use only a compiler barrier */ - const uint32_t value = *address; - _ReadBarrier(); - return value; - } - - static inline size_t pthreadpool_load_acquire_size_t( - pthreadpool_atomic_size_t* address) - { - /* x86 loads always have acquire semantics; use only a compiler barrier */ - const size_t value = *address; - _ReadBarrier(); - return value; - } - - static inline void pthreadpool_store_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - *address = value; - } - - static inline void pthreadpool_store_relaxed_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - *address = value; - } - - static inline void pthreadpool_store_relaxed_void_p( - pthreadpool_atomic_void_p* address, - void* value) - { - *address = value; - } - - static inline void pthreadpool_store_release_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - /* x86 stores always have release semantics; use only a compiler barrier */ - _WriteBarrier(); - *address = value; - } - - static inline void pthreadpool_store_release_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - /* x86 stores always have release semantics; use only a compiler barrier */ - _WriteBarrier(); - *address = value; - } - - static inline size_t pthreadpool_decrement_fetch_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement((volatile long*) address); - } - - static inline size_t pthreadpool_decrement_fetch_release_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement((volatile long*) address); - } - - 
static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement((volatile long*) address); - } - - static inline bool pthreadpool_try_decrement_relaxed_size_t( - pthreadpool_atomic_size_t* value) - { - size_t actual_value = *value; - while (actual_value != 0) { - const size_t new_value = actual_value - 1; - const size_t expected_value = actual_value; - actual_value = _InterlockedCompareExchange( - (volatile long*) value, (long) new_value, (long) expected_value); - if (actual_value == expected_value) { - return true; - } - } - return false; - } - - static inline void pthreadpool_fence_acquire() { - _mm_lfence(); - } - - static inline void pthreadpool_fence_release() { - _mm_sfence(); - } -#elif defined(_MSC_VER) && defined(_M_ARM64) - typedef volatile uint32_t pthreadpool_atomic_uint32_t; - typedef volatile size_t pthreadpool_atomic_size_t; - typedef void *volatile pthreadpool_atomic_void_p; - - static inline uint32_t pthreadpool_load_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return (uint32_t) __iso_volatile_load32((const volatile __int32*) address); - } - - static inline size_t pthreadpool_load_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) __iso_volatile_load64((const volatile __int64*) address); - } - - static inline void* pthreadpool_load_relaxed_void_p( - pthreadpool_atomic_void_p* address) - { - return (void*) __iso_volatile_load64((const volatile __int64*) address); - } - - static inline uint32_t pthreadpool_load_acquire_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return (uint32_t) __ldar32((volatile unsigned __int32*) address); - } - - static inline size_t pthreadpool_load_acquire_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) __ldar64((volatile unsigned __int64*) address); - } - - static inline void pthreadpool_store_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t 
value) - { - __iso_volatile_store32((volatile __int32*) address, (__int32) value); - } - - static inline void pthreadpool_store_relaxed_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - __iso_volatile_store64((volatile __int64*) address, (__int64) value); - } - - static inline void pthreadpool_store_relaxed_void_p( - pthreadpool_atomic_void_p* address, - void* value) - { - __iso_volatile_store64((volatile __int64*) address, (__int64) value); - } - - static inline void pthreadpool_store_release_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - _WriteBarrier(); - __stlr32((unsigned __int32 volatile*) address, (unsigned __int32) value); - } - - static inline void pthreadpool_store_release_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - _WriteBarrier(); - __stlr64((unsigned __int64 volatile*) address, (unsigned __int64) value); - } - - static inline size_t pthreadpool_decrement_fetch_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement64_nf((volatile __int64*) address); - } - - static inline size_t pthreadpool_decrement_fetch_release_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement64_rel((volatile __int64*) address); - } - - static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement64((volatile __int64*) address); - } - - static inline bool pthreadpool_try_decrement_relaxed_size_t( - pthreadpool_atomic_size_t* value) - { - size_t actual_value = (size_t) __iso_volatile_load64((const volatile __int64*) value); - while (actual_value != 0) { - const size_t new_value = actual_value - 1; - const size_t expected_value = actual_value; - actual_value = _InterlockedCompareExchange64_nf( - (volatile __int64*) value, (__int64) new_value, (__int64) expected_value); - if (actual_value == expected_value) { - return true; - } - } - return false; 
- } - - static inline void pthreadpool_fence_acquire() { - __dmb(_ARM64_BARRIER_ISHLD); - _ReadBarrier(); - } - - static inline void pthreadpool_fence_release() { - _WriteBarrier(); - __dmb(_ARM64_BARRIER_ISH); - } #elif defined(_MSC_VER) && defined(_M_ARM) typedef volatile uint32_t pthreadpool_atomic_uint32_t; typedef volatile size_t pthreadpool_atomic_size_t; @@ -851,15 +492,370 @@ _WriteBarrier(); __dmb(_ARM_BARRIER_ISH); } +#elif defined(_MSC_VER) && defined(_M_ARM64) + typedef volatile uint32_t pthreadpool_atomic_uint32_t; + typedef volatile size_t pthreadpool_atomic_size_t; + typedef void *volatile pthreadpool_atomic_void_p; + + static inline uint32_t pthreadpool_load_relaxed_uint32_t( + pthreadpool_atomic_uint32_t* address) + { + return (uint32_t) __iso_volatile_load32((const volatile __int32*) address); + } + + static inline size_t pthreadpool_load_relaxed_size_t( + pthreadpool_atomic_size_t* address) + { + return (size_t) __iso_volatile_load64((const volatile __int64*) address); + } + + static inline void* pthreadpool_load_relaxed_void_p( + pthreadpool_atomic_void_p* address) + { + return (void*) __iso_volatile_load64((const volatile __int64*) address); + } + + static inline uint32_t pthreadpool_load_acquire_uint32_t( + pthreadpool_atomic_uint32_t* address) + { + return (uint32_t) __ldar32((volatile unsigned __int32*) address); + } + + static inline size_t pthreadpool_load_acquire_size_t( + pthreadpool_atomic_size_t* address) + { + return (size_t) __ldar64((volatile unsigned __int64*) address); + } + + static inline void pthreadpool_store_relaxed_uint32_t( + pthreadpool_atomic_uint32_t* address, + uint32_t value) + { + __iso_volatile_store32((volatile __int32*) address, (__int32) value); + } + + static inline void pthreadpool_store_relaxed_size_t( + pthreadpool_atomic_size_t* address, + size_t value) + { + __iso_volatile_store64((volatile __int64*) address, (__int64) value); + } + + static inline void pthreadpool_store_relaxed_void_p( + 
pthreadpool_atomic_void_p* address, + void* value) + { + __iso_volatile_store64((volatile __int64*) address, (__int64) value); + } + + static inline void pthreadpool_store_release_uint32_t( + pthreadpool_atomic_uint32_t* address, + uint32_t value) + { + _WriteBarrier(); + __stlr32((unsigned __int32 volatile*) address, (unsigned __int32) value); + } + + static inline void pthreadpool_store_release_size_t( + pthreadpool_atomic_size_t* address, + size_t value) + { + _WriteBarrier(); + __stlr64((unsigned __int64 volatile*) address, (unsigned __int64) value); + } + + static inline size_t pthreadpool_decrement_fetch_relaxed_size_t( + pthreadpool_atomic_size_t* address) + { + return (size_t) _InterlockedDecrement64_nf((volatile __int64*) address); + } + + static inline size_t pthreadpool_decrement_fetch_release_size_t( + pthreadpool_atomic_size_t* address) + { + return (size_t) _InterlockedDecrement64_rel((volatile __int64*) address); + } + + static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t( + pthreadpool_atomic_size_t* address) + { + return (size_t) _InterlockedDecrement64((volatile __int64*) address); + } + + static inline bool pthreadpool_try_decrement_relaxed_size_t( + pthreadpool_atomic_size_t* value) + { + size_t actual_value = (size_t) __iso_volatile_load64((const volatile __int64*) value); + while (actual_value != 0) { + const size_t new_value = actual_value - 1; + const size_t expected_value = actual_value; + actual_value = _InterlockedCompareExchange64_nf( + (volatile __int64*) value, (__int64) new_value, (__int64) expected_value); + if (actual_value == expected_value) { + return true; + } + } + return false; + } + + static inline void pthreadpool_fence_acquire() { + __dmb(_ARM64_BARRIER_ISHLD); + _ReadBarrier(); + } + + static inline void pthreadpool_fence_release() { + _WriteBarrier(); + __dmb(_ARM64_BARRIER_ISH); + } +#elif defined(_MSC_VER) && defined(_M_IX86) + typedef volatile uint32_t pthreadpool_atomic_uint32_t; + typedef volatile 
size_t pthreadpool_atomic_size_t; + typedef void *volatile pthreadpool_atomic_void_p; + + static inline uint32_t pthreadpool_load_relaxed_uint32_t( + pthreadpool_atomic_uint32_t* address) + { + return *address; + } + + static inline size_t pthreadpool_load_relaxed_size_t( + pthreadpool_atomic_size_t* address) + { + return *address; + } + + static inline void* pthreadpool_load_relaxed_void_p( + pthreadpool_atomic_void_p* address) + { + return *address; + } + + static inline uint32_t pthreadpool_load_acquire_uint32_t( + pthreadpool_atomic_uint32_t* address) + { + /* x86 loads always have acquire semantics; use only a compiler barrier */ + const uint32_t value = *address; + _ReadBarrier(); + return value; + } + + static inline size_t pthreadpool_load_acquire_size_t( + pthreadpool_atomic_size_t* address) + { + /* x86 loads always have acquire semantics; use only a compiler barrier */ + const size_t value = *address; + _ReadBarrier(); + return value; + } + + static inline void pthreadpool_store_relaxed_uint32_t( + pthreadpool_atomic_uint32_t* address, + uint32_t value) + { + *address = value; + } + + static inline void pthreadpool_store_relaxed_size_t( + pthreadpool_atomic_size_t* address, + size_t value) + { + *address = value; + } + + static inline void pthreadpool_store_relaxed_void_p( + pthreadpool_atomic_void_p* address, + void* value) + { + *address = value; + } + + static inline void pthreadpool_store_release_uint32_t( + pthreadpool_atomic_uint32_t* address, + uint32_t value) + { + /* x86 stores always have release semantics; use only a compiler barrier */ + _WriteBarrier(); + *address = value; + } + + static inline void pthreadpool_store_release_size_t( + pthreadpool_atomic_size_t* address, + size_t value) + { + /* x86 stores always have release semantics; use only a compiler barrier */ + _WriteBarrier(); + *address = value; + } + + static inline size_t pthreadpool_decrement_fetch_relaxed_size_t( + pthreadpool_atomic_size_t* address) + { + return (size_t) 
_InterlockedDecrement((volatile long*) address); + } + + static inline size_t pthreadpool_decrement_fetch_release_size_t( + pthreadpool_atomic_size_t* address) + { + return (size_t) _InterlockedDecrement((volatile long*) address); + } + + static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t( + pthreadpool_atomic_size_t* address) + { + return (size_t) _InterlockedDecrement((volatile long*) address); + } + + static inline bool pthreadpool_try_decrement_relaxed_size_t( + pthreadpool_atomic_size_t* value) + { + size_t actual_value = *value; + while (actual_value != 0) { + const size_t new_value = actual_value - 1; + const size_t expected_value = actual_value; + actual_value = _InterlockedCompareExchange( + (volatile long*) value, (long) new_value, (long) expected_value); + if (actual_value == expected_value) { + return true; + } + } + return false; + } + + static inline void pthreadpool_fence_acquire() { + _mm_lfence(); + } + + static inline void pthreadpool_fence_release() { + _mm_sfence(); + } +#elif defined(_MSC_VER) && defined(_M_X64) + typedef volatile uint32_t pthreadpool_atomic_uint32_t; + typedef volatile size_t pthreadpool_atomic_size_t; + typedef void *volatile pthreadpool_atomic_void_p; + + static inline uint32_t pthreadpool_load_relaxed_uint32_t( + pthreadpool_atomic_uint32_t* address) + { + return *address; + } + + static inline size_t pthreadpool_load_relaxed_size_t( + pthreadpool_atomic_size_t* address) + { + return *address; + } + + static inline void* pthreadpool_load_relaxed_void_p( + pthreadpool_atomic_void_p* address) + { + return *address; + } + + static inline uint32_t pthreadpool_load_acquire_uint32_t( + pthreadpool_atomic_uint32_t* address) + { + /* x86-64 loads always have acquire semantics; use only a compiler barrier */ + const uint32_t value = *address; + _ReadBarrier(); + return value; + } + + static inline size_t pthreadpool_load_acquire_size_t( + pthreadpool_atomic_size_t* address) + { + /* x86-64 loads always have 
acquire semantics; use only a compiler barrier */ + const size_t value = *address; + _ReadBarrier(); + return value; + } + + static inline void pthreadpool_store_relaxed_uint32_t( + pthreadpool_atomic_uint32_t* address, + uint32_t value) + { + *address = value; + } + + static inline void pthreadpool_store_relaxed_size_t( + pthreadpool_atomic_size_t* address, + size_t value) + { + *address = value; + } + + static inline void pthreadpool_store_relaxed_void_p( + pthreadpool_atomic_void_p* address, + void* value) + { + *address = value; + } + + static inline void pthreadpool_store_release_uint32_t( + pthreadpool_atomic_uint32_t* address, + uint32_t value) + { + /* x86-64 stores always have release semantics; use only a compiler barrier */ + _WriteBarrier(); + *address = value; + } + + static inline void pthreadpool_store_release_size_t( + pthreadpool_atomic_size_t* address, + size_t value) + { + /* x86-64 stores always have release semantics; use only a compiler barrier */ + _WriteBarrier(); + *address = value; + } + + static inline size_t pthreadpool_decrement_fetch_relaxed_size_t( + pthreadpool_atomic_size_t* address) + { + return (size_t) _InterlockedDecrement64((volatile __int64*) address); + } + + static inline size_t pthreadpool_decrement_fetch_release_size_t( + pthreadpool_atomic_size_t* address) + { + return (size_t) _InterlockedDecrement64((volatile __int64*) address); + } + + static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t( + pthreadpool_atomic_size_t* address) + { + return (size_t) _InterlockedDecrement64((volatile __int64*) address); + } + + static inline bool pthreadpool_try_decrement_relaxed_size_t( + pthreadpool_atomic_size_t* value) + { + size_t actual_value = *value; + while (actual_value != 0) { + const size_t new_value = actual_value - 1; + const size_t expected_value = actual_value; + actual_value = _InterlockedCompareExchange64( + (volatile __int64*) value, (__int64) new_value, (__int64) expected_value); + if (actual_value 
== expected_value) { + return true; + } + } + return false; + } + + static inline void pthreadpool_fence_acquire() { + _mm_lfence(); + _ReadBarrier(); + } + + static inline void pthreadpool_fence_release() { + _WriteBarrier(); + _mm_sfence(); + } #else #error "Platform-specific implementation of threadpool-atomics.h required" #endif -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) - static inline void pthreadpool_yield() { - _mm_pause(); - } -#elif defined(__ARM_ACLE) || defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) +#if defined(__ARM_ACLE) || defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)) static inline void pthreadpool_yield() { __yield(); } @@ -867,6 +863,10 @@ static inline void pthreadpool_yield() { __asm__ __volatile__("yield"); } +#elif defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + static inline void pthreadpool_yield() { + _mm_pause(); + } #else static inline void pthreadpool_yield() { pthreadpool_fence_acquire();
diff --git a/src/threadpool-utils.h b/src/threadpool-utils.h index 91e2445..0b81360 100644 --- a/src/threadpool-utils.h +++ b/src/threadpool-utils.h
@@ -4,7 +4,7 @@ #include <stddef.h> /* SSE-specific headers */ -#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) +#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) && !defined(_M_ARM64EC) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) #include <xmmintrin.h> #endif @@ -15,12 +15,12 @@ struct fpu_state { -#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) - uint32_t mxcsr; -#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) || defined(_MSC_VER) && defined(_M_ARM) +#if defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) || defined(_MSC_VER) && defined(_M_ARM) uint32_t fpscr; -#elif defined(__GNUC__) && defined(__aarch64__) || defined(_MSC_VER) && defined(_M_ARM64) +#elif defined(__GNUC__) && defined(__aarch64__) || defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC)) uint64_t fpcr; +#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) + uint32_t mxcsr; #else char unused; #endif @@ -28,12 +28,12 @@ static inline struct fpu_state get_fpu_state() { struct fpu_state state = { 0 }; -#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) - state.mxcsr = (uint32_t) _mm_getcsr(); -#elif defined(_MSC_VER) && defined(_M_ARM) +#if defined(_MSC_VER) && defined(_M_ARM) state.fpscr = (uint32_t) _MoveFromCoprocessor(10, 7, 1, 0, 0); -#elif defined(_MSC_VER) && defined(_M_ARM64) +#elif defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC)) state.fpcr = (uint64_t) _ReadStatusReg(0x5A20); +#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) + state.mxcsr = (uint32_t) _mm_getcsr(); #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) __asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r" 
(state.fpscr)); #elif defined(__GNUC__) && defined(__aarch64__) @@ -43,27 +43,25 @@ } static inline void set_fpu_state(const struct fpu_state state) { -#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) - _mm_setcsr((unsigned int) state.mxcsr); -#elif defined(_MSC_VER) && defined(_M_ARM) +#if defined(_MSC_VER) && defined(_M_ARM) _MoveToCoprocessor((int) state.fpscr, 10, 7, 1, 0, 0); -#elif defined(_MSC_VER) && defined(_M_ARM64) +#elif defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC)) _WriteStatusReg(0x5A20, (__int64) state.fpcr); #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) __asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [fpscr] "r" (state.fpscr)); #elif defined(__GNUC__) && defined(__aarch64__) __asm__ __volatile__("MSR fpcr, %[fpcr]" : : [fpcr] "r" (state.fpcr)); +#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) + _mm_setcsr((unsigned int) state.mxcsr); #endif } static inline void disable_fpu_denormals() { -#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) - _mm_setcsr(_mm_getcsr() | 0x8040); -#elif defined(_MSC_VER) && defined(_M_ARM) +#if defined(_MSC_VER) && defined(_M_ARM) int fpscr = _MoveFromCoprocessor(10, 7, 1, 0, 0); fpscr |= 0x1000000; _MoveToCoprocessor(fpscr, 10, 7, 1, 0, 0); -#elif defined(_MSC_VER) && defined(_M_ARM64) +#elif defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC)) __int64 fpcr = _ReadStatusReg(0x5A20); fpcr |= 0x1080000; _WriteStatusReg(0x5A20, fpcr); @@ -92,6 +90,8 @@ "ORR %w[fpcr], %w[fpcr], 0x80000\n" "MSR fpcr, %[fpcr]\n" : [fpcr] "=r" (fpcr)); +#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) + _mm_setcsr(_mm_getcsr() | 0x8040); #endif }