blob: 3e7e5b7a68fd8453ced1e8262200076e83e4e0dd [file] [log] [blame]
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/targets.h"
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <atomic>
#include <limits>
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
defined(THREAD_SANITIZER)
#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace
#endif // defined(*_SANITIZER)
#if HWY_ARCH_X86
#include <xmmintrin.h>
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <cpuid.h>
#endif
#endif
namespace hwy {
namespace {
// Returns whether bit number `index` of `reg` is set, where index 0 is the
// least-significant bit. `index` must be in [0, 31].
bool IsBitSet(const uint32_t reg, const int index) {
  const uint32_t shifted = reg >> index;
  return (shifted & 1U) == 1U;
}
#if HWY_ARCH_X86
// Executes the CPUID instruction with eax=level and ecx=count, then writes the
// four result registers to abcd in the order {eax, ebx, ecx, edx}.
// abcd must point to at least four writable uint32_t.
void Cpuid(const uint32_t level, const uint32_t count,
           uint32_t* HWY_RESTRICT abcd) {
#ifdef _MSC_VER
  // The MSVC intrinsic works on signed ints; convert at the boundary.
  int regs[4];
  __cpuidex(regs, static_cast<int>(level), static_cast<int>(count));
  abcd[0] = static_cast<uint32_t>(regs[0]);
  abcd[1] = static_cast<uint32_t>(regs[1]);
  abcd[2] = static_cast<uint32_t>(regs[2]);
  abcd[3] = static_cast<uint32_t>(regs[3]);
#else
  // __cpuid_count (from <cpuid.h>) needs four separate lvalues as outputs.
  uint32_t eax;
  uint32_t ebx;
  uint32_t ecx;
  uint32_t edx;
  __cpuid_count(level, count, eax, ebx, ecx, edx);
  abcd[0] = eax;
  abcd[1] = ebx;
  abcd[2] = ecx;
  abcd[3] = edx;
#endif
}
// Returns the lower 32 bits of extended control register 0.
// Requires CPU support for "OSXSAVE" (see below).
// In this file, SupportedTargets() only calls this after CPUID has reported
// the OSXSAVE bit.
uint32_t ReadXCR0() {
#ifdef _MSC_VER
// MSVC: read register 0 via the _xgetbv intrinsic and truncate to 32 bits.
return static_cast<uint32_t>(_xgetbv(0));
#else
// Outputs of the instruction: eax -> xcr0 (returned), edx -> xcr0_high
// (received only to satisfy the output constraint, then discarded).
uint32_t xcr0, xcr0_high;
// Passed to the instruction in ecx; 0 selects extended control register 0.
const uint32_t index = 0;
// The instruction is emitted as raw opcode bytes rather than a mnemonic.
// NOTE(review): 0F 01 D0 is the XGETBV encoding; presumably spelled as bytes
// so assemblers that lack the mnemonic can still build this - confirm.
asm volatile(".byte 0x0F, 0x01, 0xD0"
: "=a"(xcr0), "=d"(xcr0_high)
: "c"(index));
return xcr0;
#endif
}
#endif // HWY_ARCH_X86
// Cached result of target detection; zero means "not yet computed". Kept at
// namespace scope (not function-local) so that the compiler does not emit
// locking around its initialization.
std::atomic<uint32_t> supported_{0};

// Mocked supported-targets mask for tests; zero means "no mock installed".
// Only written to from a single thread before the test starts.
uint32_t supported_targets_for_test_{0};

// AND-mask applied to detected targets; DisableTargets clears bits here to
// disable targets at runtime. Starts with all bits set (nothing disabled).
uint32_t supported_mask_ = ~uint32_t{0};
#if HWY_ARCH_X86
// One bit per x86 instruction set extension that detection cares about. The
// kGroup* constants list every extension a given target requires.

// SSE family, all required by the SSE4 target.
constexpr uint32_t kSSE = uint32_t{1} << 0;
constexpr uint32_t kSSE2 = uint32_t{1} << 1;
constexpr uint32_t kSSE3 = uint32_t{1} << 2;
constexpr uint32_t kSSSE3 = uint32_t{1} << 3;
constexpr uint32_t kSSE41 = uint32_t{1} << 4;
constexpr uint32_t kSSE42 = uint32_t{1} << 5;
constexpr uint32_t kGroupSSE4 =
    kSSE | kSSE2 | kSSE3 | kSSSE3 | kSSE41 | kSSE42;

// AVX/AVX2 and the extensions normally shipped alongside them.
constexpr uint32_t kAVX = uint32_t{1} << 6;
constexpr uint32_t kAVX2 = uint32_t{1} << 7;
constexpr uint32_t kFMA = uint32_t{1} << 8;
constexpr uint32_t kLZCNT = uint32_t{1} << 9;
constexpr uint32_t kBMI = uint32_t{1} << 10;
constexpr uint32_t kBMI2 = uint32_t{1} << 11;

// BMI/BMI2/FMA are normally assumed to be present whenever AVX2 is, which
// permits using BZHI and (compiler-generated) MULX. VirtualBox does not expose
// them [https://www.virtualbox.org/ticket/15471], so HWY_DISABLE_BMI2_FMA
// drops them from the requirements and AVX2 remains usable there.
#ifdef HWY_DISABLE_BMI2_FMA
constexpr uint32_t kGroupAVX2 = kAVX | kAVX2 | kLZCNT;
#else
constexpr uint32_t kGroupAVX2 = kAVX | kAVX2 | kFMA | kLZCNT | kBMI | kBMI2;
#endif

// AVX-512 subsets, all required by the AVX3 target.
constexpr uint32_t kAVX512F = uint32_t{1} << 12;
constexpr uint32_t kAVX512VL = uint32_t{1} << 13;
constexpr uint32_t kAVX512DQ = uint32_t{1} << 14;
constexpr uint32_t kAVX512BW = uint32_t{1} << 15;
constexpr uint32_t kGroupAVX3 = kAVX512F | kAVX512VL | kAVX512DQ | kAVX512BW;
#endif
} // namespace
// Formats a printf-style message, prints it to stderr together with the given
// source location, then terminates the program with a trap. Never returns.
// Messages that do not fit the 2000-byte buffer are truncated by vsnprintf.
HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...) {
  char message[2000];

  va_list varargs;
  va_start(varargs, format);
  vsnprintf(message, sizeof(message), format, varargs);
  va_end(varargs);

  fprintf(stderr, "Abort at %s:%d: %s\n", file, line, message);

#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
    defined(THREAD_SANITIZER)
  // Sanitizer builds additionally print a stack trace. That call returns
  // normally; the trap below is what actually stops the program, which also
  // gives gdb a place to break.
  __sanitizer_print_stack_trace();
#endif  // defined(*_SANITIZER)

#if HWY_COMPILER_MSVC
  __debugbreak();
#else
  __builtin_trap();
#endif
}
// Disables the given targets at runtime by clearing their bits from the mask
// that SupportedTargets() applies to its result. Bits that belong to
// HWY_ENABLED_BASELINE are ignored, so baseline targets always stay enabled.
void DisableTargets(uint32_t disabled_targets) {
  // Drop baseline bits from the request before inverting it into a keep-mask.
  const uint32_t to_disable = disabled_targets & ~HWY_ENABLED_BASELINE;
  supported_mask_ = ~to_disable;

  // Calling Update() here would initialize the mask, but it would also call
  // SupportedTargets(), which tests use to tell whether any of the highway
  // dynamic dispatch functions were used. Hence only de-initialize.
  chosen_target.DeInit();
}
// Test-only: installs `targets` as the mocked set of supported targets and
// invalidates the cached detection result, so that the next call to
// SupportedTargets() re-evaluates and returns the mocked value (if non-zero).
// Passing 0 removes the mock. Intended to be called from a single thread
// before the test starts.
void SetSupportedTargetsForTest(uint32_t targets) {
  // Write the mocked value BEFORE the release store below: a release store
  // only publishes writes that precede it, so this order ensures a reader
  // whose acquire load observes the reset also observes the new mock.
  supported_targets_for_test_ = targets;
  // Reset the cached supported_ value to 0 to force a re-evaluation in the
  // next call to SupportedTargets().
  supported_.store(0, std::memory_order_release);
  chosen_target.DeInit();
}
// Test-only: reports whether SupportedTargets() has stored a result since the
// cache was last reset, i.e. whether the cached value is non-zero.
bool SupportedTargetsCalledForTest() {
  const uint32_t cached = supported_.load(std::memory_order_acquire);
  return cached != 0;
}
// Returns a bitfield of the targets (HWY_SCALAR, HWY_SSE4, ...) supported by
// the current CPU, ANDed with the mask maintained by DisableTargets().
//
// The detection result is cached in supported_; later calls only re-apply the
// mask. If a mock was installed via SetSupportedTargetsForTest, the mocked
// value is cached and returned instead of querying the CPU. On non-x86
// platforms no detection is done and HWY_ENABLED_BASELINE is reported.
// Prints a warning to stderr if the CPU lacks a target that is part of
// HWY_ENABLED_BASELINE.
uint32_t SupportedTargets() {
  uint32_t bits = supported_.load(std::memory_order_acquire);
  // Already initialized?
  if (HWY_LIKELY(bits != 0)) {
    return bits & supported_mask_;
  }

  // When running tests, this allows to mock the current supported targets.
  if (HWY_UNLIKELY(supported_targets_for_test_ != 0)) {
    // Store the value to signal that this was used.
    supported_.store(supported_targets_for_test_, std::memory_order_release);
    return supported_targets_for_test_ & supported_mask_;
  }

  bits = HWY_SCALAR;

#if HWY_ARCH_X86
  uint32_t flags = 0;
  uint32_t abcd[4];

  Cpuid(0, 0, abcd);
  const uint32_t max_level = abcd[0];

  // Standard feature flags
  Cpuid(1, 0, abcd);
  flags |= IsBitSet(abcd[3], 25) ? kSSE : 0;
  flags |= IsBitSet(abcd[3], 26) ? kSSE2 : 0;
  flags |= IsBitSet(abcd[2], 0) ? kSSE3 : 0;
  flags |= IsBitSet(abcd[2], 9) ? kSSSE3 : 0;
  flags |= IsBitSet(abcd[2], 19) ? kSSE41 : 0;
  flags |= IsBitSet(abcd[2], 20) ? kSSE42 : 0;
  flags |= IsBitSet(abcd[2], 12) ? kFMA : 0;
  flags |= IsBitSet(abcd[2], 28) ? kAVX : 0;
  const bool has_osxsave = IsBitSet(abcd[2], 27);

  // Extended feature flags. Leaf 0x80000000 reports the highest extended leaf;
  // only query 0x80000001 if it exists, because CPUID returns unrelated data
  // for out-of-range leaves.
  Cpuid(0x80000000U, 0, abcd);
  if (abcd[0] >= 0x80000001U) {
    Cpuid(0x80000001U, 0, abcd);
    flags |= IsBitSet(abcd[2], 5) ? kLZCNT : 0;
  }

  // Extended features
  if (max_level >= 7) {
    Cpuid(7, 0, abcd);
    flags |= IsBitSet(abcd[1], 3) ? kBMI : 0;
    flags |= IsBitSet(abcd[1], 5) ? kAVX2 : 0;
    flags |= IsBitSet(abcd[1], 8) ? kBMI2 : 0;

    flags |= IsBitSet(abcd[1], 16) ? kAVX512F : 0;
    flags |= IsBitSet(abcd[1], 17) ? kAVX512DQ : 0;
    flags |= IsBitSet(abcd[1], 30) ? kAVX512BW : 0;
    flags |= IsBitSet(abcd[1], 31) ? kAVX512VL : 0;
  }

  // Verify OS support for XSAVE, without which XMM/YMM registers are not
  // preserved across context switches and are not safe to use.
  if (has_osxsave) {
    const uint32_t xcr0 = ReadXCR0();
    // XMM
    if (!IsBitSet(xcr0, 1)) {
      flags = 0;
    }
    // YMM. ZMM registers extend YMM, so AVX-512 is unusable without it too.
    if (!IsBitSet(xcr0, 2)) {
      flags &= ~(kGroupAVX2 | kGroupAVX3);
    }
    // opmask (bit 5), ZMM_Hi256 (bit 6) and Hi16_ZMM (bit 7) must all be
    // enabled. Bit 4 is MPX state and must not be required here.
    if ((xcr0 & 0xE0) != 0xE0) {
      flags &= ~kGroupAVX3;
    }
  } else {
    // Without OSXSAVE the OS has not enabled any extended state, so AVX and
    // AVX-512 instructions would fault even if CPUID advertises them.
    flags &= ~(kGroupAVX2 | kGroupAVX3);
  }

  // Set target bit(s) if all their group's flags are all set.
  if ((flags & kGroupAVX3) == kGroupAVX3) {
    bits |= HWY_AVX3;
  }
  if ((flags & kGroupAVX2) == kGroupAVX2) {
    bits |= HWY_AVX2;
  }
  if ((flags & kGroupSSE4) == kGroupSSE4) {
    bits |= HWY_SSE4;
  }
#else
  // TODO(janwas): detect for other platforms
  bits = HWY_ENABLED_BASELINE;
#endif  // HWY_ARCH_X86

  if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
    // Both masks are printed in hexadecimal.
    fprintf(stderr, "WARNING: CPU supports %x but software requires %x\n",
            static_cast<unsigned>(bits),
            static_cast<unsigned>(HWY_ENABLED_BASELINE));
  }

  supported_.store(bits, std::memory_order_release);
  return bits & supported_mask_;
}
// Declared in targets.h
// Single definition of the global ChosenTarget instance. Within this file,
// DisableTargets() and SetSupportedTargetsForTest() call DeInit() on it after
// changing which targets are reported as supported.
ChosenTarget chosen_target;
// Recomputes mask_ from the targets currently reported by SupportedTargets().
void ChosenTarget::Update() {
  const uint32_t cpu_targets = hwy::SupportedTargets();
  // HWY_CHOSEN_TARGET_SHIFT moves the supported-target bits to the location
  // expected by the ChosenTarget mask. The SCALAR bit is always added,
  // regardless of whether that target was compiled, because it is also used
  // as the fallback mechanism to the baseline target.
  const uint32_t chosen_mask =
      HWY_CHOSEN_TARGET_SHIFT(cpu_targets) | HWY_CHOSEN_TARGET_MASK_SCALAR;
  mask_.store(chosen_mask);
}
} // namespace hwy