blob: 09ebe5ab02844ce0227d7004f3640f3622760123 [file] [log] [blame]
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This include guard is checked by foreach_target, so avoid the usual _H_
// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
// after/outside this include guard.
#ifndef HWY_HIGHWAY_INCLUDED
#define HWY_HIGHWAY_INCLUDED
// Main header required before using vector types.
#include "hwy/targets.h"
namespace hwy {
// API version (https://semver.org/)
#define HWY_MAJOR 0
#define HWY_MINOR 7
#define HWY_PATCH 0
//------------------------------------------------------------------------------
// Shorthand for descriptors (defined in shared-inl.h) used to select overloads.
// Because Highway functions take descriptor and/or vector arguments, ADL finds
// these functions without requiring users in project::HWY_NAMESPACE to
// qualify Highway functions with hwy::HWY_NAMESPACE. However, ADL rules for
// templates require `using hwy::HWY_NAMESPACE::ShiftLeft;` etc. declarations.
// Full (native-width) vector.
#define HWY_FULL(T) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)>
// Vector of up to MAX_N lanes.
#define HWY_CAPPED(T, MAX_N) \
hwy::HWY_NAMESPACE::Simd<T, HWY_MIN(MAX_N, HWY_LANES(T))>
//------------------------------------------------------------------------------
// Export user functions for static/dynamic dispatch
// Evaluates to 0 inside a translation unit if it is generating anything but the
// static target (the last one if multiple targets are enabled). Used to prevent
// redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
// compile once anyway, so this is 1 unless it is or has been included.
#ifndef HWY_ONCE
#define HWY_ONCE 1
#endif
// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
// defined), and can be used to deduce the return type of Choose*.
#if HWY_STATIC_TARGET == HWY_SCALAR
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC8
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSE4
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
#endif
// Dynamic dispatch declarations.
template <typename RetType, typename... Args>
struct FunctionCache {
public:
typedef RetType(FunctionType)(Args...);
// A template function that when instantiated has the same signature as the
// function being called. This function initializes the global cache of the
// current supported targets mask used for dynamic dispatch and calls the
// appropriate function. Since this mask used for dynamic dispatch is a
// global cache, all the highway exported functions, even those exposed by
// different modules, will be initialized after this function runs for any one
// of those exported functions.
template <FunctionType* const table[]>
static RetType ChooseAndCall(Args... args) {
// If we are running here it means we need to update the chosen target.
chosen_target.Update();
return (table[chosen_target.GetIndex()])(args...);
}
};
// Factory function only used to infer the template parameters RetType and Args
// from a function passed to the factory.
template <typename RetType, typename... Args>
FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
return FunctionCache<RetType, Args...>();
}
// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
// nullptr is that target was not compiled.
#if HWY_TARGETS & HWY_SCALAR
#define HWY_CHOOSE_SCALAR(FUNC_NAME) &N_SCALAR::FUNC_NAME
#else
// When scalar is not present and we try to use scalar because other targets
// were disabled at runtime we fall back to the baseline with
// HWY_STATIC_DISPATCH()
#define HWY_CHOOSE_SCALAR(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#endif
#if HWY_TARGETS & HWY_WASM
#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
#else
#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_RVV
#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
#else
#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_NEON
#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
#else
#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_PPC8
#define HWY_CHOOSE_PCC8(FUNC_NAME) &N_PPC8::FUNC_NAME
#else
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SSE4
#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
#else
#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_AVX2
#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
#else
#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_AVX3
#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
#endif
#define HWY_DISPATCH_TABLE(FUNC_NAME) \
HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
// static array must be defined at the same namespace level as the function
// it is exporting.
// After being exported, it can be called from other parts of the same source
// file using HWY_DYNAMIC_DISTPATCH(), in particular from a function wrapper
// like in the following example:
//
// #include "hwy/highway.h"
// HWY_BEFORE_NAMESPACE();
// namespace skeleton {
// namespace HWY_NAMESPACE {
//
// void MyFunction(int a, char b, const char* c) { ... }
//
// // NOLINTNEXTLINE(google-readability-namespace-comments)
// } // namespace HWY_NAMESPACE
// } // namespace skeleton
// HWY_AFTER_NAMESPACE();
//
// namespace skeleton {
// HWY_EXPORT(MyFunction); // Defines the dispatch table in this scope.
//
// void MyFunction(int a, char b, const char* c) {
// return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
// }
// } // namespace skeleton
//
#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
// Simplified version for IDE or the dynamic dispatch case with only one target.
// This case still uses a table, although of a single element, to provide the
// same compile error conditions as with the dynamic dispatch case when multiple
// targets are being compiled.
#define HWY_EXPORT(FUNC_NAME) \
static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) (*(HWY_DISPATCH_TABLE(FUNC_NAME)[0]))
#else
// Dynamic dispatch case with one entry per dynamic target plus the scalar
// mode and the initialization wrapper.
#define HWY_EXPORT(FUNC_NAME) \
static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) \
const HWY_DISPATCH_TABLE(FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
/* The first entry in the table initializes the global cache and \
* calls the appropriate function. */ \
&decltype(hwy::FunctionCacheFactory(&HWY_STATIC_DISPATCH( \
FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
HWY_CHOOSE_SCALAR(FUNC_NAME), \
}
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
(*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::chosen_target.GetIndex()]))
#endif // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
} // namespace hwy
#endif // HWY_HIGHWAY_INCLUDED
//------------------------------------------------------------------------------
// NOTE: ops/*.h cannot use regular include guards because their definitions
// depend on HWY_TARGET, e.g. enabling AVX3 instructions on 128-bit vectors, so
// we want to include them once per target. However, each *-inl.h includes
// highway.h, so we still need an external per-target include guard.
#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
#ifdef HWY_HIGHWAY_PER_TARGET
#undef HWY_HIGHWAY_PER_TARGET
#else
#define HWY_HIGHWAY_PER_TARGET
#endif
// These define ops inside namespace hwy::HWY_NAMESPACE.
#if HWY_TARGET == HWY_SSE4
#include "hwy/ops/x86_128-inl.h"
#elif HWY_TARGET == HWY_AVX2
#include "hwy/ops/x86_256-inl.h"
#elif HWY_TARGET == HWY_AVX3
#include "hwy/ops/x86_512-inl.h"
#elif HWY_TARGET == HWY_PPC8
#elif HWY_TARGET == HWY_NEON
#include "hwy/ops/arm_neon-inl.h"
#elif HWY_TARGET == HWY_WASM
#include "hwy/ops/wasm_128-inl.h"
#elif HWY_TARGET == HWY_RVV
// TODO(janwas): header
#include "hwy/ops/shared-inl.h"
#elif HWY_TARGET == HWY_SCALAR
#include "hwy/ops/scalar-inl.h"
#else
#pragma message("HWY_TARGET does not match any known target")
#endif // HWY_TARGET
// Commonly used functions/types that must come after ops are defined.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// The lane type of a vector type, e.g. float for Vec<Simd<float, 4>>.
template <class V>
using LaneType = decltype(GetLane(V()));
// Vector type, e.g. Vec128<float> for Simd<float, 4>. Useful as the return type
// of functions that do not take a vector argument, or as an argument type if
// the function only has a template argument for D, or for explicit type names
// instead of auto. This may be a built-in type.
template <class D>
using Vec = decltype(Zero(D()));
// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template argument
// for D, or for explicit type names instead of auto.
template <class D>
using Mask = decltype(MaskFromVec(Zero(D())));
// Returns the closest value to v within [lo, hi].
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
return Min(Max(lo, v), hi);
}
// CombineShiftRightBytes (and ..Lanes) are not available for the scalar target.
// TODO(janwas): implement for RVV
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
template <size_t kLanes, class V>
HWY_API V CombineShiftRightLanes(const V hi, const V lo) {
return CombineShiftRightBytes<kLanes * sizeof(LaneType<V>)>(hi, lo);
}
#endif
// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
using Unsigned = MakeUnsigned<typename D::T>;
const Unsigned bit = Unsigned(1) << (sizeof(Unsigned) * 8 - 1);
return BitCast(d, Set(Rebind<Unsigned, D>(), bit));
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HWY_HIGHWAY_PER_TARGET