// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// 128-bit ARM64 NEON vectors and operations.
// External include guard in highway.h - see comment there.
#include <arm_neon.h>
#include "hwy/ops/shared-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Macros used to define functions with one, two or three vector arguments, for
// multiple lane types and for full and half vectors. These macros are
// undefined at the end of the file.
// HWY_NEON_BUILD_TPL_* is the template<...> prefix of the function.
#define HWY_NEON_BUILD_TPL_1
#define HWY_NEON_BUILD_TPL_2
#define HWY_NEON_BUILD_TPL_3
// HWY_NEON_BUILD_RET_* is the return type of the function.
#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type, size>
#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type, size>
#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type, size>
// HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives.
#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type, size> a
#define HWY_NEON_BUILD_PARAM_2(type, size) \
const Vec128<type, size> a, const Vec128<type, size> b
#define HWY_NEON_BUILD_PARAM_3(type, size) \
const Vec128<type, size> a, const Vec128<type, size> b, \
const Vec128<type, size> c
// HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
// function.
#define HWY_NEON_BUILD_ARG_1 a.raw
#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw
// We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
// the __VA_ARGS__ have been expanded. This allows "func" to itself be a macro,
// as is the case for some of the library "functions" such as vshlq_u8. For
// example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS), where MY_PARAMS is defined as
// "a, b" (without the quotes), expands to "vshlq_u8(a, b)". Directly writing
// vshlq_u8(MY_PARAMS) would fail because the vshlq_u8() macro expects two
// arguments.
#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)
// Main macro that defines a single function for the given lane type and vector
// size, calling the underlying (prefix##infix##suffix) intrinsic and using the
// template, return type, parameters and arguments selected by the "args"
// parameter (see the HWY_NEON_BUILD_* macros defined above).
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
HWY_CONCAT(HWY_NEON_BUILD_TPL_, args) \
HWY_INLINE HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size) \
name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) { \
return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)( \
HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args)); \
}
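// For illustration, HWY_NEON_DEF_FUNCTION(uint8_t, 16, operator+, vaddq, _, u8, 2)
// expands (roughly) to:
//   HWY_INLINE Vec128<uint8_t, 16> operator+(const Vec128<uint8_t, 16> a,
//                                            const Vec128<uint8_t, 16> b) {
//     return Vec128<uint8_t, 16>(vaddq_u8(a.raw, b.raw));
//   }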
// The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
// called "name", using the set of NEON intrinsics starting with the given
// "prefix", for all the variants of the types listed next to each macro. For
// example, the prefix "vsub" with args=2 can be used to define operator-.
// uint8_t
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION(uint8_t, 16, name, prefix##q, infix, u8, args) \
HWY_NEON_DEF_FUNCTION(uint8_t, 8, name, prefix, infix, u8, args) \
HWY_NEON_DEF_FUNCTION(uint8_t, 4, name, prefix, infix, u8, args) \
HWY_NEON_DEF_FUNCTION(uint8_t, 2, name, prefix, infix, u8, args) \
HWY_NEON_DEF_FUNCTION(uint8_t, 1, name, prefix, infix, u8, args)
// int8_t
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION(int8_t, 16, name, prefix##q, infix, s8, args) \
HWY_NEON_DEF_FUNCTION(int8_t, 8, name, prefix, infix, s8, args) \
HWY_NEON_DEF_FUNCTION(int8_t, 4, name, prefix, infix, s8, args) \
HWY_NEON_DEF_FUNCTION(int8_t, 2, name, prefix, infix, s8, args) \
HWY_NEON_DEF_FUNCTION(int8_t, 1, name, prefix, infix, s8, args)
// uint16_t
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION(uint16_t, 8, name, prefix##q, infix, u16, args) \
HWY_NEON_DEF_FUNCTION(uint16_t, 4, name, prefix, infix, u16, args) \
HWY_NEON_DEF_FUNCTION(uint16_t, 2, name, prefix, infix, u16, args) \
HWY_NEON_DEF_FUNCTION(uint16_t, 1, name, prefix, infix, u16, args)
// int16_t
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION(int16_t, 8, name, prefix##q, infix, s16, args) \
HWY_NEON_DEF_FUNCTION(int16_t, 4, name, prefix, infix, s16, args) \
HWY_NEON_DEF_FUNCTION(int16_t, 2, name, prefix, infix, s16, args) \
HWY_NEON_DEF_FUNCTION(int16_t, 1, name, prefix, infix, s16, args)
// uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION(uint32_t, 4, name, prefix##q, infix, u32, args) \
HWY_NEON_DEF_FUNCTION(uint32_t, 2, name, prefix, infix, u32, args) \
HWY_NEON_DEF_FUNCTION(uint32_t, 1, name, prefix, infix, u32, args)
// int32_t
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION(int32_t, 4, name, prefix##q, infix, s32, args) \
HWY_NEON_DEF_FUNCTION(int32_t, 2, name, prefix, infix, s32, args) \
HWY_NEON_DEF_FUNCTION(int32_t, 1, name, prefix, infix, s32, args)
// uint64_t
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION(uint64_t, 2, name, prefix##q, infix, u64, args) \
HWY_NEON_DEF_FUNCTION(uint64_t, 1, name, prefix, infix, u64, args)
// int64_t
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION(int64_t, 2, name, prefix##q, infix, s64, args) \
HWY_NEON_DEF_FUNCTION(int64_t, 1, name, prefix, infix, s64, args)
// float and double
#if defined(__aarch64__)
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args) \
HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args) \
HWY_NEON_DEF_FUNCTION(float, 1, name, prefix, infix, f32, args) \
HWY_NEON_DEF_FUNCTION(double, 2, name, prefix##q, infix, f64, args) \
HWY_NEON_DEF_FUNCTION(double, 1, name, prefix, infix, f64, args)
#else
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args) \
HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args) \
HWY_NEON_DEF_FUNCTION(float, 1, name, prefix, infix, f32, args)
#endif
// Helper macros to define for more than one type.
// uint8_t, uint16_t and uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
// int8_t, int16_t and int32_t
#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
// uint8_t, uint16_t, uint32_t and uint64_t
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
// int8_t, int16_t, int32_t and int64_t
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
// All int*_t and uint*_t up to 64
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
// All previous types.
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
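// For example, HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2) (used in
// the ARITHMETIC section below) defines operator+ for every lane type and
// vector size listed above, dispatching to vadd_* / vaddq_* as appropriate.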
// Emulation of some intrinsics on armv7.
#if !defined(__aarch64__)
#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
#endif
template <typename T, size_t N>
struct Raw128;
// 128
template <>
struct Raw128<uint8_t, 16> {
using type = uint8x16_t;
};
template <>
struct Raw128<uint16_t, 8> {
using type = uint16x8_t;
};
template <>
struct Raw128<uint32_t, 4> {
using type = uint32x4_t;
};
template <>
struct Raw128<uint64_t, 2> {
using type = uint64x2_t;
};
template <>
struct Raw128<int8_t, 16> {
using type = int8x16_t;
};
template <>
struct Raw128<int16_t, 8> {
using type = int16x8_t;
};
template <>
struct Raw128<int32_t, 4> {
using type = int32x4_t;
};
template <>
struct Raw128<int64_t, 2> {
using type = int64x2_t;
};
template <>
struct Raw128<float, 4> {
using type = float32x4_t;
};
#if defined(__aarch64__)
template <>
struct Raw128<double, 2> {
using type = float64x2_t;
};
#endif
// 64
template <>
struct Raw128<uint8_t, 8> {
using type = uint8x8_t;
};
template <>
struct Raw128<uint16_t, 4> {
using type = uint16x4_t;
};
template <>
struct Raw128<uint32_t, 2> {
using type = uint32x2_t;
};
template <>
struct Raw128<uint64_t, 1> {
using type = uint64x1_t;
};
template <>
struct Raw128<int8_t, 8> {
using type = int8x8_t;
};
template <>
struct Raw128<int16_t, 4> {
using type = int16x4_t;
};
template <>
struct Raw128<int32_t, 2> {
using type = int32x2_t;
};
template <>
struct Raw128<int64_t, 1> {
using type = int64x1_t;
};
template <>
struct Raw128<float, 2> {
using type = float32x2_t;
};
#if defined(__aarch64__)
template <>
struct Raw128<double, 1> {
using type = float64x1_t;
};
#endif
// 32 (same as 64)
template <>
struct Raw128<uint8_t, 4> {
using type = uint8x8_t;
};
template <>
struct Raw128<uint16_t, 2> {
using type = uint16x4_t;
};
template <>
struct Raw128<uint32_t, 1> {
using type = uint32x2_t;
};
template <>
struct Raw128<int8_t, 4> {
using type = int8x8_t;
};
template <>
struct Raw128<int16_t, 2> {
using type = int16x4_t;
};
template <>
struct Raw128<int32_t, 1> {
using type = int32x2_t;
};
template <>
struct Raw128<float, 1> {
using type = float32x2_t;
};
// 16 (same as 64)
template <>
struct Raw128<uint8_t, 2> {
using type = uint8x8_t;
};
template <>
struct Raw128<uint16_t, 1> {
using type = uint16x4_t;
};
template <>
struct Raw128<int8_t, 2> {
using type = int8x8_t;
};
template <>
struct Raw128<int16_t, 1> {
using type = int16x4_t;
};
// 8 (same as 64)
template <>
struct Raw128<uint8_t, 1> {
using type = uint8x8_t;
};
template <>
struct Raw128<int8_t, 1> {
using type = int8x8_t;
};
template <typename T>
using Full128 = Simd<T, 16 / sizeof(T)>;
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
using Raw = typename Raw128<T, N>::type;
public:
HWY_INLINE Vec128() {}
Vec128(const Vec128&) = default;
Vec128& operator=(const Vec128&) = default;
HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}
// Compound assignment. Only usable if there is a corresponding non-member
// binary operator overload. For example, only f32 and f64 support division.
HWY_INLINE Vec128& operator*=(const Vec128 other) {
return *this = (*this * other);
}
HWY_INLINE Vec128& operator/=(const Vec128 other) {
return *this = (*this / other);
}
HWY_INLINE Vec128& operator+=(const Vec128 other) {
return *this = (*this + other);
}
HWY_INLINE Vec128& operator-=(const Vec128 other) {
return *this = (*this - other);
}
HWY_INLINE Vec128& operator&=(const Vec128 other) {
return *this = (*this & other);
}
HWY_INLINE Vec128& operator|=(const Vec128 other) {
return *this = (*this | other);
}
HWY_INLINE Vec128& operator^=(const Vec128 other) {
return *this = (*this ^ other);
}
Raw raw;
};
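// Illustrative usage sketch (the pointer "out" is hypothetical):
//   const Full128<float> d;
//   auto v = Set(d, 1.0f);   // all four lanes equal 1.0f
//   v += Set(d, 2.0f);       // compound assignment uses operator+ below
//   Store(v, d, out);        // writes 3.0f to out[0..3]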
// FF..FF or 0, also for floating-point - see README.
template <typename T, size_t N = 16 / sizeof(T)>
class Mask128 {
using Raw = typename Raw128<T, N>::type;
public:
HWY_INLINE Mask128() {}
Mask128(const Mask128&) = default;
Mask128& operator=(const Mask128&) = default;
HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {}
Raw raw;
};
// ------------------------------ Cast
// cast_to_u8
// Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
// vreinterpret*_u8_*() set of functions.
#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
Vec128<uint8_t, size * sizeof(type)>
#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type, size> v
#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
// Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
template <size_t N>
HWY_INLINE Vec128<uint8_t, N> cast_to_u8(Vec128<uint8_t, N> v) {
return v;
}
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(cast_to_u8, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_INTS(cast_to_u8, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_16(cast_to_u8, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_32(cast_to_u8, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_64(cast_to_u8, vreinterpret, _u8_, HWY_CAST_TO_U8)
#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
// cast_u8_to
template <size_t N>
HWY_INLINE Vec128<uint8_t, N> cast_u8_to(Simd<uint8_t, N> /* tag */,
Vec128<uint8_t, N> v) {
return v;
}
// 64-bit or less:
template <size_t N, HWY_IF_LE64(int8_t, N)>
HWY_INLINE Vec128<int8_t, N> cast_u8_to(Simd<int8_t, N> /* tag */,
Vec128<uint8_t, N> v) {
return Vec128<int8_t, N>(vreinterpret_s8_u8(v.raw));
}
template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_INLINE Vec128<uint16_t, N> cast_u8_to(Simd<uint16_t, N> /* tag */,
Vec128<uint8_t, N * 2> v) {
return Vec128<uint16_t, N>(vreinterpret_u16_u8(v.raw));
}
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_INLINE Vec128<int16_t, N> cast_u8_to(Simd<int16_t, N> /* tag */,
Vec128<uint8_t, N * 2> v) {
return Vec128<int16_t, N>(vreinterpret_s16_u8(v.raw));
}
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_INLINE Vec128<uint32_t, N> cast_u8_to(Simd<uint32_t, N> /* tag */,
Vec128<uint8_t, N * 4> v) {
return Vec128<uint32_t, N>(vreinterpret_u32_u8(v.raw));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int32_t, N> cast_u8_to(Simd<int32_t, N> /* tag */,
Vec128<uint8_t, N * 4> v) {
return Vec128<int32_t, N>(vreinterpret_s32_u8(v.raw));
}
template <size_t N, HWY_IF_LE64(float, N)>
HWY_INLINE Vec128<float, N> cast_u8_to(Simd<float, N> /* tag */,
Vec128<uint8_t, N * 4> v) {
return Vec128<float, N>(vreinterpret_f32_u8(v.raw));
}
HWY_INLINE Vec128<uint64_t, 1> cast_u8_to(Simd<uint64_t, 1> /* tag */,
Vec128<uint8_t, 1 * 8> v) {
return Vec128<uint64_t, 1>(vreinterpret_u64_u8(v.raw));
}
HWY_INLINE Vec128<int64_t, 1> cast_u8_to(Simd<int64_t, 1> /* tag */,
Vec128<uint8_t, 1 * 8> v) {
return Vec128<int64_t, 1>(vreinterpret_s64_u8(v.raw));
}
#if defined(__aarch64__)
HWY_INLINE Vec128<double, 1> cast_u8_to(Simd<double, 1> /* tag */,
Vec128<uint8_t, 1 * 8> v) {
return Vec128<double, 1>(vreinterpret_f64_u8(v.raw));
}
#endif
// 128-bit full:
HWY_INLINE Vec128<int8_t> cast_u8_to(Full128<int8_t> /* tag */,
Vec128<uint8_t> v) {
return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
}
HWY_INLINE Vec128<uint16_t> cast_u8_to(Full128<uint16_t> /* tag */,
Vec128<uint8_t> v) {
return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
}
HWY_INLINE Vec128<int16_t> cast_u8_to(Full128<int16_t> /* tag */,
Vec128<uint8_t> v) {
return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
}
HWY_INLINE Vec128<uint32_t> cast_u8_to(Full128<uint32_t> /* tag */,
Vec128<uint8_t> v) {
return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
}
HWY_INLINE Vec128<int32_t> cast_u8_to(Full128<int32_t> /* tag */,
Vec128<uint8_t> v) {
return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
}
HWY_INLINE Vec128<float> cast_u8_to(Full128<float> /* tag */,
Vec128<uint8_t> v) {
return Vec128<float>(vreinterpretq_f32_u8(v.raw));
}
HWY_INLINE Vec128<uint64_t> cast_u8_to(Full128<uint64_t> /* tag */,
Vec128<uint8_t> v) {
return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
}
HWY_INLINE Vec128<int64_t> cast_u8_to(Full128<int64_t> /* tag */,
Vec128<uint8_t> v) {
return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
}
#if defined(__aarch64__)
HWY_INLINE Vec128<double> cast_u8_to(Full128<double> /* tag */,
Vec128<uint8_t> v) {
return Vec128<double>(vreinterpretq_f64_u8(v.raw));
}
#endif
// BitCast
template <typename T, size_t N, typename FromT>
HWY_INLINE Vec128<T, N> BitCast(
Simd<T, N> d, Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
const auto u8 = cast_to_u8(v);
return cast_u8_to(d, u8);
}
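// Example: reinterpret float lanes as their IEEE-754 bit patterns.
//   const Full128<uint32_t> du;
//   const Full128<float> df;
//   const auto bits = BitCast(du, Set(df, 1.0f));  // each u32 lane = 0x3F800000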
// ------------------------------ Set
// Returns a vector with all lanes set to "t".
#define HWY_NEON_BUILD_TPL_HWY_SET1
#define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128<type, size>
#define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \
Simd<type, size> /* tag */, const type t
#define HWY_NEON_BUILD_ARG_HWY_SET1 t
HWY_NEON_DEF_FUNCTION_ALL_TYPES(Set, vdup, _n_, HWY_SET1)
#undef HWY_NEON_BUILD_TPL_HWY_SET1
#undef HWY_NEON_BUILD_RET_HWY_SET1
#undef HWY_NEON_BUILD_PARAM_HWY_SET1
#undef HWY_NEON_BUILD_ARG_HWY_SET1
// Returns an all-zero vector.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Zero(Simd<T, N> d) {
return Set(d, 0);
}
// Returns a vector with uninitialized elements.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Undefined(Simd<T, N> /*d*/) {
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
typename Raw128<T, N>::type a;
return Vec128<T, N>(a);
HWY_DIAGNOSTICS(pop)
}
// ================================================== ARITHMETIC
// ------------------------------ Addition
HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2)
// ------------------------------ Subtraction
HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2)
// ------------------------------ Saturating addition and subtraction
// Only defined for uint8_t, uint16_t and their signed versions, as in other
// architectures.
// Returns a + b clamped to the destination range.
HWY_NEON_DEF_FUNCTION_INT_8(SaturatedAdd, vqadd, _, 2)
HWY_NEON_DEF_FUNCTION_INT_16(SaturatedAdd, vqadd, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedAdd, vqadd, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedAdd, vqadd, _, 2)
// Returns a - b clamped to the destination range.
HWY_NEON_DEF_FUNCTION_INT_8(SaturatedSub, vqsub, _, 2)
HWY_NEON_DEF_FUNCTION_INT_16(SaturatedSub, vqsub, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedSub, vqsub, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedSub, vqsub, _, 2)
// ------------------------------ Average
// Returns (a + b + 1) / 2
// Unsigned
HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2)
// ------------------------------ Absolute value
// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
HWY_INLINE Vec128<int8_t> Abs(const Vec128<int8_t> v) {
return Vec128<int8_t>(vabsq_s8(v.raw));
}
HWY_INLINE Vec128<int16_t> Abs(const Vec128<int16_t> v) {
return Vec128<int16_t>(vabsq_s16(v.raw));
}
HWY_INLINE Vec128<int32_t> Abs(const Vec128<int32_t> v) {
return Vec128<int32_t>(vabsq_s32(v.raw));
}
HWY_INLINE Vec128<float> Abs(const Vec128<float> v) {
return Vec128<float>{vabsq_f32(v.raw)};
}
template <size_t N, HWY_IF_LE64(int8_t, N)>
HWY_INLINE Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
return Vec128<int8_t, N>(vabs_s8(v.raw));
}
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_INLINE Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
return Vec128<int16_t, N>(vabs_s16(v.raw));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
return Vec128<int32_t, N>(vabs_s32(v.raw));
}
template <size_t N, HWY_IF_LE64(float, N)>
HWY_INLINE Vec128<float, N> Abs(const Vec128<float, N> v) {
return Vec128<float, N>{vabs_f32(v.raw)};
}
#if defined(__aarch64__)
HWY_INLINE Vec128<double> Abs(const Vec128<double> v) {
return Vec128<double>{vabsq_f64(v.raw)};
}
HWY_INLINE Vec128<double, 1> Abs(const Vec128<double, 1> v) {
return Vec128<double, 1>{vabs_f64(v.raw)};
}
#endif
// ------------------------------ Shift lanes by constant #bits
// Only defined for ints and uints, except for signed i64 shr.
#define HWY_NEON_BUILD_TPL_HWY_SHIFT template <int kBits>
#define HWY_NEON_BUILD_RET_HWY_SHIFT(type, size) Vec128<type, size>
#define HWY_NEON_BUILD_PARAM_HWY_SHIFT(type, size) const Vec128<type, size> v
#define HWY_NEON_BUILD_ARG_HWY_SHIFT v.raw, kBits
HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, HWY_SHIFT)
HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, HWY_SHIFT)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(ShiftRight, vshr, _n_, HWY_SHIFT)
#undef HWY_NEON_BUILD_TPL_HWY_SHIFT
#undef HWY_NEON_BUILD_RET_HWY_SHIFT
#undef HWY_NEON_BUILD_PARAM_HWY_SHIFT
#undef HWY_NEON_BUILD_ARG_HWY_SHIFT
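// For example, ShiftLeft<3> applied to a full Vec128<uint32_t> expands (via the
// macros above) to Vec128<uint32_t, 4>(vshlq_n_u32(v.raw, 3)), i.e. every lane
// is shifted left by the compile-time constant 3.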
// ------------------------------ Shift lanes by independent variable #bits
// Unsigned (no u8,u16)
HWY_INLINE Vec128<uint32_t> operator<<(const Vec128<uint32_t> v,
const Vec128<uint32_t> bits) {
return Vec128<uint32_t>(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw)));
}
HWY_INLINE Vec128<uint32_t> operator>>(const Vec128<uint32_t> v,
const Vec128<uint32_t> bits) {
return Vec128<uint32_t>(
vshlq_u32(v.raw, vnegq_s32(vreinterpretq_s32_u32(bits.raw))));
}
HWY_INLINE Vec128<uint64_t> operator<<(const Vec128<uint64_t> v,
const Vec128<uint64_t> bits) {
return Vec128<uint64_t>(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw)));
}
HWY_INLINE Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
const Vec128<uint64_t> bits) {
#if defined(__aarch64__)
const int64x2_t neg_bits = vnegq_s64(vreinterpretq_s64_u64(bits.raw));
#else
// A32 doesn't have vnegq_s64().
const int64x2_t neg_bits =
vsubq_s64(Set(Full128<int64_t>(), 0).raw, bits.raw);
#endif
return Vec128<uint64_t>(vshlq_u64(v.raw, neg_bits));
}
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_INLINE Vec128<uint32_t, N> operator<<(const Vec128<uint32_t, N> v,
const Vec128<uint32_t, N> bits) {
return Vec128<uint32_t, N>(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw)));
}
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_INLINE Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> v,
const Vec128<uint32_t, N> bits) {
return Vec128<uint32_t, N>(
vshl_u32(v.raw, vneg_s32(vreinterpret_s32_u32(bits.raw))));
}
HWY_INLINE Vec128<uint64_t, 1> operator<<(const Vec128<uint64_t, 1> v,
const Vec128<uint64_t, 1> bits) {
return Vec128<uint64_t, 1>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw)));
}
HWY_INLINE Vec128<uint64_t, 1> operator>>(const Vec128<uint64_t, 1> v,
const Vec128<uint64_t, 1> bits) {
#if defined(__aarch64__)
const int64x1_t neg_bits = vneg_s64(vreinterpret_s64_u64(bits.raw));
#else
// A32 doesn't have vneg_s64().
const int64x1_t neg_bits = vsub_s64(Set(Simd<int64_t, 1>(), 0).raw, bits.raw);
#endif
return Vec128<uint64_t, 1>(vshl_u64(v.raw, neg_bits));
}
// Signed (no i8,i16)
HWY_INLINE Vec128<int32_t> operator<<(const Vec128<int32_t> v,
const Vec128<int32_t> bits) {
return Vec128<int32_t>(vshlq_s32(v.raw, bits.raw));
}
HWY_INLINE Vec128<int32_t> operator>>(const Vec128<int32_t> v,
const Vec128<int32_t> bits) {
return Vec128<int32_t>(vshlq_s32(v.raw, vnegq_s32(bits.raw)));
}
HWY_INLINE Vec128<int64_t> operator<<(const Vec128<int64_t> v,
const Vec128<int64_t> bits) {
return Vec128<int64_t>(vshlq_s64(v.raw, bits.raw));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int32_t, N> operator<<(const Vec128<int32_t, N> v,
const Vec128<int32_t, N> bits) {
return Vec128<int32_t, N>(vshl_s32(v.raw, bits.raw));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int32_t, N> operator>>(const Vec128<int32_t, N> v,
const Vec128<int32_t, N> bits) {
return Vec128<int32_t, N>(vshl_s32(v.raw, vneg_s32(bits.raw)));
}
HWY_INLINE Vec128<int64_t, 1> operator<<(const Vec128<int64_t, 1> v,
const Vec128<int64_t, 1> bits) {
return Vec128<int64_t, 1>(vshl_s64(v.raw, bits.raw));
}
// ------------------------------ Minimum
// Unsigned (no u64)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2)
// Signed (no i64)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2)
// Float
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2)
// ------------------------------ Maximum
// Unsigned (no u64)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2)
// Signed (no i64)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2)
// Float
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2)
// ------------------------------ Integer multiplication
// Unsigned
HWY_INLINE Vec128<uint16_t> operator*(const Vec128<uint16_t> a,
const Vec128<uint16_t> b) {
return Vec128<uint16_t>(vmulq_u16(a.raw, b.raw));
}
HWY_INLINE Vec128<uint32_t> operator*(const Vec128<uint32_t> a,
const Vec128<uint32_t> b) {
return Vec128<uint32_t>(vmulq_u32(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_INLINE Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>(vmul_u16(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_INLINE Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>(vmul_u32(a.raw, b.raw));
}
// Signed
HWY_INLINE Vec128<int16_t> operator*(const Vec128<int16_t> a,
const Vec128<int16_t> b) {
return Vec128<int16_t>(vmulq_s16(a.raw, b.raw));
}
HWY_INLINE Vec128<int32_t> operator*(const Vec128<int32_t> a,
const Vec128<int32_t> b) {
return Vec128<int32_t>(vmulq_s32(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_INLINE Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int16_t, N>(vmul_s16(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Vec128<int32_t, N>(vmul_s32(a.raw, b.raw));
}
// Returns the upper 16 bits of a * b in each lane.
HWY_INLINE Vec128<int16_t> MulHigh(const Vec128<int16_t> a,
const Vec128<int16_t> b) {
int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
#if defined(__aarch64__)
int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
#else
int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
#endif
return Vec128<int16_t>(
vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
}
HWY_INLINE Vec128<uint16_t> MulHigh(const Vec128<uint16_t> a,
const Vec128<uint16_t> b) {
uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
#if defined(__aarch64__)
uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
#else
uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
#endif
return Vec128<uint16_t>(
vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
}
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_INLINE Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw));
return Vec128<int16_t, N>(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo)));
}
template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_INLINE Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw));
return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
}
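// Example: with int16_t lanes a = 16384 and b = 4, the full product is
// 65536 = 0x00010000, so MulHigh returns 1 in every lane.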
// Multiplies even lanes (0, 2, ...) and places the double-wide result into the
// even/odd lane pair: lower half in the even lane, upper half in its odd
// neighbor.
HWY_INLINE Vec128<int64_t> MulEven(const Vec128<int32_t> a,
const Vec128<int32_t> b) {
int32x4_t a_packed = vuzp1q_s32(a.raw, a.raw);
int32x4_t b_packed = vuzp1q_s32(b.raw, b.raw);
return Vec128<int64_t>(
vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
}
HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint32_t> a,
const Vec128<uint32_t> b) {
uint32x4_t a_packed = vuzp1q_u32(a.raw, a.raw);
uint32x4_t b_packed = vuzp1q_u32(b.raw, b.raw);
return Vec128<uint64_t>(
vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
}
template <size_t N>
HWY_INLINE Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
int32x2_t a_packed = vuzp1_s32(a.raw, a.raw);
int32x2_t b_packed = vuzp1_s32(b.raw, b.raw);
return Vec128<int64_t, (N + 1) / 2>(
vget_low_s64(vmull_s32(a_packed, b_packed)));
}
template <size_t N>
HWY_INLINE Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
uint32x2_t a_packed = vuzp1_u32(a.raw, a.raw);
uint32x2_t b_packed = vuzp1_u32(b.raw, b.raw);
return Vec128<uint64_t, (N + 1) / 2>(
vget_low_u64(vmull_u32(a_packed, b_packed)));
}
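// Example: with int32_t lanes a = {1, 2, 3, 4} and b = {10, 20, 30, 40},
// MulEven returns the two int64_t lanes {1 * 10, 3 * 30} = {10, 90}.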
// ------------------------------ Floating-point and integer negate
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1)
HWY_INLINE Vec128<int64_t, 1> Neg(const Vec128<int64_t, 1> v) {
#if defined(__aarch64__)
return Vec128<int64_t, 1>(vneg_s64(v.raw));
#else
return Zero(Simd<int64_t, 1>()) - v;
#endif
}
HWY_INLINE Vec128<int64_t> Neg(const Vec128<int64_t> v) {
#if defined(__aarch64__)
return Vec128<int64_t>(vnegq_s64(v.raw));
#else
return Zero(Full128<int64_t>()) - v;
#endif
}
// ------------------------------ Floating-point mul / div
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)
// Approximate reciprocal
HWY_INLINE Vec128<float> ApproximateReciprocal(const Vec128<float> v) {
return Vec128<float>(vrecpeq_f32(v.raw));
}
template <size_t N>
HWY_INLINE Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
return Vec128<float, N>(vrecpe_f32(v.raw));
}
#if defined(__aarch64__)
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2)
#else
// Emulated with approx reciprocal + Newton-Raphson + mul
template <size_t N>
HWY_INLINE Vec128<float, N> operator/(const Vec128<float, N> a,
const Vec128<float, N> b) {
auto x = ApproximateReciprocal(b);
// Newton-Raphson on 1/x - b
const auto two = Set(Simd<float, N>(), 2);
x = x * (two - b * x);
x = x * (two - b * x);
x = x * (two - b * x);
return a * x;
}
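// Sketch of the update rule used above: for f(x) = 1/x - b, Newton-Raphson
// gives x' = x - f(x) / f'(x) = x * (2 - b * x); each step roughly doubles the
// number of correct bits of the reciprocal estimate, so three steps suffice
// for float precision starting from the vrecpe estimate.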
#endif
// Absolute value of difference.
HWY_INLINE Vec128<float> AbsDiff(const Vec128<float> a, const Vec128<float> b) {
return Vec128<float>(vabdq_f32(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(float, N)>
HWY_INLINE Vec128<float, N> AbsDiff(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Vec128<float, N>(vabd_f32(a.raw, b.raw));
}
// ------------------------------ Floating-point multiply-add variants
// Returns add + mul * x
#if defined(__aarch64__)
template <size_t N, HWY_IF_LE64(float, N)>
HWY_INLINE Vec128<float, N> MulAdd(const Vec128<float, N> mul,
const Vec128<float, N> x,
const Vec128<float, N> add) {
return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw));
}
HWY_INLINE Vec128<float> MulAdd(const Vec128<float> mul, const Vec128<float> x,
const Vec128<float> add) {
return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
}
HWY_INLINE Vec128<double, 1> MulAdd(const Vec128<double, 1> mul,
const Vec128<double, 1> x,
const Vec128<double, 1> add) {
return Vec128<double, 1>(vfma_f64(add.raw, mul.raw, x.raw));
}
HWY_INLINE Vec128<double> MulAdd(const Vec128<double> mul,
const Vec128<double> x,
const Vec128<double> add) {
return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
}
#else
// Emulate FMA for floats.
template <size_t N>
HWY_INLINE Vec128<float, N> MulAdd(const Vec128<float, N> mul,
const Vec128<float, N> x,
const Vec128<float, N> add) {
return mul * x + add;
}
#endif
// Returns add - mul * x
#if defined(__aarch64__)
template <size_t N, HWY_IF_LE64(float, N)>
HWY_INLINE Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
const Vec128<float, N> x,
const Vec128<float, N> add) {
return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw));
}
HWY_INLINE Vec128<float> NegMulAdd(const Vec128<float> mul,
const Vec128<float> x,
const Vec128<float> add) {
return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
}
HWY_INLINE Vec128<double, 1> NegMulAdd(const Vec128<double, 1> mul,
const Vec128<double, 1> x,
const Vec128<double, 1> add) {
return Vec128<double, 1>(vfms_f64(add.raw, mul.raw, x.raw));
}
HWY_INLINE Vec128<double> NegMulAdd(const Vec128<double> mul,
const Vec128<double> x,
const Vec128<double> add) {
return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
}
#else
// Emulate FMA for floats.
template <size_t N>
HWY_INLINE Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
const Vec128<float, N> x,
const Vec128<float, N> add) {
return add - mul * x;
}
#endif
// Returns mul * x - sub
template <size_t N>
HWY_INLINE Vec128<float, N> MulSub(const Vec128<float, N> mul,
const Vec128<float, N> x,
const Vec128<float, N> sub) {
return MulAdd(mul, x, Neg(sub));
}
template <size_t N>
HWY_INLINE Vec128<double, N> MulSub(const Vec128<double, N> mul,
const Vec128<double, N> x,
const Vec128<double, N> sub) {
return MulAdd(mul, x, Neg(sub));
}
// Returns -mul * x - sub
template <size_t N>
HWY_INLINE Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
const Vec128<float, N> x,
const Vec128<float, N> sub) {
return Neg(MulAdd(mul, x, sub));
}
template <size_t N>
HWY_INLINE Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
const Vec128<double, N> x,
const Vec128<double, N> sub) {
return Neg(MulAdd(mul, x, sub));
}
// ------------------------------ Floating-point square root
// Approximate reciprocal square root
HWY_INLINE Vec128<float> ApproximateReciprocalSqrt(const Vec128<float> v) {
return Vec128<float>(vrsqrteq_f32(v.raw));
}
template <size_t N>
HWY_INLINE Vec128<float, N> ApproximateReciprocalSqrt(
const Vec128<float, N> v) {
return Vec128<float, N>(vrsqrte_f32(v.raw));
}
// Full precision square root
#if defined(__aarch64__)
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1)
#else
// Not defined on armv7: emulate with approx reciprocal sqrt + Goldschmidt.
template <size_t N>
HWY_INLINE Vec128<float, N> Sqrt(const Vec128<float, N> v) {
auto b = v;
auto Y = ApproximateReciprocalSqrt(v);
auto x = v * Y;
const auto half = Set(Simd<float, N>(), 0.5);
const auto oneandhalf = Set(Simd<float, N>(), 1.5);
for (size_t i = 0; i < 3; i++) {
b = b * Y * Y;
Y = oneandhalf - half * b;
x = x * Y;
}
return IfThenZeroElse(v == Zero(Simd<float, N>()), x);
}
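// Sketch of why this converges: b tracks v * P^2 and x tracks v * P, where P
// is the running product of the Y factors. Each update Y = 1.5 - 0.5 * b
// drives b toward 1, so P approaches 1/sqrt(v) and x approaches sqrt(v). The
// final select returns 0 for v == 0, where the reciprocal sqrt estimate is
// infinite and x would otherwise be NaN.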
#endif
// ------------------------------ Floating-point rounding
#if defined(__aarch64__)
// Toward nearest integer
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1)
// Toward zero, aka truncate
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1)
// Toward +infinity, aka ceiling
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1)
// Toward -infinity, aka floor
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1)
#else
template <size_t N>
HWY_INLINE Vec128<float, N> Trunc(const Vec128<float, N> v) {
const Simd<uint32_t, N> du;
const Simd<int32_t, N> di;
const Simd<float, N> df;
const auto v_bits = BitCast(du, v);
const auto biased_exp = ShiftRight<23>(v_bits) & Set(du, 0xFF);
const auto bits_to_remove =
Set(du, 150) - Max(Min(biased_exp, Set(du, 150)), Set(du, 127));
const auto mask = (Set(du, 1) << bits_to_remove) - Set(du, 1);
return BitCast(df, IfThenZeroElse(BitCast(di, biased_exp) < Set(di, 127),
BitCast(di, AndNot(mask, v_bits))));
}
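// Note on the constants: 127 is the IEEE-754 single-precision exponent bias
// and 150 == 127 + 23, so for a biased exponent e in [127, 150] there are
// 150 - e fraction bits below the binary point; clearing them truncates toward
// zero. e > 150 means the value is already an integer (nothing to remove), and
// e < 127 (|v| < 1) is handled by the final IfThenZeroElse.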
// WARNING: does not quite have the same semantics as what NEON does on
// aarch64. In particular, does not break ties to even.
template <size_t N>
HWY_INLINE Vec128<float, N> Round(const Vec128<float, N> v) {
const Simd<uint32_t, N> du;
const Simd<float, N> df;
const auto sign_mask = BitCast(df, Set(du, 0x80000000u));
  // Add 0.5 with the sign of v (i.e. move away from zero), then truncate.
return Trunc(v + ((v & sign_mask) | Set(df, 0.5f)));
}
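// Example of the difference: Round(2.5f) here yields 3.0f (2.5f + 0.5f == 3.0f,
// then Trunc), whereas the aarch64 vrndn path breaks the tie to even, i.e. 2.0f.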
template <size_t N>
HWY_INLINE Vec128<float, N> Ceil(const Vec128<float, N> v) {
const Simd<uint32_t, N> du;
const Simd<int32_t, N> di;
const Simd<float, N> df;
const auto sign_mask = Set(du, 0x80000000u);
const auto v_bits = BitCast(du, v);
const auto biased_exp = ShiftRight<23>(v_bits) & Set(du, 0xFF);
const auto bits_to_remove =
Set(du, 150) - Max(Min(biased_exp, Set(du, 150)), Set(du, 127));
const auto high_bit = Set(du, 1) << bits_to_remove;
const auto mask = high_bit - Set(du, 1);
const auto removed_bits = mask & v_bits;
  // Round up if the value is positive and at least one removed fraction bit
  // was set.
const auto should_round_up = MaskFromVec(
BitCast(df, AndNot(VecFromMask(removed_bits == Zero(du)),
VecFromMask(Zero(du) == (v_bits & sign_mask)))));
const auto add_one = IfThenElseZero(should_round_up, Set(df, 1.0f));
const auto rounded =
BitCast(df, IfThenZeroElse(BitCast(di, biased_exp) < Set(di, 127),
BitCast(di, AndNot(mask, v_bits))));
return rounded + add_one;
}
template <size_t N>
HWY_INLINE Vec128<float, N> Floor(const Vec128<float, N> v) {
const Simd<float, N> df;
const auto zero = Zero(df);
return zero - Ceil(zero - v);
}
#endif
// ================================================== COMPARE
#define HWY_NEON_BUILD_TPL_HWY_COMPARE
#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type, size>
#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
const Vec128<type, size> a, const Vec128<type, size> b
#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
// Comparisons fill a lane with 1-bits if the condition is true, else 0.
// ------------------------------ Equality
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
#if defined(__aarch64__)
HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE);
#else
// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
#endif
// ------------------------------ Strict inequality
// Signed/float < (no unsigned)
#if defined(__aarch64__)
HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE)
#else
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
#endif
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)
// Signed/float > (no unsigned)
#if defined(__aarch64__)
HWY_NEON_DEF_FUNCTION_INTS(operator>, vcgt, _, HWY_COMPARE)
#else
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator>, vcgt, _, HWY_COMPARE)
#endif
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>, vcgt, _, HWY_COMPARE)
// ------------------------------ Weak inequality
// Float <= >=
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>=, vcge, _, HWY_COMPARE)
#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
#undef HWY_NEON_BUILD_RET_HWY_COMPARE
#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
#undef HWY_NEON_BUILD_ARG_HWY_COMPARE
// ================================================== LOGICAL
// ------------------------------ Bitwise AND
HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2)
// These float/double And() overloads rely on the u32/u64 And() just defined by
// the HWY_NEON_DEF_FUNCTION_INTS_UINTS() macro, applied via BitCast.
template <size_t N>
HWY_INLINE Vec128<float, N> And(const Vec128<float, N> a,
const Vec128<float, N> b) {
const Simd<uint32_t, N> d;
return BitCast(Simd<float, N>(), BitCast(d, a) & BitCast(d, b));
}
template <size_t N>
HWY_INLINE Vec128<double, N> And(const Vec128<double, N> a,
const Vec128<double, N> b) {
const Simd<uint64_t, N> d;
return BitCast(Simd<double, N>(), BitCast(d, a) & BitCast(d, b));
}
// ------------------------------ Bitwise AND-NOT
namespace internal {
// reversed_andnot returns a & ~b.
HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2)
} // namespace internal
// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
const Vec128<T, N> mask) {
return internal::reversed_andnot(mask, not_mask);
}
// These float/double AndNot() overloads rely on the u32/u64 reversed_andnot()
// just defined by the HWY_NEON_DEF_FUNCTION_INTS_UINTS() macro, applied via
// BitCast.
template <size_t N>
HWY_INLINE Vec128<float, N> AndNot(const Vec128<float, N> not_mask,
const Vec128<float, N> mask) {
const Simd<uint32_t, N> du;
Vec128<uint32_t, N> ret =
internal::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask));
return BitCast(Simd<float, N>(), ret);
}
#if defined(__aarch64__)
template <size_t N>
HWY_INLINE Vec128<double, N> AndNot(const Vec128<double, N> not_mask,
const Vec128<double, N> mask) {
const Simd<uint64_t, N> du;
Vec128<uint64_t, N> ret =
internal::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask));
return BitCast(Simd<double, N>(), ret);
}
#endif
// ------------------------------ Bitwise OR
HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2)
// These float/double Or() overloads rely on the u32/u64 Or() just defined by
// the HWY_NEON_DEF_FUNCTION_INTS_UINTS() macro, applied via BitCast.
template <size_t N>
HWY_INLINE Vec128<float, N> Or(const Vec128<float, N> a,
const Vec128<float, N> b) {
const Simd<uint32_t, N> d;
return BitCast(Simd<float, N>(), BitCast(d, a) | BitCast(d, b));
}
template <size_t N>
HWY_INLINE Vec128<double, N> Or(const Vec128<double, N> a,
const Vec128<double, N> b) {
const Simd<uint64_t, N> d;
return BitCast(Simd<double, N>(), BitCast(d, a) | BitCast(d, b));
}
// ------------------------------ Bitwise XOR
HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2)
// These float/double Xor() overloads rely on the u32/u64 Xor() just defined by
// the HWY_NEON_DEF_FUNCTION_INTS_UINTS() macro, applied via BitCast.
template <size_t N>
HWY_INLINE Vec128<float, N> Xor(const Vec128<float, N> a,
const Vec128<float, N> b) {
const Simd<uint32_t, N> d;
return BitCast(Simd<float, N>(), BitCast(d, a) ^ BitCast(d, b));
}
template <size_t N>
HWY_INLINE Vec128<double, N> Xor(const Vec128<double, N> a,
const Vec128<double, N> b) {
const Simd<uint64_t, N> d;
return BitCast(Simd<double, N>(), BitCast(d, a) ^ BitCast(d, b));
}
// ------------------------------ Operator overloads (internal-only if float)
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
return And(a, b);
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
return Or(a, b);
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
return Xor(a, b);
}
// ------------------------------ CopySign
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
const Vec128<T, N> sign) {
static_assert(IsFloat<T>(), "Only makes sense for floating-point");
const auto msb = SignBit(Simd<T, N>());
return Or(AndNot(msb, magn), And(msb, sign));
}
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
const Vec128<T, N> sign) {
static_assert(IsFloat<T>(), "Only makes sense for floating-point");
return Or(abs, And(SignBit(Simd<T, N>()), sign));
}
// ------------------------------ Make mask
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
return (v & bit) == bit;
}
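// Example: with an unsigned tag du, TestBit(Set(du, 6u), Set(du, 2u)) is
// all-true because bit 1 is set in 6, whereas TestBit(Set(du, 4u), Set(du, 2u))
// is all-false.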
// Mask and Vec are the same (true = FF..FF).
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
return Mask128<T, N>(v.raw);
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
return Vec128<T, N>(v.raw);
}
// IfThenElse(mask, yes, no)
// Returns mask ? yes : no.
#define HWY_NEON_BUILD_TPL_HWY_IF
#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type, size>
#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \
const Mask128<type, size> mask, const Vec128<type, size> yes, \
const Vec128<type, size> no
#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw
HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)
#undef HWY_NEON_BUILD_TPL_HWY_IF
#undef HWY_NEON_BUILD_RET_HWY_IF
#undef HWY_NEON_BUILD_PARAM_HWY_IF
#undef HWY_NEON_BUILD_ARG_HWY_IF
// mask ? yes : 0
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenElseZero(const Mask128<T, N> mask,
const Vec128<T, N> yes) {
return yes & VecFromMask(mask);
}
// mask ? 0 : no
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IfThenZeroElse(const Mask128<T, N> mask,
const Vec128<T, N> no) {
return AndNot(VecFromMask(mask), no);
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
const auto zero = Zero(Simd<T, N>());
return Max(zero, v);
}
// ================================================== MEMORY
// ------------------------------ Load 128
HWY_INLINE Vec128<uint8_t> LoadU(Full128<uint8_t> /* tag */,
const uint8_t* HWY_RESTRICT aligned) {
return Vec128<uint8_t>(vld1q_u8(aligned));
}
HWY_INLINE Vec128<uint16_t> LoadU(Full128<uint16_t> /* tag */,
const uint16_t* HWY_RESTRICT aligned) {
return Vec128<uint16_t>(vld1q_u16(aligned));
}
HWY_INLINE Vec128<uint32_t> LoadU(Full128<uint32_t> /* tag */,
const uint32_t* HWY_RESTRICT aligned) {
return Vec128<uint32_t>(vld1q_u32(aligned));
}
HWY_INLINE Vec128<uint64_t> LoadU(Full128<uint64_t> /* tag */,
const uint64_t* HWY_RESTRICT aligned) {
return Vec128<uint64_t>(vld1q_u64(aligned));
}
HWY_INLINE Vec128<int8_t> LoadU(Full128<int8_t> /* tag */,
const int8_t* HWY_RESTRICT aligned) {
return Vec128<int8_t>(vld1q_s8(aligned));
}
HWY_INLINE Vec128<int16_t> LoadU(Full128<int16_t> /* tag */,
const int16_t* HWY_RESTRICT aligned) {
return Vec128<int16_t>(vld1q_s16(aligned));
}
HWY_INLINE Vec128<int32_t> LoadU(Full128<int32_t> /* tag */,
const int32_t* HWY_RESTRICT aligned) {
return Vec128<int32_t>(vld1q_s32(aligned));
}
HWY_INLINE Vec128<int64_t> LoadU(Full128<int64_t> /* tag */,
const int64_t* HWY_RESTRICT aligned) {
return Vec128<int64_t>(vld1q_s64(aligned));
}
HWY_INLINE Vec128<float> LoadU(Full128<float> /* tag */,
const float* HWY_RESTRICT aligned) {
return Vec128<float>(vld1q_f32(aligned));
}
#if defined(__aarch64__)
HWY_INLINE Vec128<double> LoadU(Full128<double> /* tag */,
const double* HWY_RESTRICT aligned) {
return Vec128<double>(vld1q_f64(aligned));
}
#endif
// ------------------------------ Load 64
HWY_INLINE Vec128<uint8_t, 8> LoadU(Simd<uint8_t, 8> /* tag */,
const uint8_t* HWY_RESTRICT p) {
return Vec128<uint8_t, 8>(vld1_u8(p));
}
HWY_INLINE Vec128<uint16_t, 4> LoadU(Simd<uint16_t, 4> /* tag */,
const uint16_t* HWY_RESTRICT p) {
return Vec128<uint16_t, 4>(vld1_u16(p));
}
HWY_INLINE Vec128<uint32_t, 2> LoadU(Simd<uint32_t, 2> /* tag */,
const uint32_t* HWY_RESTRICT p) {
return Vec128<uint32_t, 2>(vld1_u32(p));
}
HWY_INLINE Vec128<uint64_t, 1> LoadU(Simd<uint64_t, 1> /* tag */,
const uint64_t* HWY_RESTRICT p) {
return Vec128<uint64_t, 1>(vld1_u64(p));
}
HWY_INLINE Vec128<int8_t, 8> LoadU(Simd<int8_t, 8> /* tag */,
const int8_t* HWY_RESTRICT p) {
return Vec128<int8_t, 8>(vld1_s8(p));
}
HWY_INLINE Vec128<int16_t, 4> LoadU(Simd<int16_t, 4> /* tag */,
const int16_t* HWY_RESTRICT p) {
return Vec128<int16_t, 4>(vld1_s16(p));
}
HWY_INLINE Vec128<int32_t, 2> LoadU(Simd<int32_t, 2> /* tag */,
const int32_t* HWY_RESTRICT p) {
return Vec128<int32_t, 2>(vld1_s32(p));
}
HWY_INLINE Vec128<int64_t, 1> LoadU(Simd<int64_t, 1> /* tag */,
const int64_t* HWY_RESTRICT p) {
return Vec128<int64_t, 1>(vld1_s64(p));
}
HWY_INLINE Vec128<float, 2> LoadU(Simd<float, 2> /* tag */,
const float* HWY_RESTRICT p) {
return Vec128<float, 2>(vld1_f32(p));
}
#if defined(__aarch64__)
HWY_INLINE Vec128<double, 1> LoadU(Simd<double, 1> /* tag */,
const double* HWY_RESTRICT p) {
return Vec128<double, 1>(vld1_f64(p));
}
#endif
// ------------------------------ Load 32
// In the following load functions, |a| is deliberately left uninitialized: it
// is a required argument of the lane-load intrinsic, but only the lane we
// overwrite matters, and initializing it would add needless overhead.
HWY_INLINE Vec128<uint8_t, 4> LoadU(Simd<uint8_t, 4> d,
const uint8_t* HWY_RESTRICT p) {
uint32x2_t a = Undefined(d).raw;
uint32x2_t b = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), a, 0);
return Vec128<uint8_t, 4>(vreinterpret_u8_u32(b));
}
HWY_INLINE Vec128<uint16_t, 2> LoadU(Simd<uint16_t, 2> d,
const uint16_t* HWY_RESTRICT p) {
uint32x2_t a = Undefined(d).raw;
uint32x2_t b = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), a, 0);
return Vec128<uint16_t, 2>(vreinterpret_u16_u32(b));
}
HWY_INLINE Vec128<uint32_t, 1> LoadU(Simd<uint32_t, 1> d,
const uint32_t* HWY_RESTRICT p) {
uint32x2_t a = Undefined(d).raw;
uint32x2_t b = vld1_lane_u32(p, a, 0);
return Vec128<uint32_t, 1>(b);
}
HWY_INLINE Vec128<int8_t, 4> LoadU(Simd<int8_t, 4> d,
const int8_t* HWY_RESTRICT p) {
int32x2_t a = Undefined(d).raw;
int32x2_t b = vld1_lane_s32(reinterpret_cast<const int32_t*>(p), a, 0);
return Vec128<int8_t, 4>(vreinterpret_s8_s32(b));
}
HWY_INLINE Vec128<int16_t, 2> LoadU(Simd<int16_t, 2> d,
const int16_t* HWY_RESTRICT p) {
int32x2_t a = Undefined(d).raw;
int32x2_t b = vld1_lane_s32(reinterpret_cast<const int32_t*>(p), a, 0);
return Vec128<int16_t, 2>(vreinterpret_s16_s32(b));
}
HWY_INLINE Vec128<int32_t, 1> LoadU(Simd<int32_t, 1> d,
const int32_t* HWY_RESTRICT p) {
int32x2_t a = Undefined(d).raw;
int32x2_t b = vld1_lane_s32(p, a, 0);
return Vec128<int32_t, 1>(b);
}
HWY_INLINE Vec128<float, 1> LoadU(Simd<float, 1> d,
const float* HWY_RESTRICT p) {
float32x2_t a = Undefined(d).raw;
float32x2_t b = vld1_lane_f32(p, a, 0);
return Vec128<float, 1>(b);
}
// ------------------------------ Load 16
HWY_INLINE Vec128<uint8_t, 2> LoadU(Simd<uint8_t, 2> d,
const uint8_t* HWY_RESTRICT p) {
uint16x4_t a = Undefined(d).raw;
uint16x4_t b = vld1_lane_u16(reinterpret_cast<const uint16_t*>(p), a, 0);
return Vec128<uint8_t, 2>(vreinterpret_u8_u16(b));
}
HWY_INLINE Vec128<uint16_t, 1> LoadU(Simd<uint16_t, 1> d,
const uint16_t* HWY_RESTRICT p) {
uint16x4_t a = Undefined(d).raw;
uint16x4_t b = vld1_lane_u16(p, a, 0);
return Vec128<uint16_t, 1>(b);
}
HWY_INLINE Vec128<int8_t, 2> LoadU(Simd<int8_t, 2> d,
const int8_t* HWY_RESTRICT p) {
int16x4_t a = Undefined(d).raw;
int16x4_t b = vld1_lane_s16(reinterpret_cast<const int16_t*>(p), a, 0);
return Vec128<int8_t, 2>(vreinterpret_s8_s16(b));
}
HWY_INLINE Vec128<int16_t, 1> LoadU(Simd<int16_t, 1> d,
const int16_t* HWY_RESTRICT p) {
int16x4_t a = Undefined(d).raw;
int16x4_t b = vld1_lane_s16(p, a, 0);
return Vec128<int16_t, 1>(b);
}
// ------------------------------ Load 8
HWY_INLINE Vec128<uint8_t, 1> LoadU(Simd<uint8_t, 1> d,
const uint8_t* HWY_RESTRICT p) {
uint8x8_t a = Undefined(d).raw;
uint8x8_t b = vld1_lane_u8(p, a, 0);
return Vec128<uint8_t, 1>(b);
}
HWY_INLINE Vec128<int8_t, 1> LoadU(Simd<int8_t, 1> d,
const int8_t* HWY_RESTRICT p) {
int8x8_t a = Undefined(d).raw;
int8x8_t b = vld1_lane_s8(p, a, 0);
return Vec128<int8_t, 1>(b);
}
// On ARM, Load is the same as LoadU.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Load(Simd<T, N> d, const T* HWY_RESTRICT p) {
return LoadU(d, p);
}
// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Vec128<T, N> LoadDup128(Simd<T, N> d,
const T* const HWY_RESTRICT p) {
return LoadU(d, p);
}
// ------------------------------ Store 128
HWY_INLINE void StoreU(const Vec128<uint8_t> v, Full128<uint8_t> /* tag */,
uint8_t* HWY_RESTRICT aligned) {
vst1q_u8(aligned, v.raw);
}
HWY_INLINE void StoreU(const Vec128<uint16_t> v, Full128<uint16_t> /* tag */,
uint16_t* HWY_RESTRICT aligned) {
vst1q_u16(aligned, v.raw);
}
HWY_INLINE void StoreU(const Vec128<uint32_t> v, Full128<uint32_t> /* tag */,
uint32_t* HWY_RESTRICT aligned) {
vst1q_u32(aligned, v.raw);
}
HWY_INLINE void StoreU(const Vec128<uint64_t> v, Full128<uint64_t> /* tag */,
uint64_t* HWY_RESTRICT aligned) {
vst1q_u64(aligned, v.raw);
}
HWY_INLINE void StoreU(const Vec128<int8_t> v, Full128<int8_t> /* tag */,
int8_t* HWY_RESTRICT aligned) {
vst1q_s8(aligned, v.raw);
}
HWY_INLINE void StoreU(const Vec128<int16_t> v, Full128<int16_t> /* tag */,
int16_t* HWY_RESTRICT aligned) {
vst1q_s16(aligned, v.raw);
}
HWY_INLINE void StoreU(const Vec128<int32_t> v, Full128<int32_t> /* tag */,
int32_t* HWY_RESTRICT aligned) {
vst1q_s32(aligned, v.raw);
}
HWY_INLINE void StoreU(const Vec128<int64_t> v, Full128<int64_t> /* tag */,
int64_t* HWY_RESTRICT aligned) {
vst1q_s64(aligned, v.raw);
}
HWY_INLINE void StoreU(const Vec128<float> v, Full128<float> /* tag */,
float* HWY_RESTRICT aligned) {
vst1q_f32(aligned, v.raw);
}
#if defined(__aarch64__)
HWY_INLINE void StoreU(const Vec128<double> v, Full128<double> /* tag */,
double* HWY_RESTRICT aligned) {
vst1q_f64(aligned, v.raw);
}
#endif
// ------------------------------ Store 64
HWY_INLINE void StoreU(const Vec128<uint8_t, 8> v, Simd<uint8_t, 8> /* tag */,
uint8_t* HWY_RESTRICT p) {
vst1_u8(p, v.raw);
}
HWY_INLINE void StoreU(const Vec128<uint16_t, 4> v, Simd<uint16_t, 4> /* tag */,
uint16_t* HWY_RESTRICT p) {
vst1_u16(p, v.raw);
}
HWY_INLINE void StoreU(const Vec128<uint32_t, 2> v, Simd<uint32_t, 2> /* tag */,
uint32_t* HWY_RESTRICT p) {
vst1_u32(p, v.raw);
}
HWY_INLINE void StoreU(const Vec128<uint64_t, 1> v, Simd<uint64_t, 1> /* tag */,
uint64_t* HWY_RESTRICT p) {
vst1_u64(p, v.raw);
}
HWY_INLINE void StoreU(const Vec128<int8_t, 8> v, Simd<int8_t, 8> /* tag */,
int8_t* HWY_RESTRICT p) {
vst1_s8(p, v.raw);
}
HWY_INLINE void StoreU(const Vec128<int16_t, 4> v, Simd<int16_t, 4> /* tag */,
int16_t* HWY_RESTRICT p) {
vst1_s16(p, v.raw);
}
HWY_INLINE void StoreU(const Vec128<int32_t, 2> v, Simd<int32_t, 2> /* tag */,
int32_t* HWY_RESTRICT p) {
vst1_s32(p, v.raw);
}
HWY_INLINE void StoreU(const Vec128<int64_t, 1> v, Simd<int64_t, 1> /* tag */,
int64_t* HWY_RESTRICT p) {
vst1_s64(p, v.raw);
}
HWY_INLINE void StoreU(const Vec128<float, 2> v, Simd<float, 2> /* tag */,
float* HWY_RESTRICT p) {
vst1_f32(p, v.raw);
}
#if defined(__aarch64__)
HWY_INLINE void StoreU(const Vec128<double, 1> v, Simd<double, 1> /* tag */,
double* HWY_RESTRICT p) {
vst1_f64(p, v.raw);
}
#endif
// ------------------------------ Store 32
HWY_INLINE void StoreU(const Vec128<uint8_t, 4> v, Simd<uint8_t, 4>,
uint8_t* HWY_RESTRICT p) {
uint32x2_t a = vreinterpret_u32_u8(v.raw);
vst1_lane_u32(p, a, 0);
}
HWY_INLINE void StoreU(const Vec128<uint16_t, 2> v, Simd<uint16_t, 2>,
uint16_t* HWY_RESTRICT p) {
uint32x2_t a = vreinterpret_u32_u16(v.raw);
vst1_lane_u32(p, a, 0);
}
HWY_INLINE void StoreU(const Vec128<uint32_t, 1> v, Simd<uint32_t, 1>,
uint32_t* HWY_RESTRICT p) {
vst1_lane_u32(p, v.raw, 0);
}
HWY_INLINE void StoreU(const Vec128<int8_t, 4> v, Simd<int8_t, 4>,
int8_t* HWY_RESTRICT p) {
int32x2_t a = vreinterpret_s32_s8(v.raw);
vst1_lane_s32(p, a, 0);
}
HWY_INLINE void StoreU(const Vec128<int16_t, 2> v, Simd<int16_t, 2>,
int16_t* HWY_RESTRICT p) {
int32x2_t a = vreinterpret_s32_s16(v.raw);
vst1_lane_s32(p, a, 0);
}
HWY_INLINE void StoreU(const Vec128<int32_t, 1> v, Simd<int32_t, 1>,
int32_t* HWY_RESTRICT p) {
vst1_lane_s32(p, v.raw, 0);
}
HWY_INLINE void StoreU(const Vec128<float, 1> v, Simd<float, 1>,
float* HWY_RESTRICT p) {
vst1_lane_f32(p, v.raw, 0);
}
// ------------------------------ Store 16
HWY_INLINE void StoreU(const Vec128<uint8_t, 2> v, Simd<uint8_t, 2>,
uint8_t* HWY_RESTRICT p) {
uint16x4_t a = vreinterpret_u16_u8(v.raw);
vst1_lane_u16(p, a, 0);
}
HWY_INLINE void StoreU(const Vec128<uint16_t, 1> v, Simd<uint16_t, 1>,
uint16_t* HWY_RESTRICT p) {
vst1_lane_u16(p, v.raw, 0);
}
HWY_INLINE void StoreU(const Vec128<int8_t, 2> v, Simd<int8_t, 2>,
int8_t* HWY_RESTRICT p) {
int16x4_t a = vreinterpret_s16_s8(v.raw);
vst1_lane_s16(p, a, 0);
}
HWY_INLINE void StoreU(const Vec128<int16_t, 1> v, Simd<int16_t, 1>,
int16_t* HWY_RESTRICT p) {
vst1_lane_s16(p, v.raw, 0);
}
// ------------------------------ Store 8
HWY_INLINE void StoreU(const Vec128<uint8_t, 1> v, Simd<uint8_t, 1>,
uint8_t* HWY_RESTRICT p) {
vst1_lane_u8(p, v.raw, 0);
}
HWY_INLINE void StoreU(const Vec128<int8_t, 1> v, Simd<int8_t, 1>,
int8_t* HWY_RESTRICT p) {
vst1_lane_s8(p, v.raw, 0);
}
// On ARM, Store is the same as StoreU.
template <typename T, size_t N>
HWY_INLINE void Store(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT p) {
StoreU(v, d, p);
}
// ------------------------------ Non-temporal stores
// Same as aligned stores on non-x86.
template <typename T, size_t N>
HWY_INLINE void Stream(const Vec128<T, N> v, Simd<T, N> d,
T* HWY_RESTRICT aligned) {
Store(v, d, aligned);
}
// ================================================== CONVERT
// ------------------------------ Promotions (part w/ narrow lanes -> full)
// Unsigned: zero-extend to full vector.
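// vmovl_* zero-extends each lane to twice its width; promoting u8 directly to
// u32 therefore widens twice, via an intermediate u16 vector.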
HWY_INLINE Vec128<uint16_t> PromoteTo(Full128<uint16_t> /* tag */,
const Vec128<uint8_t, 8> v) {
return Vec128<uint16_t>(vmovl_u8(v.raw));
}
HWY_INLINE Vec128<uint32_t> PromoteTo(Full128<uint32_t> /* tag */,
const Vec128<uint8_t, 4> v) {
uint16x8_t a = vmovl_u8(v.raw);
return Vec128<uint32_t>(vmovl_u16(vget_low_u16(a)));
}
HWY_INLINE Vec128<uint32_t> PromoteTo(Full128<uint32_t> /* tag */,
const Vec128<uint16_t, 4> v) {
return Vec128<uint32_t>(vmovl_u16(v.raw));
}
HWY_INLINE Vec128<uint64_t> PromoteTo(Full128<uint64_t> /* tag */,
const Vec128<uint32_t, 2> v) {
return Vec128<uint64_t>(vmovl_u32(v.raw));
}
HWY_INLINE Vec128<int16_t> PromoteTo(Full128<int16_t> /* tag */,
const Vec128<uint8_t, 8> v) {
return Vec128<int16_t>(vmovl_u8(v.raw));
}
HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
const Vec128<uint8_t, 4> v) {
uint16x8_t a = vmovl_u8(v.raw);
return Vec128<int32_t>(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(a))));
}
HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
const Vec128<uint16_t, 4> v) {
return Vec128<int32_t>(vmovl_u16(v.raw));
}
// Unsigned: zero-extend to half vector.
template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_INLINE Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw)));
}
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_INLINE Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
const Vec128<uint8_t, N> v) {
uint16x8_t a = vmovl_u8(v.raw);
return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(vget_low_u16(a))));
}
template <size_t N>
HWY_INLINE Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
const Vec128<uint16_t, N> v) {
return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(v.raw)));
}
template <size_t N, HWY_IF_LE64(uint64_t, N)>
HWY_INLINE Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N> /* tag */,
const Vec128<uint32_t, N> v) {
return Vec128<uint64_t, N>(vget_low_u64(vmovl_u32(v.raw)));
}
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_INLINE Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<int16_t, N>(vget_low_s16(vmovl_u8(v.raw)));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
const Vec128<uint8_t, N> v) {
uint16x8_t a = vmovl_u8(v.raw);
uint32x4_t b = vmovl_u16(vget_low_u16(a));
return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(b)));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
const Vec128<uint16_t, N> v) {
uint32x4_t a = vmovl_u16(v.raw);
return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(a)));
}
HWY_INLINE Vec128<uint32_t> U32FromU8(const Vec128<uint8_t> v) {
return Vec128<uint32_t>(
vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v.raw)))));
}
// Signed: replicate sign bit to full vector.
HWY_INLINE Vec128<int16_t> PromoteTo(Full128<int16_t> /* tag */,
const Vec128<int8_t, 8> v) {
return Vec128<int16_t>(vmovl_s8(v.raw));
}
HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
const Vec128<int8_t, 4> v) {
int16x8_t a = vmovl_s8(v.raw);
return Vec128<int32_t>(vmovl_s16(vget_low_s16(a)));
}
HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
const Vec128<int16_t, 4> v) {
return Vec128<int32_t>(vmovl_s16(v.raw));
}
HWY_INLINE Vec128<int64_t> PromoteTo(Full128<int64_t> /* tag */,
const Vec128<int32_t, 2> v) {
return Vec128<int64_t>(vmovl_s32(v.raw));
}
// Signed: replicate sign bit to half vector.
template <size_t N>
HWY_INLINE Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
const Vec128<int8_t, N> v) {
return Vec128<int16_t, N>(vget_low_s16(vmovl_s8(v.raw)));
}
template <size_t N>
HWY_INLINE Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
const Vec128<int8_t, N> v) {
int16x8_t a = vmovl_s8(v.raw);
int32x4_t b = vmovl_s16(vget_low_s16(a));
return Vec128<int32_t, N>(vget_low_s32(b));
}
template <size_t N>
HWY_INLINE Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
const Vec128<int16_t, N> v) {
return Vec128<int32_t, N>(vget_low_s32(vmovl_s16(v.raw)));
}
template <size_t N>
HWY_INLINE Vec128<int64_t, N> PromoteTo(Simd<int64_t, N> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<int64_t, N>(vget_low_s64(vmovl_s32(v.raw)));
}
#if defined(__aarch64__)
HWY_INLINE Vec128<double> PromoteTo(Full128<double> /* tag */,
const Vec128<float, 2> v) {
return Vec128<double>(vcvt_f64_f32(v.raw));
}
HWY_INLINE Vec128<double, 1> PromoteTo(Simd<double, 1> /* tag */,
const Vec128<float, 1> v) {
return Vec128<double, 1>(vget_low_f64(vcvt_f64_f32(v.raw)));
}
HWY_INLINE Vec128<double> PromoteTo(Full128<double> /* tag */,
const Vec128<int32_t, 2> v) {
const int64x2_t i64 = vmovl_s32(v.raw);
return Vec128<double>(vcvtq_f64_s64(i64));
}
HWY_INLINE Vec128<double, 1> PromoteTo(Simd<double, 1> /* tag */,
const Vec128<int32_t, 1> v) {
const int64x1_t i64 = vget_low_s64(vmovl_s32(v.raw));
return Vec128<double, 1>(vcvt_f64_s64(i64));
}
#endif
// ------------------------------ Demotions (full -> part w/ narrow lanes)
// From full vector to half or quarter
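// vqmovn (signed) and vqmovun (signed -> unsigned) narrow with saturation.
// Two-step demotions such as i32 -> u8 combine the intermediate half vector
// with itself so the 128-bit ("q") narrowing instruction can be reused.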
HWY_INLINE Vec128<uint16_t, 4> DemoteTo(Simd<uint16_t, 4> /* tag */,
const Vec128<int32_t> v) {
return Vec128<uint16_t, 4>(vqmovun_s32(v.raw));
}
HWY_INLINE Vec128<int16_t, 4> DemoteTo(Simd<int16_t, 4> /* tag */,
const Vec128<int32_t> v) {
return Vec128<int16_t, 4>(vqmovn_s32(v.raw));
}
HWY_INLINE Vec128<uint8_t, 4> DemoteTo(Simd<uint8_t, 4> /* tag */,
const Vec128<int32_t> v) {
const uint16x4_t a = vqmovun_s32(v.raw);
return Vec128<uint8_t, 4>(vqmovn_u16(vcombine_u16(a, a)));
}
HWY_INLINE Vec128<uint8_t, 8> DemoteTo(Simd<uint8_t, 8> /* tag */,
const Vec128<int16_t> v) {
return Vec128<uint8_t, 8>(vqmovun_s16(v.raw));
}
HWY_INLINE Vec128<int8_t, 4> DemoteTo(Simd<int8_t, 4> /* tag */,
const Vec128<int32_t> v) {
const int16x4_t a = vqmovn_s32(v.raw);
return Vec128<int8_t, 4>(vqmovn_s16(vcombine_s16(a, a)));
}
HWY_INLINE Vec128<int8_t, 8> DemoteTo(Simd<int8_t, 8> /* tag */,
const Vec128<int16_t> v) {
return Vec128<int8_t, 8>(vqmovn_s16(v.raw));
}
// From half vector to partial half
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<uint16_t, N>(vqmovun_s32(vcombine_s32(v.raw, v.raw)));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int16_t, N> DemoteTo(Simd<int16_t, N> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<int16_t, N>(vqmovn_s32(vcombine_s32(v.raw, v.raw)));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
const Vec128<int32_t, N> v) {
const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw));
return Vec128<uint8_t, N>(vqmovn_u16(vcombine_u16(a, a)));
}
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_INLINE Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
const Vec128<int16_t, N> v) {
return Vec128<uint8_t, N>(vqmovun_s16(vcombine_s16(v.raw, v.raw)));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
const Vec128<int32_t, N> v) {
const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw));
return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(a, a)));
}
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_INLINE Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
const Vec128<int16_t, N> v) {
return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(v.raw, v.raw)));
}
#if defined(__aarch64__)
HWY_INLINE Vec128<float, 2> DemoteTo(Simd<float, 2> /* tag */,
const Vec128<double> v) {
return Vec128<float, 2>(vcvt_f32_f64(v.raw));
}
HWY_INLINE Vec128<float, 1> DemoteTo(Simd<float, 1> /* tag */,
const Vec128<double, 1> v) {
return Vec128<float, 1>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
}
HWY_INLINE Vec128<int32_t, 2> DemoteTo(Simd<int32_t, 2> /* tag */,
const Vec128<double> v) {
const int64x2_t i64 = vcvtq_s64_f64(v.raw);
return Vec128<int32_t, 2>(vqmovn_s64(i64));
}
HWY_INLINE Vec128<int32_t, 1> DemoteTo(Simd<int32_t, 1> /* tag */,
const Vec128<double, 1> v) {
const int64x1_t i64 = vcvt_s64_f64(v.raw);
// There is no i64x1 -> i32x1 narrow, so expand to int64x2_t first.
const int64x2_t i64x2 = vcombine_s64(i64, i64);
return Vec128<int32_t, 1>(vqmovn_s64(i64x2));
}
#endif
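// vuzp1q_u8 keeps the even-indexed bytes of its (identical) inputs; applying
// it twice leaves the least-significant byte of each 32-bit lane in the first
// four bytes, i.e. a truncating u32 -> u8 conversion.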
HWY_INLINE Vec128<uint8_t, 4> U8FromU32(const Vec128<uint32_t> v) {
const uint8x16_t org_v = cast_to_u8(v).raw;
const uint8x16_t w = vuzp1q_u8(org_v, org_v);
return Vec128<uint8_t, 4>(vget_low_u8(vuzp1q_u8(w, w)));
}
// In the following DemoteTo functions, |b| is purposely undefined.
// |a| must be extended to 128 bits so that vqmovn can be used; leaving |b|
// undefined avoids the overhead of initializing the unused upper half.
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
template <size_t N>
HWY_INLINE Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
const Vec128<int32_t> v) {
Vec128<uint16_t, N> a = DemoteTo(Simd<uint16_t, N>(), v);
Vec128<uint16_t, N> b;
uint16x8_t c = vcombine_u16(a.raw, b.raw);
return Vec128<uint8_t, N>(vqmovn_u16(c));
}
template <size_t N>
HWY_INLINE Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
const Vec128<int32_t> v) {
Vec128<int16_t, N> a = DemoteTo(Simd<int16_t, N>(), v);
Vec128<int16_t, N> b;
int16x8_t c = vcombine_s16(a.raw, b.raw);
return Vec128<int8_t, N>(vqmovn_s16(c));
}
HWY_DIAGNOSTICS(pop)
// ------------------------------ Convert integer <=> floating-point
HWY_INLINE Vec128<float> ConvertTo(Full128<float> /* tag */,
const Vec128<int32_t> v) {
return Vec128<float>(vcvtq_f32_s32(v.raw));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<float, N> ConvertTo(Simd<float, N> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<float, N>(vcvt_f32_s32(v.raw));
}
// Truncates (rounds toward zero).
HWY_INLINE Vec128<int32_t> ConvertTo(Full128<int32_t> /* tag */,
const Vec128<float> v) {
return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
}
template <size_t N, HWY_IF_LE64(float, N)>
HWY_INLINE Vec128<int32_t, N> ConvertTo(Simd<int32_t, N> /* tag */,
const Vec128<float, N> v) {
return Vec128<int32_t, N>(vcvt_s32_f32(v.raw));
}
#if defined(__aarch64__)
HWY_INLINE Vec128<double> ConvertTo(Full128<double> /* tag */,
const Vec128<int64_t> v) {
return Vec128<double>(vcvtq_f64_s64(v.raw));
}
HWY_INLINE Vec128<double, 1> ConvertTo(Simd<double, 1> /* tag */,
const Vec128<int64_t, 1> v) {
return Vec128<double, 1>(vcvt_f64_s64(v.raw));
}
// Truncates (rounds toward zero).
HWY_INLINE Vec128<int64_t> ConvertTo(Full128<int64_t> /* tag */,
const Vec128<double> v) {
return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
}
HWY_INLINE Vec128<int64_t, 1> ConvertTo(Simd<int64_t, 1> /* tag */,
const Vec128<double, 1> v) {
return Vec128<int64_t, 1>(vcvt_s64_f64(v.raw));
}
HWY_INLINE Vec128<int32_t> NearestInt(const Vec128<float> v) {
return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
}
template <size_t N, HWY_IF_LE64(float, N)>
HWY_INLINE Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
}
#else
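// ARMv7 lacks vcvtn (convert with rounding to nearest), so round to the
// nearest integer first and then use the truncating conversion.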
template <size_t N>
HWY_INLINE Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
return ConvertTo(Simd<int32_t, N>(), Round(v));
}
#endif
// ================================================== SWIZZLE
// ------------------------------ Extract lane
HWY_INLINE uint8_t GetLane(const Vec128<uint8_t, 16> v) {
return vget_lane_u8(vget_low_u8(v.raw), 0);
}
template <size_t N>
HWY_INLINE uint8_t GetLane(const Vec128<uint8_t, N> v) {
return vget_lane_u8(v.raw, 0);
}
HWY_INLINE int8_t GetLane(const Vec128<int8_t, 16> v) {
return vget_lane_s8(vget_low_s8(v.raw), 0);
}
template <size_t N>
HWY_INLINE int8_t GetLane(const Vec128<int8_t, N> v) {
return vget_lane_s8(v.raw, 0);
}
HWY_INLINE uint16_t GetLane(const Vec128<uint16_t, 8> v) {
return vget_lane_u16(vget_low_u16(v.raw), 0);
}
template <size_t N>
HWY_INLINE uint16_t GetLane(const Vec128<uint16_t, N> v) {
return vget_lane_u16(v.raw, 0);
}
HWY_INLINE int16_t GetLane(const Vec128<int16_t, 8> v) {
return vget_lane_s16(vget_low_s16(v.raw), 0);
}
template <size_t N>
HWY_INLINE int16_t GetLane(const Vec128<int16_t, N> v) {
return vget_lane_s16(v.raw, 0);
}
HWY_INLINE uint32_t GetLane(const Vec128<uint32_t, 4> v) {
return vget_lane_u32(vget_low_u32(v.raw), 0);
}
template <size_t N>
HWY_INLINE uint32_t GetLane(const Vec128<uint32_t, N> v) {
return vget_lane_u32(v.raw, 0);
}
HWY_INLINE int32_t GetLane(const Vec128<int32_t, 4> v) {
return vget_lane_s32(vget_low_s32(v.raw), 0);
}
template <size_t N>
HWY_INLINE int32_t GetLane(const Vec128<int32_t, N> v) {
return vget_lane_s32(v.raw, 0);
}
HWY_INLINE uint64_t GetLane(const Vec128<uint64_t, 2> v) {
return vget_lane_u64(vget_low_u64(v.raw), 0);
}
HWY_INLINE uint64_t GetLane(const Vec128<uint64_t, 1> v) {
return vget_lane_u64(v.raw, 0);
}
HWY_INLINE int64_t GetLane(const Vec128<int64_t, 2> v) {
return vget_lane_s64(vget_low_s64(v.raw), 0);
}
HWY_INLINE int64_t GetLane(const Vec128<int64_t, 1> v) {
return vget_lane_s64(v.raw, 0);
}
HWY_INLINE float GetLane(const Vec128<float, 4> v) {
return vget_lane_f32(vget_low_f32(v.raw), 0);
}
HWY_INLINE float GetLane(const Vec128<float, 2> v) {
return vget_lane_f32(v.raw, 0);
}
HWY_INLINE float GetLane(const Vec128<float, 1> v) {
return vget_lane_f32(v.raw, 0);
}
#if defined(__aarch64__)
HWY_INLINE double GetLane(const Vec128<double, 2> v) {
return vget_lane_f64(vget_low_f64(v.raw), 0);
}
HWY_INLINE double GetLane(const Vec128<double, 1> v) {
return vget_lane_f64(v.raw, 0);
}
#endif
// ------------------------------ Extract half
// <= 64 bit: just return different type
template <typename T, size_t N, HWY_IF_LE64(uint8_t, N)>
HWY_INLINE Vec128<T, N / 2> LowerHalf(const Vec128<T, N> v) {
return Vec128<T, N / 2>(v.raw);
}
HWY_INLINE Vec128<uint8_t, 8> LowerHalf(const Vec128<uint8_t> v) {
return Vec128<uint8_t, 8>(vget_low_u8(v.raw));
}
HWY_INLINE Vec128<uint16_t, 4> LowerHalf(const Vec128<uint16_t> v) {
return Vec128<uint16_t, 4>(vget_low_u16(v.raw));
}
HWY_INLINE Vec128<uint32_t, 2> LowerHalf(const Vec128<uint32_t> v) {
return Vec128<uint32_t, 2>(vget_low_u32(v.raw));
}
HWY_INLINE Vec128<uint64_t, 1> LowerHalf(const Vec128<uint64_t> v) {
return Vec128<uint64_t, 1>(vget_low_u64(v.raw));
}
HWY_INLINE Vec128<int8_t, 8> LowerHalf(const Vec128<int8_t> v) {
return Vec128<int8_t, 8>(vget_low_s8(v.raw));
}
HWY_INLINE Vec128<int16_t, 4> LowerHalf(const Vec128<int16_t> v) {
return Vec128<int16_t, 4>(vget_low_s16(v.raw));
}
HWY_INLINE Vec128<int32_t, 2> LowerHalf(const Vec128<int32_t> v) {
return Vec128<int32_t, 2>(vget_low_s32(v.raw));
}
HWY_INLINE Vec128<int64_t, 1> LowerHalf(const Vec128<int64_t> v) {
return Vec128<int64_t, 1>(vget_low_s64(v.raw));
}
HWY_INLINE Vec128<float, 2> LowerHalf(const Vec128<float> v) {
return Vec128<float, 2>(vget_low_f32(v.raw));
}
#if defined(__aarch64__)
HWY_INLINE Vec128<double, 1> LowerHalf(const Vec128<double> v) {
return Vec128<double, 1>(vget_low_f64(v.raw));
}
#endif
HWY_INLINE Vec128<uint8_t, 8> UpperHalf(const Vec128<uint8_t> v) {
return Vec128<uint8_t, 8>(vget_high_u8(v.raw));
}
HWY_INLINE Vec128<uint16_t, 4> UpperHalf(const Vec128<uint16_t> v) {
return Vec128<uint16_t, 4>(vget_high_u16(v.raw));
}
HWY_INLINE Vec128<uint32_t, 2> UpperHalf(const Vec128<uint32_t> v) {
return Vec128<uint32_t, 2>(vget_high_u32(v.raw));
}
HWY_INLINE Vec128<uint64_t, 1> UpperHalf(const Vec128<uint64_t> v) {
return Vec128<uint64_t, 1>(vget_high_u64(v.raw));
}
HWY_INLINE Vec128<int8_t, 8> UpperHalf(const Vec128<int8_t> v) {
return Vec128<int8_t, 8>(vget_high_s8(v.raw));
}
HWY_INLINE Vec128<int16_t, 4> UpperHalf(const Vec128<int16_t> v) {
return Vec128<int16_t, 4>(vget_high_s16(v.raw));
}
HWY_INLINE Vec128<int32_t, 2> UpperHalf(const Vec128<int32_t> v) {
return Vec128<int32_t, 2>(vget_high_s32(v.raw));
}
HWY_INLINE Vec128<int64_t, 1> UpperHalf(const Vec128<int64_t> v) {
return Vec128<int64_t, 1>(vget_high_s64(v.raw));
}
HWY_INLINE Vec128<float, 2> UpperHalf(const Vec128<float> v) {
return Vec128<float, 2>(vget_high_f32(v.raw));
}
#if defined(__aarch64__)
HWY_INLINE Vec128<double, 1> UpperHalf(const Vec128<double> v) {
return Vec128<double, 1>(vget_high_f64(v.raw));
}
#endif
// ------------------------------ Extract from 2x 128-bit at constant offset
// Extracts 128 bits from <hi, lo> by skipping the least-significant kBytes.
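// E.g. for 32-bit lanes and kBytes = 4, the result lanes (most to least
// significant) are hi0, lo3, lo2, lo1.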
template <int kBytes, typename T>
HWY_INLINE Vec128<T> CombineShiftRightBytes(const Vec128<T> hi,
const Vec128<T> lo) {
static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
const Full128<uint8_t> d8;
return BitCast(Full128<T>(),
Vec128<uint8_t>(vextq_u8(BitCast(d8, lo).raw,
BitCast(d8, hi).raw, kBytes)));
}
// ------------------------------ Shift vector by constant #bytes
namespace impl {
// Need to partially specialize because CombineShiftRightBytes<16> and <0> are
// compile errors.
template <int kBytes>
struct ShiftLeftBytesT {
template <class T, size_t N>
HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
return CombineShiftRightBytes<16 - kBytes>(v, Zero(Full128<T>()));
}
};
template <>
struct ShiftLeftBytesT<0> {
template <class T, size_t N>
HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
return v;
}
};
template <int kBytes>
struct ShiftRightBytesT {
template <class T, size_t N>
HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
return CombineShiftRightBytes<kBytes>(Zero(Full128<T>()), v);
}
};
template <>
struct ShiftRightBytesT<0> {
template <class T, size_t N>
HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
return v;
}
};
} // namespace impl
// 0x01..0F, kBytes = 1 => 0x02..0F00
template <int kBytes, typename T, size_t N>
HWY_INLINE Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
return impl::ShiftLeftBytesT<kBytes>()(v);
}
template <int kLanes, typename T, size_t N>
HWY_INLINE Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
const Simd<uint8_t, N * sizeof(T)> d8;
const Simd<T, N> d;
return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}
// 0x01..0F, kBytes = 1 => 0x0001..0E
template <int kBytes, typename T, size_t N>
HWY_INLINE Vec128<T, N> ShiftRightBytes(const Vec128<T, N> v) {
return impl::ShiftRightBytesT<kBytes>()(v);
}
template <int kLanes, typename T, size_t N>
HWY_INLINE Vec128<T, N> ShiftRightLanes(const Vec128<T, N> v) {
const Simd<uint8_t, N * sizeof(T)> d8;
const Simd<T, N> d;
return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}
// ------------------------------ Broadcast/splat any lane
#if defined(__aarch64__)
// Unsigned
template <int kLane>
HWY_INLINE Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_INLINE Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
}
template <int kLane>
HWY_INLINE Vec128<uint32_t> Broadcast(const Vec128<uint32_t> v) {
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_INLINE Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
}
template <int kLane>
HWY_INLINE Vec128<uint64_t> Broadcast(const Vec128<uint64_t> v) {
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
}
// Vec128<uint64_t, 1> is defined below.
// Signed
template <int kLane>
HWY_INLINE Vec128<int16_t> Broadcast(const Vec128<int16_t> v) {
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
HWY_INLINE Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
}
template <int kLane>
HWY_INLINE Vec128<int32_t> Broadcast(const Vec128<int32_t> v) {
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
}
template <int kLane>
HWY_INLINE Vec128<int64_t> Broadcast(const Vec128<int64_t> v) {
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
}
// Vec128<int64_t, 1> is defined below.
// Float
template <int kLane>
HWY_INLINE Vec128<float> Broadcast(const Vec128<float> v) {
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_LE64(float, N)>
HWY_INLINE Vec128<float, N> Broadcast(const Vec128<float, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
}
template <int kLane>
HWY_INLINE Vec128<double> Broadcast(const Vec128<double> v) {
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
}
template <int kLane>
HWY_INLINE Vec128<double, 1> Broadcast(const Vec128<double, 1> v) {
static_assert(0 <= kLane && kLane < 1, "Invalid lane");
return v;
}
#else
// No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*.
// Unsigned
template <int kLane>
HWY_INLINE Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
return Vec128<uint16_t>(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_INLINE Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
}
template <int kLane>
HWY_INLINE Vec128<uint32_t> Broadcast(const Vec128<uint32_t> v) {
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
return Vec128<uint32_t>(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_INLINE Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
}
template <int kLane>
HWY_INLINE Vec128<uint64_t> Broadcast(const Vec128<uint64_t> v) {
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
return Vec128<uint64_t>(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane)));
}
// Vec128<uint64_t, 1> is defined below.
// Signed
template <int kLane>
HWY_INLINE Vec128<int16_t> Broadcast(const Vec128<int16_t> v) {
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
return Vec128<int16_t>(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
HWY_INLINE Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
}
template <int kLane>
HWY_INLINE Vec128<int32_t> Broadcast(const Vec128<int32_t> v) {
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
return Vec128<int32_t>(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
}
template <int kLane>
HWY_INLINE Vec128<int64_t> Broadcast(const Vec128<int64_t> v) {
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
return Vec128<int64_t>(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane)));
}
// Vec128<int64_t, 1> is defined below.
// Float
template <int kLane>
HWY_INLINE Vec128<float> Broadcast(const Vec128<float> v) {
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
return Vec128<float>(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_LE64(float, N)>
HWY_INLINE Vec128<float, N> Broadcast(const Vec128<float, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
}
#endif
template <int kLane>
HWY_INLINE Vec128<uint64_t, 1> Broadcast(const Vec128<uint64_t, 1> v) {
static_assert(0 <= kLane && kLane < 1, "Invalid lane");
return v;
}
template <int kLane>
HWY_INLINE Vec128<int64_t, 1> Broadcast(const Vec128<int64_t, 1> v) {
static_assert(0 <= kLane && kLane < 1, "Invalid lane");
return v;
}
// ------------------------------ Shuffle bytes with variable indices
// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes:
// either valid indices in [0, 16) or >= 0x80 to zero the i-th output byte.
template <typename T, typename TI>
HWY_INLINE Vec128<T> TableLookupBytes(const Vec128<T> bytes,
const Vec128<TI> from) {
const Full128<uint8_t> d8;
#if defined(__aarch64__)
return BitCast(Full128<T>(),
Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw,
BitCast(d8, from).raw)));
#else
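// ARMv7 lacks vqtbl1q_u8; vtbl2_u8 takes the table as two 64-bit halves and
// only 8 indices at a time, so look up the low and high index halves
// separately and recombine.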
uint8x16_t table0 = BitCast(d8, bytes).raw;
uint8x8x2_t table;
table.val[0] = vget_low_u8(table0);
table.val[1] = vget_high_u8(table0);
uint8x16_t idx = BitCast(d8, from).raw;
uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
return BitCast(Full128<T>(), Vec128<uint8_t>(vcombine_u8(low, hi)));
#endif
}
// ------------------------------ Hard-coded shuffles
// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
// Swap 32-bit halves in 64-bits
HWY_INLINE Vec128<uint32_t, 2> Shuffle2301(const Vec128<uint32_t, 2> v) {
return Vec128<uint32_t, 2>(vrev64_u32(v.raw));
}
HWY_INLINE Vec128<int32_t, 2> Shuffle2301(const Vec128<int32_t, 2> v) {
return Vec128<int32_t, 2>(vrev64_s32(v.raw));
}
HWY_INLINE Vec128<float, 2> Shuffle2301(const Vec128<float, 2> v) {
return Vec128<float, 2>(vrev64_f32(v.raw));
}
HWY_INLINE Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
return Vec128<uint32_t>(vrev64q_u32(v.raw));
}
HWY_INLINE Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
return Vec128<int32_t>(vrev64q_s32(v.raw));
}
HWY_INLINE Vec128<float> Shuffle2301(const Vec128<float> v) {
return Vec128<float>(vrev64q_f32(v.raw));
}
// Swap 64-bit halves
template <typename T>
HWY_INLINE Vec128<T> Shuffle1032(const Vec128<T> v) {
return CombineShiftRightBytes<8>(v, v);
}
template <typename T>
HWY_INLINE Vec128<T> Shuffle01(const Vec128<T> v) {
return CombineShiftRightBytes<8>(v, v);
}
// Rotate right 32 bits
template <typename T>
HWY_INLINE Vec128<T> Shuffle0321(const Vec128<T> v) {
return CombineShiftRightBytes<4>(v, v);
}
// Rotate left 32 bits
template <typename T>
HWY_INLINE Vec128<T> Shuffle2103(const Vec128<T> v) {
return CombineShiftRightBytes<12>(v, v);
}
// Reverse
template <typename T>
HWY_INLINE Vec128<T> Shuffle0123(const Vec128<T> v) {
static_assert(sizeof(T) == 4,
"Shuffle0123 should only be applied to 32-bit types");
// TODO(janwas): more efficient implementation? It is possible to use two
// instructions (vrev64q_u32 and vcombine_u32 of the swapped high/low parts)
// instead of the constant table in memory and the load.
static constexpr uint8_t bytes[16] = {12, 13, 14, 15, 8, 9, 10, 11,
4, 5, 6, 7, 0, 1, 2, 3};
return TableLookupBytes(v, Load(Full128<uint8_t>(), bytes));
}
// ------------------------------ Permute (runtime variable)
// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T>
struct Permute128 {
uint8x16_t raw;
};
template <typename T>
HWY_INLINE Permute128<T> SetTableIndices(const Full128<T>, const int32_t* idx) {
#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
const size_t N = 16 / sizeof(T);
for (size_t i = 0; i < N; ++i) {
HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
}
#endif
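// Expand each lane index into sizeof(T) consecutive byte indices for
// TableLookupBytes; e.g. with T = uint32_t, idx[0] = 2 yields control bytes
// 8, 9, 10, 11 for the first lane.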
const Full128<uint8_t> d8;
alignas(16) uint8_t control[16];
for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) {
const size_t idx_lane = idx_byte / sizeof(T);
const size_t mod = idx_byte % sizeof(T);
control[idx_byte] = idx[idx_lane] * sizeof(T) + mod;
}
return Permute128<T>{Load(d8, control).raw};
}
HWY_INLINE Vec128<uint32_t> TableLookupLanes(const Vec128<uint32_t> v,
const Permute128<uint32_t> idx) {
return TableLookupBytes(v, Vec128<uint8_t>(idx.raw));
}
HWY_INLINE Vec128<int32_t> TableLookupLanes(const Vec128<int32_t> v,
const Permute128<int32_t> idx) {
return TableLookupBytes(v, Vec128<uint8_t>(idx.raw));
}
HWY_INLINE Vec128<float> TableLookupLanes(const Vec128<float> v,
const Permute128<float> idx) {
const Full128<int32_t> di;
const Full128<float> df;
return BitCast(df,
TableLookupBytes(BitCast(di, v), Vec128<uint8_t>(idx.raw)));
}
// ------------------------------ Interleave lanes
// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
// the least-significant lane) and "b". To concatenate two half-width integers
// into one, use ZipLower/Upper instead (also works with scalar).
HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveLower, vzip1, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveLower, vzip1, _, 2)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2)
#if defined(__aarch64__)
// For 64-bit types, only the "q" version of the function is defined, since
// interleaving 64-bit registers that each hold a single 64-bit lane makes no
// sense.
HWY_INLINE Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
const Vec128<uint64_t> b) {
return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw));
}
HWY_INLINE Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
const Vec128<int64_t> b) {
return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw));
}
HWY_INLINE Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
const Vec128<uint64_t> b) {
return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw));
}
HWY_INLINE Vec128<int64_t> InterleaveUpper(const Vec128<int64_t> a,
const Vec128<int64_t> b) {
return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw));
}
#else
// ARMv7 emulation.
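// CombineShiftRightBytes<8>(x, x) swaps the 64-bit halves of x. Extracting
// the middle 128 bits of <b, swapped a> yields (bL, aL) for InterleaveLower;
// <swapped b, a> yields (bH, aH) for InterleaveUpper.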
HWY_INLINE Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
const Vec128<uint64_t> b) {
auto flip = CombineShiftRightBytes<8>(a, a);
return CombineShiftRightBytes<8>(b, flip);
}
HWY_INLINE Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
const Vec128<int64_t> b) {
auto flip = CombineShiftRightBytes<8>(a, a);
return CombineShiftRightBytes<8>(b, flip);
}
HWY_INLINE Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
const Vec128<uint64_t> b) {
auto flip = CombineShiftRightBytes<8>(b, b);
return CombineShiftRightBytes<8>(flip, a);
}
HWY_INLINE Vec128<int64_t> InterleaveUpper(const Vec128<int64_t> a,
const Vec128<int64_t> b) {
auto flip = CombineShiftRightBytes<8>(b, b);
return CombineShiftRightBytes<8>(flip, a);
}
#endif
// Floats
HWY_INLINE Vec128<float> InterleaveLower(const Vec128<float> a,
const Vec128<float> b) {
return Vec128<float>(vzip1q_f32(a.raw, b.raw));
}
#if defined(__aarch64__)
HWY_INLINE Vec128<double> InterleaveLower(const Vec128<double> a,
const Vec128<double> b) {
return Vec128<double>(vzip1q_f64(a.raw, b.raw));
}
#endif
HWY_INLINE Vec128<float> InterleaveUpper(const Vec128<float> a,
const Vec128<float> b) {
return Vec128<float>(vzip2q_f32(a.raw, b.raw));
}
#if defined(__aarch64__)
HWY_INLINE Vec128<double> InterleaveUpper(const Vec128<double> a,
const Vec128<double> b) {
return Vec128<double>(vzip2q_f64(a.raw, b.raw));
}
#endif
// ------------------------------ Zip lanes
// Same as InterleaveLower/Upper, except that the return lanes are double-width
// integers; this is necessary because the single-lane scalar cannot return two
// values.
// Full vectors
HWY_INLINE Vec128<uint16_t> ZipLower(const Vec128<uint8_t> a,
const Vec128<uint8_t> b) {
return Vec128<uint16_t>(vzip1q_u8(a.raw, b.raw));
}
HWY_INLINE Vec128<uint32_t> ZipLower(const Vec128<uint16_t> a,
const Vec128<uint16_t> b) {
return Vec128<uint32_t>(vzip1q_u16(a.raw, b.raw));
}
HWY_INLINE Vec128<uint64_t> ZipLower(const Vec128<uint32_t> a,
const Vec128<uint32_t> b) {
return Vec128<uint64_t>(vzip1q_u32(a.raw, b.raw));
}
HWY_INLINE Vec128<int16_t> ZipLower(const Vec128<int8_t> a,
const Vec128<int8_t> b) {
return Vec128<int16_t>(vzip1q_s8(a.raw, b.raw));
}
HWY_INLINE Vec128<int32_t> ZipLower(const Vec128<int16_t> a,
const Vec128<int16_t> b) {
return Vec128<int32_t>(vzip1q_s16(a.raw, b.raw));
}
HWY_INLINE Vec128<int64_t> ZipLower(const Vec128<int32_t> a,
const Vec128<int32_t> b) {
return Vec128<int64_t>(vzip1q_s32(a.raw, b.raw));
}
HWY_INLINE Vec128<uint16_t> ZipUpper(const Vec128<uint8_t> a,
const Vec128<uint8_t> b) {
return Vec128<uint16_t>(vzip2q_u8(a.raw, b.raw));
}
HWY_INLINE Vec128<uint32_t> ZipUpper(const Vec128<uint16_t> a,
const Vec128<uint16_t> b) {
return Vec128<uint32_t>(vzip2q_u16(a.raw, b.raw));
}
HWY_INLINE Vec128<uint64_t> ZipUpper(const Vec128<uint32_t> a,
const Vec128<uint32_t> b) {
return Vec128<uint64_t>(vzip2q_u32(a.raw, b.raw));
}
HWY_INLINE Vec128<int16_t> ZipUpper(const Vec128<int8_t> a,
const Vec128<int8_t> b) {
return Vec128<int16_t>(vzip2q_s8(a.raw, b.raw));
}
HWY_INLINE Vec128<int32_t> ZipUpper(const Vec128<int16_t> a,
const Vec128<int16_t> b) {
return Vec128<int32_t>(vzip2q_s16(a.raw, b.raw));
}
HWY_INLINE Vec128<int64_t> ZipUpper(const Vec128<int32_t> a,
const Vec128<int32_t> b) {
return Vec128<int64_t>(vzip2q_s32(a.raw, b.raw));
}
// Half vectors or less
template <size_t N, HWY_IF_LE64(uint8_t, N)>
HWY_INLINE Vec128<uint16_t, (N + 1) / 2> ZipLower(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Vec128<uint16_t, (N + 1) / 2>(vzip1_u8(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_INLINE Vec128<uint32_t, (N + 1) / 2> ZipLower(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint32_t, (N + 1) / 2>(vzip1_u16(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_INLINE Vec128<uint64_t, (N + 1) / 2> ZipLower(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Vec128<uint64_t, (N + 1) / 2>(vzip1_u32(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(int8_t, N)>
HWY_INLINE Vec128<int16_t, (N + 1) / 2> ZipLower(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Vec128<int16_t, (N + 1) / 2>(vzip1_s8(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_INLINE Vec128<int32_t, (N + 1) / 2> ZipLower(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int32_t, (N + 1) / 2>(vzip1_s16(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int64_t, (N + 1) / 2> ZipLower(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Vec128<int64_t, (N + 1) / 2>(vzip1_s32(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(uint8_t, N)>
HWY_INLINE Vec128<uint16_t, N / 2> ZipUpper(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Vec128<uint16_t, N / 2>(vzip2_u8(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(uint16_t, N)>
HWY_INLINE Vec128<uint32_t, N / 2> ZipUpper(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint32_t, N / 2>(vzip2_u16(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(uint32_t, N)>
HWY_INLINE Vec128<uint64_t, N / 2> ZipUpper(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Vec128<uint64_t, N / 2>(vzip2_u32(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(int8_t, N)>
HWY_INLINE Vec128<int16_t, N / 2> ZipUpper(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Vec128<int16_t, N / 2>(vzip2_s8(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(int16_t, N)>
HWY_INLINE Vec128<int32_t, N / 2> ZipUpper(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int32_t, N / 2>(vzip2_s16(a.raw, b.raw));
}
template <size_t N, HWY_IF_LE64(int32_t, N)>
HWY_INLINE Vec128<int64_t, N / 2> ZipUpper(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Vec128<int64_t, N / 2>(vzip2_s32(a.raw, b.raw));
}
// ------------------------------ Blocks
// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
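// Implemented by reinterpreting as two 64-bit lanes and interleaving the lower
// (resp. upper) halves, with "lo" providing the least-significant lane.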
template <typename T>
HWY_INLINE Vec128<T> ConcatLowerLower(const Vec128<T> hi, const Vec128<T> lo) {
const Full128<uint64_t> d64;
return BitCast(Full128<T>(),
InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
}
// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
template <typename T>
HWY_INLINE Vec128<T> ConcatUpperUpper(const Vec128<T> hi, const Vec128<T> lo) {
const Full128<uint64_t> d64;
return BitCast(Full128<T>(),
InterleaveUpper(BitCast(d64, lo), BitCast(d64, hi)));
}
// hiH,hiL loH,loL |-> hiL,loH (= inner halves)
template <typename T>
HWY_INLINE Vec128<T> ConcatLowerUpper(const Vec128<T> hi, const Vec128<T> lo) {
return CombineShiftRightBytes<8>(hi, lo);
}
// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
template <typename T>
HWY_INLINE Vec128<T> ConcatUpperLower(const Vec128<T> hi, const Vec128<T> lo) {
// TODO(janwas): more efficient implementation?
alignas(16) const uint8_t kBytes[16] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0};
const auto vec = BitCast(Full128<T>(), Load(Full128<uint8_t>(), kBytes));
return IfThenElse(MaskFromVec(vec), lo, hi);
}
// ------------------------------ Odd/even lanes
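// OddEven(a, b) returns b in the even lanes and a in the odd lanes. The
// constant below is a byte-level mask: 0xFF for every byte belonging to an
// even lane (of width sizeof(T)), 0 otherwise.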
template <typename T>
HWY_INLINE Vec128<T> OddEven(const Vec128<T> a, const Vec128<T> b) {
alignas(16) constexpr uint8_t kBytes[16] = {
((0 / sizeof(T)) & 1) ? 0 : 0xFF, ((1 / sizeof(T)) & 1) ? 0 : 0xFF,
((2 / sizeof(T)) & 1) ? 0 : 0xFF, ((3 / sizeof(T)) & 1) ? 0 : 0xFF,
((4 / sizeof(T)) & 1) ? 0 : 0xFF, ((5 / sizeof(T)) & 1) ? 0 : 0xFF,
((6 / sizeof(T)) & 1) ? 0 : 0xFF, ((7 / sizeof(T)) & 1) ? 0 : 0xFF,
((8 / sizeof(T)) & 1) ? 0 : 0xFF, ((9 / sizeof(T)) & 1) ? 0 : 0xFF,
((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF,
((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF,
((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF,
};
const auto vec = BitCast(Full128<T>(), Load(Full128<uint8_t>(), kBytes));
return IfThenElse(MaskFromVec(vec), b, a);
}
// ================================================== MISC
// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, size_t N, typename T2>
Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
HWY_ALIGN T lanes[16 / sizeof(T)];
for (size_t i = 0; i < 16 / sizeof(T); ++i) {
lanes[i] = static_cast<T>(first + static_cast<T2>(i));
}
return Load(d, lanes);
}
// ------------------------------ Gather (requires GetLane)
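// NEON has no gather instruction, so these are restricted (via static_assert)
// to single-lane vectors and implemented with a scalar load.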
template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
const T* HWY_RESTRICT base,
const Vec128<Offset, N> offset) {
static_assert(N == 1, "NEON does not support full gather");
static_assert(sizeof(T) == sizeof(Offset), "T must match Offset");
const uintptr_t address = reinterpret_cast<uintptr_t>(base) + GetLane(offset);
T val;
CopyBytes<sizeof(T)>(reinterpret_cast<const T*>(address), &val);
return Set(d, val);
}
template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
const Vec128<Index, N> index) {
static_assert(N == 1, "NEON does not support full gather");
static_assert(sizeof(T) == sizeof(Index), "T must match Index");
return Set(d, base[GetLane(index)]);
}
// ------------------------------ ARMv7 int64 equality (requires Shuffle2301)
#if !defined(__aarch64__)
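// ARMv7 lacks 64-bit compares. Two 64-bit lanes are equal iff both of their
// 32-bit halves are equal, so compare as 32-bit lanes and AND each half's
// result with that of its neighbour (obtained via Shuffle2301).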
template <size_t N>
HWY_INLINE Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
const Vec128<int64_t, N> b) {
const Simd<int32_t, N * 2> d32;
const Simd<int64_t, N> d64;
const auto cmp32 = VecFromMask(BitCast(d32, a) == BitCast(d32, b));
const auto cmp64 = cmp32 & Shuffle2301(cmp32);
return MaskFromVec(BitCast(d64, cmp64));
}
template <size_t N>
HWY_INLINE Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
const Vec128<uint64_t, N> b) {
const Simd<uint32_t, N * 2> d32;
const Simd<uint64_t, N> d64;
const auto cmp32 = VecFromMask(BitCast(d32, a) == BitCast(d32, b));
const auto cmp64 = cmp32 & Shuffle2301(cmp32);
return MaskFromVec(BitCast(d64, cmp64));
}
#endif
// ------------------------------ Horizontal sum (reduction)
// Returns 64-bit sums of 8-byte groups.
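// vpaddl pairwise-adds adjacent lanes and widens them, so applying it three
// times (u8 -> u16 -> u32 -> u64) yields one sum per 8-byte group.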
HWY_INLINE Vec128<uint64_t> SumsOfU8x8(const Vec128<uint8_t> v) {
uint16x8_t a = vpaddlq_u8(v.raw);
uint32x4_t b = vpaddlq_u16(a);
return Vec128<uint64_t>(vpaddlq_u32(b));
}
HWY_INLINE Vec128<uint64_t, 1> SumsOfU8x8(const Vec128<uint8_t, 8> v) {
uint16x4_t a = vpaddl_u8(v.raw);
uint32x2_t b = vpaddl_u16(a);
return Vec128<uint64_t, 1>(vpaddl_u32(b));
}
#if defined(__aarch64__)
// Supported for 32b and 64b vector types. Returns the sum in each lane.
HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
return Vec128<uint32_t>(vdupq_n_u32(vaddvq_u32(v.raw)));
}
HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(v.raw)));
}
HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
return Vec128<float>(vdupq_n_f32(vaddvq_f32(v.raw)));
}
HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(v.raw)));
}
HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(v.raw)));
}
HWY_INLINE Vec128<double> SumOfLanes(const Vec128<double> v) {
return Vec128<double>(vdupq_n_f64(vaddvq_f64(v.raw)));
}
#else
// ARMv7 version for everything except doubles.
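// vuzpq splits even and odd lanes; adding the two results sums adjacent
// pairs, and repeating the step leaves the total broadcast to every lane.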
HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw);
uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
uint32x4x2_t v1 = vuzpq_u32(c0, c0);
return Vec128<uint32_t>(vaddq_u32(v1.val[0], v1.val[1]));
}
HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw);
int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
int32x4x2_t v1 = vuzpq_s32(c0, c0);
return Vec128<int32_t>(vaddq_s32(v1.val[0], v1.val[1]));
}
HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw);
float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
float32x4x2_t v1 = vuzpq_f32(c0, c0);
return Vec128<float>(vaddq_f32(v1.val[0], v1.val[1]));
}
HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
return v + CombineShiftRightBytes<8>(v, v);
}
HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
return v + CombineShiftRightBytes<8>(v, v);
}
#endif
namespace detail {
// For u32/i32/f32.
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T, N> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return Min(v20_31_20_31, v31_20_31_20);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T, N> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return Max(v20_31_20_31, v31_20_31_20);
}
// For u64/i64[/f64].
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<T, N> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return Min(v10, v01);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<T, N> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return Max(v10, v01);
}
} // namespace detail
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(const Vec128<T, N> v) {
return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MaxOfLanes(const Vec128<T, N> v) {
return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
// ------------------------------ Mask
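// AllFalse narrows the two 64-bit mask lanes to 32 bits each (vqmovn_u64) so
// a single 64-bit scalar compare can test whether any lane was set. AllTrue
// holds iff comparing the mask vector with zero yields an all-false mask.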
template <typename T>
HWY_INLINE bool AllFalse(const Mask128<T> v) {
const auto v64 = BitCast(Full128<uint64_t>(), VecFromMask(v));
uint32x2_t a = vqmovn_u64(v64.raw);
return vreinterpret_u64_u32(a)[0] == 0;
}
template <typename T>
HWY_INLINE bool AllTrue(const Mask128<T> v) {
return AllFalse(VecFromMask(v) == Zero(Full128<T>()));
}
namespace impl {
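// BitsFromMask: AND each (all-ones or zero) mask lane with a distinct power of
// two, then horizontally add the lanes to pack the bits into an integer. E.g.
// for 16-bit lanes with only lanes 0 and 3 set, the result is 1 + 8 = 9.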
template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
const Mask128<T> mask) {
constexpr uint8x16_t kCollapseMask = {
1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
};
const Full128<uint8_t> du;
const uint8x16_t values = BitCast(du, VecFromMask(mask)).raw & kCollapseMask;
#if defined(__aarch64__)
// Can't vaddv - we need two separate bytes (16 bits).
const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values, values));
const uint8x8_t x4 = vpadd_u8(x2, x2);
const uint8x8_t x8 = vpadd_u8(x4, x4);
return vreinterpret_u16_u8(x8)[0];
#else
// Don't have vpaddq, so keep doubling lane size.
const uint16x8_t x2 = vpaddlq_u8(values);
const uint32x4_t x4 = vpaddlq_u16(x2);
const uint64x2_t x8 = vpaddlq_u32(x4);
return (uint64_t(x8[1]) << 8) | x8[0];
#endif
}
template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
const Mask128<T> mask) {
constexpr uint16x8_t kCollapseMask = {1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80};
const Full128<uint16_t> du;
const uint16x8_t values = BitCast(du, VecFromMask(mask)).raw & kCollapseMask;
#if defined(__aarch64__)
return vaddvq_u16(values);
#else
const uint32x4_t x2 = vpaddlq_u16(values);
const uint64x2_t x4 = vpaddlq_u32(x2);
return x4[0] + x4[1];
#endif
}
template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
const Mask128<T> mask) {
constexpr uint32x4_t kCollapseMask = {1, 2, 4, 8};
const Full128<uint32_t> du;
const uint32x4_t values = BitCast(du, VecFromMask(mask)).raw & kCollapseMask;
#if defined(__aarch64__)
return vaddvq_u32(values);
#else
const uint64x2_t x2 = vpaddlq_u32(values);
return x2[0] + x2[1];
#endif
}
template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, const Mask128<T> v) {
constexpr uint64x2_t kCollapseMask = {1, 2};
const Full128<uint64_t> du;
const uint64x2_t values = BitCast(du, VecFromMask(v)).raw & kCollapseMask;
#if defined(__aarch64__)
return vaddvq_u64(values);
#else
return values[0] + values[1];
#endif
}
// Returns number of lanes whose mask is set.
//
// Masks are either FF..FF or 0. Unfortunately there is no reduce-subtract op
// ("vsubv"). ANDing with 1 would work but requires a constant; negating
// instead turns each lane into 1 (if the mask is set) or 0, which can then be
// summed.
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> mask) {
const Full128<int8_t> di;
const int8x16_t ones = vnegq_s8(BitCast(di, VecFromMask(mask)).raw);
#if defined(__aarch64__)
return vaddvq_s8(ones);
#else
const int16x8_t x2 = vpaddlq_s8(ones);
const int32x4_t x4 = vpaddlq_s16(x2);
const int64x2_t x8 = vpaddlq_s32(x4);
return x8[0] + x8[1];
#endif
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> mask) {
const Full128<int16_t> di;
const int16x8_t ones = vnegq_s16(BitCast(di, VecFromMask(mask)).raw);
#if defined(__aarch64__)
return vaddvq_s16(ones);
#else
const int32x4_t x2 = vpaddlq_s16(ones);
const int64x2_t x4 = vpaddlq_s32(x2);
return x4[0] + x4[1];
#endif
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> mask) {
const Full128<int32_t> di;
const int32x4_t ones = vnegq_s32(BitCast(di, VecFromMask(mask)).raw);
#if defined(__aarch64__)
return vaddvq_s32(ones);
#else
const int64x2_t x2 = vpaddlq_s32(ones);
return x2[0] + x2[1];
#endif
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> mask) {
#if defined(__aarch64__)
const Full128<int64_t> di;
const int64x2_t ones = vnegq_s64(BitCast(di, VecFromMask(mask)).raw);
return vaddvq_s64(ones);
#else
const Full128<uint64_t> du;
const uint64x2_t ones = vshrq_n_u64(BitCast(du, VecFromMask(mask)).raw, 63);
return ones[0] + ones[1];
#endif
}
} // namespace impl
template <typename T>
HWY_INLINE uint64_t BitsFromMask(const Mask128<T> mask) {
return impl::BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask);
}
template <typename T>
HWY_INLINE size_t CountTrue(const Mask128<T> mask) {
return impl::CountTrue(hwy::SizeTag<sizeof(T)>(), mask);
}
#if !defined(__aarch64__)
#undef vuzp1_s8
#undef vuzp1_u8
#undef vuzp1_s16
#undef vuzp1_u16
#undef vuzp1_s32
#undef vuzp1_u32
#undef vuzp1_f32
#undef vuzp1q_s8
#undef vuzp1q_u8
#undef vuzp1q_s16
#undef vuzp1q_u16
#undef vuzp1q_s32
#undef vuzp1q_u32
#undef vuzp1q_f32
#undef vuzp2_s8
#undef vuzp2_u8
#undef vuzp2_s16
#undef vuzp2_u16
#undef vuzp2_s32
#undef vuzp2_u32
#undef vuzp2_f32
#undef vuzp2q_s8
#undef vuzp2q_u8
#undef vuzp2q_s16
#undef vuzp2q_u16
#undef vuzp2q_s32
#undef vuzp2q_u32
#undef vuzp2q_f32
#undef vzip1_s8
#undef vzip1_u8
#undef vzip1_s16
#undef vzip1_u16
#undef vzip1_s32
#undef vzip1_u32
#undef vzip1_f32
#undef vzip1q_s8
#undef vzip1q_u8
#undef vzip1q_s16
#undef vzip1q_u16
#undef vzip1q_s32
#undef vzip1q_u32
#undef vzip1q_f32
#undef vzip2_s8
#undef vzip2_u8
#undef vzip2_s16
#undef vzip2_u16
#undef vzip2_s32
#undef vzip2_u32
#undef vzip2_f32
#undef vzip2q_s8
#undef vzip2q_u8
#undef vzip2q_s16
#undef vzip2q_u16
#undef vzip2q_s32
#undef vzip2q_u32
#undef vzip2q_f32
#endif
#undef HWY_NEON_BUILD_ARG_1
#undef HWY_NEON_BUILD_ARG_2
#undef HWY_NEON_BUILD_ARG_3
#undef HWY_NEON_BUILD_PARAM_1
#undef HWY_NEON_BUILD_PARAM_2
#undef HWY_NEON_BUILD_PARAM_3
#undef HWY_NEON_BUILD_RET_1
#undef HWY_NEON_BUILD_RET_2
#undef HWY_NEON_BUILD_RET_3
#undef HWY_NEON_BUILD_TPL_1
#undef HWY_NEON_BUILD_TPL_2
#undef HWY_NEON_BUILD_TPL_3
#undef HWY_NEON_DEF_FUNCTION
#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
#undef HWY_NEON_DEF_FUNCTION_INT_8
#undef HWY_NEON_DEF_FUNCTION_INT_16
#undef HWY_NEON_DEF_FUNCTION_INT_32
#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
#undef HWY_NEON_DEF_FUNCTION_INTS
#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
#undef HWY_NEON_DEF_FUNCTION_TPL
#undef HWY_NEON_DEF_FUNCTION_UINT_8
#undef HWY_NEON_DEF_FUNCTION_UINT_16
#undef HWY_NEON_DEF_FUNCTION_UINT_32
#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
#undef HWY_NEON_DEF_FUNCTION_UINTS
#undef HWY_NEON_EVAL
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();