| // Copyright 2019 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
// 128-bit ARM NEON vectors and operations (ARMv7 and AArch64).
| // External include guard in highway.h - see comment there. |
| |
| #include <arm_neon.h> |
| |
| #include "hwy/ops/shared-inl.h" |
| |
| HWY_BEFORE_NAMESPACE(); |
| namespace hwy { |
| namespace HWY_NAMESPACE { |
| |
// Macros used to define function overloads (with one, two or three vector
// arguments) for multiple types and for full and partial vectors. These macros
// are #undef-ed at the end of the file.
| |
| // HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function. |
| #define HWY_NEON_BUILD_TPL_1 |
| #define HWY_NEON_BUILD_TPL_2 |
| #define HWY_NEON_BUILD_TPL_3 |
| |
// HWY_NEON_BUILD_RET_* is the return type of the function.
| #define HWY_NEON_BUILD_RET_1(type, size) Vec128<type, size> |
| #define HWY_NEON_BUILD_RET_2(type, size) Vec128<type, size> |
| #define HWY_NEON_BUILD_RET_3(type, size) Vec128<type, size> |
| |
| // HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives. |
| #define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type, size> a |
| #define HWY_NEON_BUILD_PARAM_2(type, size) \ |
| const Vec128<type, size> a, const Vec128<type, size> b |
| #define HWY_NEON_BUILD_PARAM_3(type, size) \ |
| const Vec128<type, size> a, const Vec128<type, size> b, \ |
| const Vec128<type, size> c |
| |
| // HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying |
| // function. |
| #define HWY_NEON_BUILD_ARG_1 a.raw |
| #define HWY_NEON_BUILD_ARG_2 a.raw, b.raw |
| #define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw |
| |
// We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
// the __VA_ARGS__ have been expanded. This allows "func" to itself be a macro,
// as is the case for some of the library "functions" such as vshlq_u8. For
// example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS), where MY_PARAMS is defined as
// "a, b" (without the quotes), expands to vshlq_u8(a, b). Directly writing
// vshlq_u8(MY_PARAMS) would fail because the vshlq_u8() macro expects two
// arguments.
| #define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__) |
| |
// Main macro definition that defines a single function for the given type and
// size of vector, using the underlying (prefix##infix##suffix) function and
// the template, return type, parameters and arguments defined by the "args"
// parameter passed here (see the HWY_NEON_BUILD_* macros defined above).
| #define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \ |
| HWY_CONCAT(HWY_NEON_BUILD_TPL_, args) \ |
| HWY_INLINE HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size) \ |
| name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) { \ |
| return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)( \ |
| HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args)); \ |
| } |
| |
// The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
// called "name" using the set of NEON functions starting with the given
// "prefix", for all the variants of the types specified next to each macro.
// For example, the prefix "vsub" can be used to define operator- with args=2.
| |
| // uint8_t |
| #define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION(uint8_t, 16, name, prefix##q, infix, u8, args) \ |
| HWY_NEON_DEF_FUNCTION(uint8_t, 8, name, prefix, infix, u8, args) \ |
| HWY_NEON_DEF_FUNCTION(uint8_t, 4, name, prefix, infix, u8, args) \ |
| HWY_NEON_DEF_FUNCTION(uint8_t, 2, name, prefix, infix, u8, args) \ |
| HWY_NEON_DEF_FUNCTION(uint8_t, 1, name, prefix, infix, u8, args) |
| |
| // int8_t |
| #define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION(int8_t, 16, name, prefix##q, infix, s8, args) \ |
| HWY_NEON_DEF_FUNCTION(int8_t, 8, name, prefix, infix, s8, args) \ |
| HWY_NEON_DEF_FUNCTION(int8_t, 4, name, prefix, infix, s8, args) \ |
| HWY_NEON_DEF_FUNCTION(int8_t, 2, name, prefix, infix, s8, args) \ |
| HWY_NEON_DEF_FUNCTION(int8_t, 1, name, prefix, infix, s8, args) |
| |
| // uint16_t |
| #define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION(uint16_t, 8, name, prefix##q, infix, u16, args) \ |
| HWY_NEON_DEF_FUNCTION(uint16_t, 4, name, prefix, infix, u16, args) \ |
| HWY_NEON_DEF_FUNCTION(uint16_t, 2, name, prefix, infix, u16, args) \ |
| HWY_NEON_DEF_FUNCTION(uint16_t, 1, name, prefix, infix, u16, args) |
| |
| // int16_t |
| #define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION(int16_t, 8, name, prefix##q, infix, s16, args) \ |
| HWY_NEON_DEF_FUNCTION(int16_t, 4, name, prefix, infix, s16, args) \ |
| HWY_NEON_DEF_FUNCTION(int16_t, 2, name, prefix, infix, s16, args) \ |
| HWY_NEON_DEF_FUNCTION(int16_t, 1, name, prefix, infix, s16, args) |
| |
| // uint32_t |
| #define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION(uint32_t, 4, name, prefix##q, infix, u32, args) \ |
| HWY_NEON_DEF_FUNCTION(uint32_t, 2, name, prefix, infix, u32, args) \ |
| HWY_NEON_DEF_FUNCTION(uint32_t, 1, name, prefix, infix, u32, args) |
| |
| // int32_t |
| #define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION(int32_t, 4, name, prefix##q, infix, s32, args) \ |
| HWY_NEON_DEF_FUNCTION(int32_t, 2, name, prefix, infix, s32, args) \ |
| HWY_NEON_DEF_FUNCTION(int32_t, 1, name, prefix, infix, s32, args) |
| |
| // uint64_t |
| #define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION(uint64_t, 2, name, prefix##q, infix, u64, args) \ |
| HWY_NEON_DEF_FUNCTION(uint64_t, 1, name, prefix, infix, u64, args) |
| |
| // int64_t |
| #define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION(int64_t, 2, name, prefix##q, infix, s64, args) \ |
| HWY_NEON_DEF_FUNCTION(int64_t, 1, name, prefix, infix, s64, args) |
| |
| // float and double |
| #if defined(__aarch64__) |
| #define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args) \ |
| HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args) \ |
| HWY_NEON_DEF_FUNCTION(float, 1, name, prefix, infix, f32, args) \ |
| HWY_NEON_DEF_FUNCTION(double, 2, name, prefix##q, infix, f64, args) \ |
| HWY_NEON_DEF_FUNCTION(double, 1, name, prefix, infix, f64, args) |
| #else |
| #define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION(float, 4, name, prefix##q, infix, f32, args) \ |
| HWY_NEON_DEF_FUNCTION(float, 2, name, prefix, infix, f32, args) \ |
| HWY_NEON_DEF_FUNCTION(float, 1, name, prefix, infix, f32, args) |
| #endif |
| |
// Helper macros to define functions for more than one type at once.
| // uint8_t, uint16_t and uint32_t |
| #define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) |
| |
| // int8_t, int16_t and int32_t |
| #define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) |
| |
| // uint8_t, uint16_t, uint32_t and uint64_t |
| #define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) |
| |
| // int8_t, int16_t, int32_t and int64_t |
| #define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) |
| |
| // All int*_t and uint*_t up to 64 |
| #define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) |
| |
| // All previous types. |
| #define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \ |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) |
| |
| // Emulation of some intrinsics on armv7. |
| #if !defined(__aarch64__) |
| #define vuzp1_s8(x, y) vuzp_s8(x, y).val[0] |
| #define vuzp1_u8(x, y) vuzp_u8(x, y).val[0] |
| #define vuzp1_s16(x, y) vuzp_s16(x, y).val[0] |
| #define vuzp1_u16(x, y) vuzp_u16(x, y).val[0] |
| #define vuzp1_s32(x, y) vuzp_s32(x, y).val[0] |
| #define vuzp1_u32(x, y) vuzp_u32(x, y).val[0] |
| #define vuzp1_f32(x, y) vuzp_f32(x, y).val[0] |
| #define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0] |
| #define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0] |
| #define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0] |
| #define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0] |
| #define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0] |
| #define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0] |
| #define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0] |
| #define vuzp2_s8(x, y) vuzp_s8(x, y).val[1] |
| #define vuzp2_u8(x, y) vuzp_u8(x, y).val[1] |
| #define vuzp2_s16(x, y) vuzp_s16(x, y).val[1] |
| #define vuzp2_u16(x, y) vuzp_u16(x, y).val[1] |
| #define vuzp2_s32(x, y) vuzp_s32(x, y).val[1] |
| #define vuzp2_u32(x, y) vuzp_u32(x, y).val[1] |
| #define vuzp2_f32(x, y) vuzp_f32(x, y).val[1] |
| #define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1] |
| #define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1] |
| #define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1] |
| #define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1] |
| #define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1] |
| #define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1] |
| #define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1] |
| #define vzip1_s8(x, y) vzip_s8(x, y).val[0] |
| #define vzip1_u8(x, y) vzip_u8(x, y).val[0] |
| #define vzip1_s16(x, y) vzip_s16(x, y).val[0] |
| #define vzip1_u16(x, y) vzip_u16(x, y).val[0] |
| #define vzip1_f32(x, y) vzip_f32(x, y).val[0] |
| #define vzip1_u32(x, y) vzip_u32(x, y).val[0] |
| #define vzip1_s32(x, y) vzip_s32(x, y).val[0] |
| #define vzip1q_s8(x, y) vzipq_s8(x, y).val[0] |
| #define vzip1q_u8(x, y) vzipq_u8(x, y).val[0] |
| #define vzip1q_s16(x, y) vzipq_s16(x, y).val[0] |
| #define vzip1q_u16(x, y) vzipq_u16(x, y).val[0] |
| #define vzip1q_s32(x, y) vzipq_s32(x, y).val[0] |
| #define vzip1q_u32(x, y) vzipq_u32(x, y).val[0] |
| #define vzip1q_f32(x, y) vzipq_f32(x, y).val[0] |
| #define vzip2_s8(x, y) vzip_s8(x, y).val[1] |
| #define vzip2_u8(x, y) vzip_u8(x, y).val[1] |
| #define vzip2_s16(x, y) vzip_s16(x, y).val[1] |
| #define vzip2_u16(x, y) vzip_u16(x, y).val[1] |
| #define vzip2_s32(x, y) vzip_s32(x, y).val[1] |
| #define vzip2_u32(x, y) vzip_u32(x, y).val[1] |
| #define vzip2_f32(x, y) vzip_f32(x, y).val[1] |
| #define vzip2q_s8(x, y) vzipq_s8(x, y).val[1] |
| #define vzip2q_u8(x, y) vzipq_u8(x, y).val[1] |
| #define vzip2q_s16(x, y) vzipq_s16(x, y).val[1] |
| #define vzip2q_u16(x, y) vzipq_u16(x, y).val[1] |
| #define vzip2q_s32(x, y) vzipq_s32(x, y).val[1] |
| #define vzip2q_u32(x, y) vzipq_u32(x, y).val[1] |
| #define vzip2q_f32(x, y) vzipq_f32(x, y).val[1] |
| #endif |
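
// For example (illustrative): on ARMv7, vzip1_u8(a, b) above expands to
// vzip_u8(a, b).val[0], i.e. the lower half of the interleaved result, which
// matches the semantics of the AArch64 vzip1_u8 intrinsic being emulated.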
| |
| template <typename T, size_t N> |
| struct Raw128; |
| |
| // 128 |
| template <> |
| struct Raw128<uint8_t, 16> { |
| using type = uint8x16_t; |
| }; |
| |
| template <> |
| struct Raw128<uint16_t, 8> { |
| using type = uint16x8_t; |
| }; |
| |
| template <> |
| struct Raw128<uint32_t, 4> { |
| using type = uint32x4_t; |
| }; |
| |
| template <> |
| struct Raw128<uint64_t, 2> { |
| using type = uint64x2_t; |
| }; |
| |
| template <> |
| struct Raw128<int8_t, 16> { |
| using type = int8x16_t; |
| }; |
| |
| template <> |
| struct Raw128<int16_t, 8> { |
| using type = int16x8_t; |
| }; |
| |
| template <> |
| struct Raw128<int32_t, 4> { |
| using type = int32x4_t; |
| }; |
| |
| template <> |
| struct Raw128<int64_t, 2> { |
| using type = int64x2_t; |
| }; |
| |
| template <> |
| struct Raw128<float, 4> { |
| using type = float32x4_t; |
| }; |
| |
| #if defined(__aarch64__) |
| template <> |
| struct Raw128<double, 2> { |
| using type = float64x2_t; |
| }; |
| #endif |
| |
| // 64 |
| template <> |
| struct Raw128<uint8_t, 8> { |
| using type = uint8x8_t; |
| }; |
| |
| template <> |
| struct Raw128<uint16_t, 4> { |
| using type = uint16x4_t; |
| }; |
| |
| template <> |
| struct Raw128<uint32_t, 2> { |
| using type = uint32x2_t; |
| }; |
| |
| template <> |
| struct Raw128<uint64_t, 1> { |
| using type = uint64x1_t; |
| }; |
| |
| template <> |
| struct Raw128<int8_t, 8> { |
| using type = int8x8_t; |
| }; |
| |
| template <> |
| struct Raw128<int16_t, 4> { |
| using type = int16x4_t; |
| }; |
| |
| template <> |
| struct Raw128<int32_t, 2> { |
| using type = int32x2_t; |
| }; |
| |
| template <> |
| struct Raw128<int64_t, 1> { |
| using type = int64x1_t; |
| }; |
| |
| template <> |
| struct Raw128<float, 2> { |
| using type = float32x2_t; |
| }; |
| |
| #if defined(__aarch64__) |
| template <> |
| struct Raw128<double, 1> { |
| using type = float64x1_t; |
| }; |
| #endif |
| |
| // 32 (same as 64) |
| template <> |
| struct Raw128<uint8_t, 4> { |
| using type = uint8x8_t; |
| }; |
| |
| template <> |
| struct Raw128<uint16_t, 2> { |
| using type = uint16x4_t; |
| }; |
| |
| template <> |
| struct Raw128<uint32_t, 1> { |
| using type = uint32x2_t; |
| }; |
| |
| template <> |
| struct Raw128<int8_t, 4> { |
| using type = int8x8_t; |
| }; |
| |
| template <> |
| struct Raw128<int16_t, 2> { |
| using type = int16x4_t; |
| }; |
| |
| template <> |
| struct Raw128<int32_t, 1> { |
| using type = int32x2_t; |
| }; |
| |
| template <> |
| struct Raw128<float, 1> { |
| using type = float32x2_t; |
| }; |
| |
| // 16 (same as 64) |
| template <> |
| struct Raw128<uint8_t, 2> { |
| using type = uint8x8_t; |
| }; |
| |
| template <> |
| struct Raw128<uint16_t, 1> { |
| using type = uint16x4_t; |
| }; |
| |
| template <> |
| struct Raw128<int8_t, 2> { |
| using type = int8x8_t; |
| }; |
| |
| template <> |
| struct Raw128<int16_t, 1> { |
| using type = int16x4_t; |
| }; |
| |
| // 8 (same as 64) |
| template <> |
| struct Raw128<uint8_t, 1> { |
| using type = uint8x8_t; |
| }; |
| |
| template <> |
| struct Raw128<int8_t, 1> { |
| using type = int8x8_t; |
| }; |
| |
| template <typename T> |
| using Full128 = Simd<T, 16 / sizeof(T)>; |
| |
| template <typename T, size_t N = 16 / sizeof(T)> |
| class Vec128 { |
| using Raw = typename Raw128<T, N>::type; |
| |
| public: |
| HWY_INLINE Vec128() {} |
| Vec128(const Vec128&) = default; |
| Vec128& operator=(const Vec128&) = default; |
| HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {} |
| |
| // Compound assignment. Only usable if there is a corresponding non-member |
| // binary operator overload. For example, only f32 and f64 support division. |
| HWY_INLINE Vec128& operator*=(const Vec128 other) { |
| return *this = (*this * other); |
| } |
| HWY_INLINE Vec128& operator/=(const Vec128 other) { |
| return *this = (*this / other); |
| } |
| HWY_INLINE Vec128& operator+=(const Vec128 other) { |
| return *this = (*this + other); |
| } |
| HWY_INLINE Vec128& operator-=(const Vec128 other) { |
| return *this = (*this - other); |
| } |
| HWY_INLINE Vec128& operator&=(const Vec128 other) { |
| return *this = (*this & other); |
| } |
| HWY_INLINE Vec128& operator|=(const Vec128 other) { |
| return *this = (*this | other); |
| } |
| HWY_INLINE Vec128& operator^=(const Vec128 other) { |
| return *this = (*this ^ other); |
| } |
| |
| Raw raw; |
| }; |
| |
| // FF..FF or 0, also for floating-point - see README. |
| template <typename T, size_t N = 16 / sizeof(T)> |
| class Mask128 { |
| using Raw = typename Raw128<T, N>::type; |
| |
| public: |
| HWY_INLINE Mask128() {} |
| Mask128(const Mask128&) = default; |
| Mask128& operator=(const Mask128&) = default; |
| HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {} |
| |
| Raw raw; |
| }; |
| |
| // ------------------------------ Cast |
| |
| // cast_to_u8 |
| |
| // Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the |
| // vreinterpret*_u8_*() set of functions. |
| #define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8 |
| #define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \ |
| Vec128<uint8_t, size * sizeof(type)> |
| #define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type, size> v |
| #define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw |
| |
| // Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined. |
| template <size_t N> |
| HWY_INLINE Vec128<uint8_t, N> cast_to_u8(Vec128<uint8_t, N> v) { |
| return v; |
| } |
| |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(cast_to_u8, vreinterpret, _u8_, HWY_CAST_TO_U8) |
| HWY_NEON_DEF_FUNCTION_INTS(cast_to_u8, vreinterpret, _u8_, HWY_CAST_TO_U8) |
| HWY_NEON_DEF_FUNCTION_UINT_16(cast_to_u8, vreinterpret, _u8_, HWY_CAST_TO_U8) |
| HWY_NEON_DEF_FUNCTION_UINT_32(cast_to_u8, vreinterpret, _u8_, HWY_CAST_TO_U8) |
| HWY_NEON_DEF_FUNCTION_UINT_64(cast_to_u8, vreinterpret, _u8_, HWY_CAST_TO_U8) |
| |
| #undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8 |
| #undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8 |
| #undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8 |
| #undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 |
| |
| // cast_u8_to |
| |
| template <size_t N> |
| HWY_INLINE Vec128<uint8_t, N> cast_u8_to(Simd<uint8_t, N> /* tag */, |
| Vec128<uint8_t, N> v) { |
| return v; |
| } |
| |
| // 64-bit or less: |
| |
| template <size_t N, HWY_IF_LE64(int8_t, N)> |
| HWY_INLINE Vec128<int8_t, N> cast_u8_to(Simd<int8_t, N> /* tag */, |
| Vec128<uint8_t, N> v) { |
| return Vec128<int8_t, N>(vreinterpret_s8_u8(v.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(uint16_t, N)> |
| HWY_INLINE Vec128<uint16_t, N> cast_u8_to(Simd<uint16_t, N> /* tag */, |
| Vec128<uint8_t, N * 2> v) { |
| return Vec128<uint16_t, N>(vreinterpret_u16_u8(v.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(int16_t, N)> |
| HWY_INLINE Vec128<int16_t, N> cast_u8_to(Simd<int16_t, N> /* tag */, |
| Vec128<uint8_t, N * 2> v) { |
| return Vec128<int16_t, N>(vreinterpret_s16_u8(v.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(uint32_t, N)> |
| HWY_INLINE Vec128<uint32_t, N> cast_u8_to(Simd<uint32_t, N> /* tag */, |
| Vec128<uint8_t, N * 4> v) { |
| return Vec128<uint32_t, N>(vreinterpret_u32_u8(v.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<int32_t, N> cast_u8_to(Simd<int32_t, N> /* tag */, |
| Vec128<uint8_t, N * 4> v) { |
| return Vec128<int32_t, N>(vreinterpret_s32_u8(v.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(float, N)> |
| HWY_INLINE Vec128<float, N> cast_u8_to(Simd<float, N> /* tag */, |
| Vec128<uint8_t, N * 4> v) { |
| return Vec128<float, N>(vreinterpret_f32_u8(v.raw)); |
| } |
| HWY_INLINE Vec128<uint64_t, 1> cast_u8_to(Simd<uint64_t, 1> /* tag */, |
| Vec128<uint8_t, 1 * 8> v) { |
| return Vec128<uint64_t, 1>(vreinterpret_u64_u8(v.raw)); |
| } |
| HWY_INLINE Vec128<int64_t, 1> cast_u8_to(Simd<int64_t, 1> /* tag */, |
| Vec128<uint8_t, 1 * 8> v) { |
| return Vec128<int64_t, 1>(vreinterpret_s64_u8(v.raw)); |
| } |
| #if defined(__aarch64__) |
| HWY_INLINE Vec128<double, 1> cast_u8_to(Simd<double, 1> /* tag */, |
| Vec128<uint8_t, 1 * 8> v) { |
| return Vec128<double, 1>(vreinterpret_f64_u8(v.raw)); |
| } |
| #endif |
| |
| // 128-bit full: |
| |
| HWY_INLINE Vec128<int8_t> cast_u8_to(Full128<int8_t> /* tag */, |
| Vec128<uint8_t> v) { |
| return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw)); |
| } |
| HWY_INLINE Vec128<uint16_t> cast_u8_to(Full128<uint16_t> /* tag */, |
| Vec128<uint8_t> v) { |
| return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw)); |
| } |
| HWY_INLINE Vec128<int16_t> cast_u8_to(Full128<int16_t> /* tag */, |
| Vec128<uint8_t> v) { |
| return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw)); |
| } |
| HWY_INLINE Vec128<uint32_t> cast_u8_to(Full128<uint32_t> /* tag */, |
| Vec128<uint8_t> v) { |
| return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw)); |
| } |
| HWY_INLINE Vec128<int32_t> cast_u8_to(Full128<int32_t> /* tag */, |
| Vec128<uint8_t> v) { |
| return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw)); |
| } |
| HWY_INLINE Vec128<float> cast_u8_to(Full128<float> /* tag */, |
| Vec128<uint8_t> v) { |
| return Vec128<float>(vreinterpretq_f32_u8(v.raw)); |
| } |
| HWY_INLINE Vec128<uint64_t> cast_u8_to(Full128<uint64_t> /* tag */, |
| Vec128<uint8_t> v) { |
| return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw)); |
| } |
| HWY_INLINE Vec128<int64_t> cast_u8_to(Full128<int64_t> /* tag */, |
| Vec128<uint8_t> v) { |
| return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw)); |
| } |
| |
| #if defined(__aarch64__) |
| HWY_INLINE Vec128<double> cast_u8_to(Full128<double> /* tag */, |
| Vec128<uint8_t> v) { |
| return Vec128<double>(vreinterpretq_f64_u8(v.raw)); |
| } |
| #endif |
| |
| // BitCast |
| template <typename T, size_t N, typename FromT> |
| HWY_INLINE Vec128<T, N> BitCast( |
| Simd<T, N> d, Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) { |
| const auto u8 = cast_to_u8(v); |
| return cast_u8_to(d, u8); |
| } |
| |
| // ------------------------------ Set |
| |
| // Returns a vector with all lanes set to "t". |
| #define HWY_NEON_BUILD_TPL_HWY_SET1 |
| #define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128<type, size> |
| #define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \ |
| Simd<type, size> /* tag */, const type t |
| #define HWY_NEON_BUILD_ARG_HWY_SET1 t |
| |
| HWY_NEON_DEF_FUNCTION_ALL_TYPES(Set, vdup, _n_, HWY_SET1) |
| |
| #undef HWY_NEON_BUILD_TPL_HWY_SET1 |
| #undef HWY_NEON_BUILD_RET_HWY_SET1 |
| #undef HWY_NEON_BUILD_PARAM_HWY_SET1 |
| #undef HWY_NEON_BUILD_ARG_HWY_SET1 |
| |
| // Returns an all-zero vector. |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> Zero(Simd<T, N> d) { |
| return Set(d, 0); |
| } |
| |
| // Returns a vector with uninitialized elements. |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> Undefined(Simd<T, N> /*d*/) { |
| HWY_DIAGNOSTICS(push) |
| HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") |
| typename Raw128<T, N>::type a; |
| return Vec128<T, N>(a); |
| HWY_DIAGNOSTICS(pop) |
| } |
| |
| // ================================================== ARITHMETIC |
| |
| // ------------------------------ Addition |
| HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2) |
| |
| // ------------------------------ Subtraction |
| HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2) |
| |
| // ------------------------------ Saturating addition and subtraction |
| // Only defined for uint8_t, uint16_t and their signed versions, as in other |
| // architectures. |
| |
| // Returns a + b clamped to the destination range. |
| HWY_NEON_DEF_FUNCTION_INT_8(SaturatedAdd, vqadd, _, 2) |
| HWY_NEON_DEF_FUNCTION_INT_16(SaturatedAdd, vqadd, _, 2) |
| HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedAdd, vqadd, _, 2) |
| HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedAdd, vqadd, _, 2) |
| |
| // Returns a - b clamped to the destination range. |
| HWY_NEON_DEF_FUNCTION_INT_8(SaturatedSub, vqsub, _, 2) |
| HWY_NEON_DEF_FUNCTION_INT_16(SaturatedSub, vqsub, _, 2) |
| HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedSub, vqsub, _, 2) |
| HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedSub, vqsub, _, 2) |
| |
| // ------------------------------ Average |
| |
| // Returns (a + b + 1) / 2 |
| |
| // Unsigned |
| HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2) |
| HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2) |
| |
| // ------------------------------ Absolute value |
| |
| // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. |
| HWY_INLINE Vec128<int8_t> Abs(const Vec128<int8_t> v) { |
| return Vec128<int8_t>(vabsq_s8(v.raw)); |
| } |
| HWY_INLINE Vec128<int16_t> Abs(const Vec128<int16_t> v) { |
| return Vec128<int16_t>(vabsq_s16(v.raw)); |
| } |
| HWY_INLINE Vec128<int32_t> Abs(const Vec128<int32_t> v) { |
| return Vec128<int32_t>(vabsq_s32(v.raw)); |
| } |
| HWY_INLINE Vec128<float> Abs(const Vec128<float> v) { |
  return Vec128<float>(vabsq_f32(v.raw));
| } |
| |
| template <size_t N, HWY_IF_LE64(int8_t, N)> |
| HWY_INLINE Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) { |
| return Vec128<int8_t, N>(vabs_s8(v.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(int16_t, N)> |
| HWY_INLINE Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) { |
| return Vec128<int16_t, N>(vabs_s16(v.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) { |
| return Vec128<int32_t, N>(vabs_s32(v.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(float, N)> |
| HWY_INLINE Vec128<float, N> Abs(const Vec128<float, N> v) { |
  return Vec128<float, N>(vabs_f32(v.raw));
| } |
| |
| #if defined(__aarch64__) |
| HWY_INLINE Vec128<double> Abs(const Vec128<double> v) { |
  return Vec128<double>(vabsq_f64(v.raw));
| } |
| |
| HWY_INLINE Vec128<double, 1> Abs(const Vec128<double, 1> v) { |
  return Vec128<double, 1>(vabs_f64(v.raw));
| } |
| #endif |
| |
| // ------------------------------ Shift lanes by constant #bits |
| |
| // Only defined for ints and uints, except for signed i64 shr. |
| #define HWY_NEON_BUILD_TPL_HWY_SHIFT template <int kBits> |
| #define HWY_NEON_BUILD_RET_HWY_SHIFT(type, size) Vec128<type, size> |
| #define HWY_NEON_BUILD_PARAM_HWY_SHIFT(type, size) const Vec128<type, size> v |
| #define HWY_NEON_BUILD_ARG_HWY_SHIFT v.raw, kBits |
| |
| HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, HWY_SHIFT) |
| |
| HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, HWY_SHIFT) |
| HWY_NEON_DEF_FUNCTION_INT_8_16_32(ShiftRight, vshr, _n_, HWY_SHIFT) |
| |
| #undef HWY_NEON_BUILD_TPL_HWY_SHIFT |
| #undef HWY_NEON_BUILD_RET_HWY_SHIFT |
| #undef HWY_NEON_BUILD_PARAM_HWY_SHIFT |
| #undef HWY_NEON_BUILD_ARG_HWY_SHIFT |
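
// For example (illustrative): ShiftLeft<3>(Set(Full128<uint32_t>(), 1u)) has
// all lanes equal to 8, and ShiftRight<1>(Set(Full128<int32_t>(), -4)) has all
// lanes equal to -2 (arithmetic shift for signed types).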
| |
| // ------------------------------ Shift lanes by independent variable #bits |
| |
| // Unsigned (no u8,u16) |
| HWY_INLINE Vec128<uint32_t> operator<<(const Vec128<uint32_t> v, |
| const Vec128<uint32_t> bits) { |
| return Vec128<uint32_t>(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw))); |
| } |
| HWY_INLINE Vec128<uint32_t> operator>>(const Vec128<uint32_t> v, |
| const Vec128<uint32_t> bits) { |
| return Vec128<uint32_t>( |
| vshlq_u32(v.raw, vnegq_s32(vreinterpretq_s32_u32(bits.raw)))); |
| } |
| HWY_INLINE Vec128<uint64_t> operator<<(const Vec128<uint64_t> v, |
| const Vec128<uint64_t> bits) { |
| return Vec128<uint64_t>(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw))); |
| } |
| HWY_INLINE Vec128<uint64_t> operator>>(const Vec128<uint64_t> v, |
| const Vec128<uint64_t> bits) { |
| #if defined(__aarch64__) |
| const int64x2_t neg_bits = vnegq_s64(vreinterpretq_s64_u64(bits.raw)); |
| #else |
| // A32 doesn't have vnegq_s64(). |
| const int64x2_t neg_bits = |
| vsubq_s64(Set(Full128<int64_t>(), 0).raw, bits.raw); |
| #endif |
| return Vec128<uint64_t>(vshlq_u64(v.raw, neg_bits)); |
| } |
| |
| template <size_t N, HWY_IF_LE64(uint32_t, N)> |
| HWY_INLINE Vec128<uint32_t, N> operator<<(const Vec128<uint32_t, N> v, |
| const Vec128<uint32_t, N> bits) { |
| return Vec128<uint32_t, N>(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw))); |
| } |
| template <size_t N, HWY_IF_LE64(uint32_t, N)> |
| HWY_INLINE Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> v, |
| const Vec128<uint32_t, N> bits) { |
| return Vec128<uint32_t, N>( |
| vshl_u32(v.raw, vneg_s32(vreinterpret_s32_u32(bits.raw)))); |
| } |
| HWY_INLINE Vec128<uint64_t, 1> operator<<(const Vec128<uint64_t, 1> v, |
| const Vec128<uint64_t, 1> bits) { |
| return Vec128<uint64_t, 1>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw))); |
| } |
| HWY_INLINE Vec128<uint64_t, 1> operator>>(const Vec128<uint64_t, 1> v, |
| const Vec128<uint64_t, 1> bits) { |
| #if defined(__aarch64__) |
| const int64x1_t neg_bits = vneg_s64(vreinterpret_s64_u64(bits.raw)); |
| #else |
| // A32 doesn't have vneg_s64(). |
| const int64x1_t neg_bits = vsub_s64(Set(Simd<int64_t, 1>(), 0).raw, bits.raw); |
| #endif |
| return Vec128<uint64_t, 1>(vshl_u64(v.raw, neg_bits)); |
| } |
| |
| // Signed (no i8,i16) |
| HWY_INLINE Vec128<int32_t> operator<<(const Vec128<int32_t> v, |
| const Vec128<int32_t> bits) { |
| return Vec128<int32_t>(vshlq_s32(v.raw, bits.raw)); |
| } |
| HWY_INLINE Vec128<int32_t> operator>>(const Vec128<int32_t> v, |
| const Vec128<int32_t> bits) { |
| return Vec128<int32_t>(vshlq_s32(v.raw, vnegq_s32(bits.raw))); |
| } |
| HWY_INLINE Vec128<int64_t> operator<<(const Vec128<int64_t> v, |
| const Vec128<int64_t> bits) { |
| return Vec128<int64_t>(vshlq_s64(v.raw, bits.raw)); |
| } |
| |
| template <size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<int32_t, N> operator<<(const Vec128<int32_t, N> v, |
| const Vec128<int32_t, N> bits) { |
| return Vec128<int32_t, N>(vshl_s32(v.raw, bits.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<int32_t, N> operator>>(const Vec128<int32_t, N> v, |
| const Vec128<int32_t, N> bits) { |
| return Vec128<int32_t, N>(vshl_s32(v.raw, vneg_s32(bits.raw))); |
| } |
| HWY_INLINE Vec128<int64_t, 1> operator<<(const Vec128<int64_t, 1> v, |
| const Vec128<int64_t, 1> bits) { |
| return Vec128<int64_t, 1>(vshl_s64(v.raw, bits.raw)); |
| } |
| |
| // ------------------------------ Minimum |
| |
| // Unsigned (no u64) |
| HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2) |
| |
| // Signed (no i64) |
| HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2) |
| |
| // Float |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2) |
| |
| // ------------------------------ Maximum |
| |
| // Unsigned (no u64) |
| HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2) |
| |
| // Signed (no i64) |
| HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2) |
| |
| // Float |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2) |
| |
| // ------------------------------ Integer multiplication |
| |
| // Unsigned |
| HWY_INLINE Vec128<uint16_t> operator*(const Vec128<uint16_t> a, |
| const Vec128<uint16_t> b) { |
| return Vec128<uint16_t>(vmulq_u16(a.raw, b.raw)); |
| } |
| HWY_INLINE Vec128<uint32_t> operator*(const Vec128<uint32_t> a, |
| const Vec128<uint32_t> b) { |
| return Vec128<uint32_t>(vmulq_u32(a.raw, b.raw)); |
| } |
| |
| template <size_t N, HWY_IF_LE64(uint16_t, N)> |
| HWY_INLINE Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Vec128<uint16_t, N>(vmul_u16(a.raw, b.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(uint32_t, N)> |
| HWY_INLINE Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| return Vec128<uint32_t, N>(vmul_u32(a.raw, b.raw)); |
| } |
| |
| // Signed |
| HWY_INLINE Vec128<int16_t> operator*(const Vec128<int16_t> a, |
| const Vec128<int16_t> b) { |
| return Vec128<int16_t>(vmulq_s16(a.raw, b.raw)); |
| } |
| HWY_INLINE Vec128<int32_t> operator*(const Vec128<int32_t> a, |
| const Vec128<int32_t> b) { |
| return Vec128<int32_t>(vmulq_s32(a.raw, b.raw)); |
| } |
| |
template <size_t N, HWY_IF_LE64(int16_t, N)>
| HWY_INLINE Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int16_t, N>(vmul_s16(a.raw, b.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| return Vec128<int32_t, N>(vmul_s32(a.raw, b.raw)); |
| } |
| |
| // Returns the upper 16 bits of a * b in each lane. |
| HWY_INLINE Vec128<int16_t> MulHigh(const Vec128<int16_t> a, |
| const Vec128<int16_t> b) { |
| int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw)); |
| #if defined(__aarch64__) |
| int32x4_t rhi = vmull_high_s16(a.raw, b.raw); |
| #else |
| int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw)); |
| #endif |
| return Vec128<int16_t>( |
| vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi))); |
| } |
| HWY_INLINE Vec128<uint16_t> MulHigh(const Vec128<uint16_t> a, |
| const Vec128<uint16_t> b) { |
| uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw)); |
| #if defined(__aarch64__) |
| uint32x4_t rhi = vmull_high_u16(a.raw, b.raw); |
| #else |
| uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw)); |
| #endif |
| return Vec128<uint16_t>( |
| vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi))); |
| } |
| |
| template <size_t N, HWY_IF_LE64(int16_t, N)> |
| HWY_INLINE Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw)); |
| return Vec128<int16_t, N>(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo))); |
| } |
| template <size_t N, HWY_IF_LE64(uint16_t, N)> |
| HWY_INLINE Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw)); |
| return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo))); |
| } |
| |
// Multiplies even lanes (0, 2, ..) and places the lower half of the
// double-wide result into the even lane and the upper half into its odd
// neighbor lane.
| HWY_INLINE Vec128<int64_t> MulEven(const Vec128<int32_t> a, |
| const Vec128<int32_t> b) { |
| int32x4_t a_packed = vuzp1q_s32(a.raw, a.raw); |
| int32x4_t b_packed = vuzp1q_s32(b.raw, b.raw); |
| return Vec128<int64_t>( |
| vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed))); |
| } |
| HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint32_t> a, |
| const Vec128<uint32_t> b) { |
| uint32x4_t a_packed = vuzp1q_u32(a.raw, a.raw); |
| uint32x4_t b_packed = vuzp1q_u32(b.raw, b.raw); |
| return Vec128<uint64_t>( |
| vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed))); |
| } |
| |
| template <size_t N> |
| HWY_INLINE Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| int32x2_t a_packed = vuzp1_s32(a.raw, a.raw); |
| int32x2_t b_packed = vuzp1_s32(b.raw, b.raw); |
| return Vec128<int64_t, (N + 1) / 2>( |
| vget_low_s64(vmull_s32(a_packed, b_packed))); |
| } |
| template <size_t N> |
| HWY_INLINE Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| uint32x2_t a_packed = vuzp1_u32(a.raw, a.raw); |
| uint32x2_t b_packed = vuzp1_u32(b.raw, b.raw); |
| return Vec128<uint64_t, (N + 1) / 2>( |
| vget_low_u64(vmull_u32(a_packed, b_packed))); |
| } |
| |
// ------------------------------ Negate
| |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1) |
| HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) |
| |
| HWY_INLINE Vec128<int64_t, 1> Neg(const Vec128<int64_t, 1> v) { |
| #if defined(__aarch64__) |
| return Vec128<int64_t, 1>(vneg_s64(v.raw)); |
| #else |
| return Zero(Simd<int64_t, 1>()) - v; |
| #endif |
| } |
| |
| HWY_INLINE Vec128<int64_t> Neg(const Vec128<int64_t> v) { |
| #if defined(__aarch64__) |
| return Vec128<int64_t>(vnegq_s64(v.raw)); |
| #else |
| return Zero(Full128<int64_t>()) - v; |
| #endif |
| } |
| |
| // ------------------------------ Floating-point mul / div |
| |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2) |
| |
| // Approximate reciprocal |
| HWY_INLINE Vec128<float> ApproximateReciprocal(const Vec128<float> v) { |
| return Vec128<float>(vrecpeq_f32(v.raw)); |
| } |
| template <size_t N> |
| HWY_INLINE Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) { |
| return Vec128<float, N>(vrecpe_f32(v.raw)); |
| } |
| |
| #if defined(__aarch64__) |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2) |
| #else |
| // Emulated with approx reciprocal + Newton-Raphson + mul |
| template <size_t N> |
| HWY_INLINE Vec128<float, N> operator/(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| auto x = ApproximateReciprocal(b); |
| // Newton-Raphson on 1/x - b |
| const auto two = Set(Simd<float, N>(), 2); |
| x = x * (two - b * x); |
| x = x * (two - b * x); |
| x = x * (two - b * x); |
| return a * x; |
| } |
| #endif |
| |
| // Absolute value of difference. |
| HWY_INLINE Vec128<float> AbsDiff(const Vec128<float> a, const Vec128<float> b) { |
| return Vec128<float>(vabdq_f32(a.raw, b.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(float, N)> |
| HWY_INLINE Vec128<float, N> AbsDiff(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| return Vec128<float, N>(vabd_f32(a.raw, b.raw)); |
| } |
| |
| // ------------------------------ Floating-point multiply-add variants |
| |
| // Returns add + mul * x |
| #if defined(__aarch64__) |
| template <size_t N, HWY_IF_LE64(float, N)> |
| HWY_INLINE Vec128<float, N> MulAdd(const Vec128<float, N> mul, |
| const Vec128<float, N> x, |
| const Vec128<float, N> add) { |
| return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw)); |
| } |
| HWY_INLINE Vec128<float> MulAdd(const Vec128<float> mul, const Vec128<float> x, |
| const Vec128<float> add) { |
| return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw)); |
| } |
| HWY_INLINE Vec128<double, 1> MulAdd(const Vec128<double, 1> mul, |
| const Vec128<double, 1> x, |
| const Vec128<double, 1> add) { |
| return Vec128<double, 1>(vfma_f64(add.raw, mul.raw, x.raw)); |
| } |
| HWY_INLINE Vec128<double> MulAdd(const Vec128<double> mul, |
| const Vec128<double> x, |
| const Vec128<double> add) { |
| return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw)); |
| } |
| #else |
| // Emulate FMA for floats. |
| template <size_t N> |
| HWY_INLINE Vec128<float, N> MulAdd(const Vec128<float, N> mul, |
| const Vec128<float, N> x, |
| const Vec128<float, N> add) { |
| return mul * x + add; |
| } |
| #endif |
| |
| // Returns add - mul * x |
| #if defined(__aarch64__) |
| template <size_t N, HWY_IF_LE64(float, N)> |
| HWY_INLINE Vec128<float, N> NegMulAdd(const Vec128<float, N> mul, |
| const Vec128<float, N> x, |
| const Vec128<float, N> add) { |
| return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw)); |
| } |
| HWY_INLINE Vec128<float> NegMulAdd(const Vec128<float> mul, |
| const Vec128<float> x, |
| const Vec128<float> add) { |
| return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw)); |
| } |
| |
| HWY_INLINE Vec128<double, 1> NegMulAdd(const Vec128<double, 1> mul, |
| const Vec128<double, 1> x, |
| const Vec128<double, 1> add) { |
| return Vec128<double, 1>(vfms_f64(add.raw, mul.raw, x.raw)); |
| } |
| HWY_INLINE Vec128<double> NegMulAdd(const Vec128<double> mul, |
| const Vec128<double> x, |
| const Vec128<double> add) { |
| return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw)); |
| } |
| #else |
| // Emulate FMA for floats. |
| template <size_t N> |
| HWY_INLINE Vec128<float, N> NegMulAdd(const Vec128<float, N> mul, |
| const Vec128<float, N> x, |
| const Vec128<float, N> add) { |
| return add - mul * x; |
| } |
| #endif |
| |
| // Returns mul * x - sub |
| template <size_t N> |
| HWY_INLINE Vec128<float, N> MulSub(const Vec128<float, N> mul, |
| const Vec128<float, N> x, |
| const Vec128<float, N> sub) { |
| return MulAdd(mul, x, Neg(sub)); |
| } |
| template <size_t N> |
| HWY_INLINE Vec128<double, N> MulSub(const Vec128<double, N> mul, |
| const Vec128<double, N> x, |
| const Vec128<double, N> sub) { |
| return MulAdd(mul, x, Neg(sub)); |
| } |
| |
| // Returns -mul * x - sub |
| template <size_t N> |
| HWY_INLINE Vec128<float, N> NegMulSub(const Vec128<float, N> mul, |
| const Vec128<float, N> x, |
| const Vec128<float, N> sub) { |
| return Neg(MulAdd(mul, x, sub)); |
| } |
| template <size_t N> |
| HWY_INLINE Vec128<double, N> NegMulSub(const Vec128<double, N> mul, |
| const Vec128<double, N> x, |
| const Vec128<double, N> sub) { |
| return Neg(MulAdd(mul, x, sub)); |
| } |
| |
| // ------------------------------ Floating-point square root |
| |
| // Approximate reciprocal square root |
| HWY_INLINE Vec128<float> ApproximateReciprocalSqrt(const Vec128<float> v) { |
| return Vec128<float>(vrsqrteq_f32(v.raw)); |
| } |
| template <size_t N> |
| HWY_INLINE Vec128<float, N> ApproximateReciprocalSqrt( |
| const Vec128<float, N> v) { |
| return Vec128<float, N>(vrsqrte_f32(v.raw)); |
| } |
| |
| // Full precision square root |
| #if defined(__aarch64__) |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1) |
| #else |
| // Not defined on armv7: emulate with approx reciprocal sqrt + Goldschmidt. |
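// The Goldschmidt iteration below drives b toward 1 and x toward sqrt(v) by
// repeatedly multiplying by the correction factor Y = 1.5 - 0.5 * b. The final
// IfThenZeroElse handles v == 0, whose approximate reciprocal sqrt is infinite.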
| template <size_t N> |
| HWY_INLINE Vec128<float, N> Sqrt(const Vec128<float, N> v) { |
| auto b = v; |
| auto Y = ApproximateReciprocalSqrt(v); |
| auto x = v * Y; |
| const auto half = Set(Simd<float, N>(), 0.5); |
| const auto oneandhalf = Set(Simd<float, N>(), 1.5); |
| for (size_t i = 0; i < 3; i++) { |
| b = b * Y * Y; |
| Y = oneandhalf - half * b; |
| x = x * Y; |
| } |
| return IfThenZeroElse(v == Zero(Simd<float, N>()), x); |
| } |
| #endif |
| |
| // ------------------------------ Floating-point rounding |
| |
| #if defined(__aarch64__) |
| // Toward nearest integer |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1) |
| |
| // Toward zero, aka truncate |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1) |
| |
| // Toward +infinity, aka ceiling |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1) |
| |
| // Toward -infinity, aka floor |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1) |
| #else |
| |
| template <size_t N> |
| HWY_INLINE Vec128<float, N> Trunc(const Vec128<float, N> v) { |
| const Simd<uint32_t, N> du; |
| const Simd<int32_t, N> di; |
| const Simd<float, N> df; |
| const auto v_bits = BitCast(du, v); |
| const auto biased_exp = ShiftRight<23>(v_bits) & Set(du, 0xFF); |
| const auto bits_to_remove = |
| Set(du, 150) - Max(Min(biased_exp, Set(du, 150)), Set(du, 127)); |
| const auto mask = (Set(du, 1) << bits_to_remove) - Set(du, 1); |
| return BitCast(df, IfThenZeroElse(BitCast(di, biased_exp) < Set(di, 127), |
| BitCast(di, AndNot(mask, v_bits)))); |
| } |
| |
| // WARNING: does not quite have the same semantics as what NEON does on |
| // aarch64. In particular, does not break ties to even. |
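// For example (illustrative): this Round(2.5f) returns 3.0f, whereas the
// AArch64 vrndnq_f32 intrinsic would return 2.0f (ties to even).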
| template <size_t N> |
| HWY_INLINE Vec128<float, N> Round(const Vec128<float, N> v) { |
| const Simd<uint32_t, N> du; |
| const Simd<float, N> df; |
| const auto sign_mask = BitCast(df, Set(du, 0x80000000u)); |
| // move 0.5f away from 0 and call truncate. |
| return Trunc(v + ((v & sign_mask) | Set(df, 0.5f))); |
| } |
| |
| template <size_t N> |
| HWY_INLINE Vec128<float, N> Ceil(const Vec128<float, N> v) { |
| const Simd<uint32_t, N> du; |
| const Simd<int32_t, N> di; |
| const Simd<float, N> df; |
| const auto sign_mask = Set(du, 0x80000000u); |
| const auto v_bits = BitCast(du, v); |
| const auto biased_exp = ShiftRight<23>(v_bits) & Set(du, 0xFF); |
| const auto bits_to_remove = |
| Set(du, 150) - Max(Min(biased_exp, Set(du, 150)), Set(du, 127)); |
| const auto high_bit = Set(du, 1) << bits_to_remove; |
| const auto mask = high_bit - Set(du, 1); |
| const auto removed_bits = mask & v_bits; |
| // number is positive and at least one bit was set in the mantissa |
| const auto should_round_up = MaskFromVec( |
| BitCast(df, AndNot(VecFromMask(removed_bits == Zero(du)), |
| VecFromMask(Zero(du) == (v_bits & sign_mask))))); |
| const auto add_one = IfThenElseZero(should_round_up, Set(df, 1.0f)); |
| const auto rounded = |
| BitCast(df, IfThenZeroElse(BitCast(di, biased_exp) < Set(di, 127), |
| BitCast(di, AndNot(mask, v_bits)))); |
| return rounded + add_one; |
| } |
| |
| template <size_t N> |
| HWY_INLINE Vec128<float, N> Floor(const Vec128<float, N> v) { |
| const Simd<float, N> df; |
| const auto zero = Zero(df); |
| return zero - Ceil(zero - v); |
| } |
| |
| #endif |
| |
| // ================================================== COMPARE |
| |
| #define HWY_NEON_BUILD_TPL_HWY_COMPARE |
| #define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type, size> |
| #define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ |
| const Vec128<type, size> a, const Vec128<type, size> b |
| #define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw |
| |
| // Comparisons fill a lane with 1-bits if the condition is true, else 0. |
| |
| // ------------------------------ Equality |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) |
| #if defined(__aarch64__) |
| HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE); |
| #else |
| // No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. |
| HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) |
| HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) |
| #endif |
| |
| // ------------------------------ Strict inequality |
| |
| // Signed/float < (no unsigned) |
| #if defined(__aarch64__) |
| HWY_NEON_DEF_FUNCTION_INTS(operator<, vclt, _, HWY_COMPARE) |
| #else |
| HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) |
| #endif |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) |
| |
| // Signed/float > (no unsigned) |
| #if defined(__aarch64__) |
| HWY_NEON_DEF_FUNCTION_INTS(operator>, vcgt, _, HWY_COMPARE) |
| #else |
| HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator>, vcgt, _, HWY_COMPARE) |
| #endif |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>, vcgt, _, HWY_COMPARE) |
| |
| // ------------------------------ Weak inequality |
| |
| // Float <= >= |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) |
| HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator>=, vcge, _, HWY_COMPARE) |
| |
| #undef HWY_NEON_BUILD_TPL_HWY_COMPARE |
| #undef HWY_NEON_BUILD_RET_HWY_COMPARE |
| #undef HWY_NEON_BUILD_PARAM_HWY_COMPARE |
| #undef HWY_NEON_BUILD_ARG_HWY_COMPARE |
| |
| // ================================================== LOGICAL |
| |
| // ------------------------------ Bitwise AND |
| HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2) |
// These And() overloads for float and double rely on the uint32_t and uint64_t
// And just defined by the HWY_NEON_DEF_FUNCTION_INTS_UINTS() macro.
| template <size_t N> |
| HWY_INLINE Vec128<float, N> And(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| const Simd<uint32_t, N> d; |
| return BitCast(Simd<float, N>(), BitCast(d, a) & BitCast(d, b)); |
| } |
| template <size_t N> |
| HWY_INLINE Vec128<double, N> And(const Vec128<double, N> a, |
| const Vec128<double, N> b) { |
| const Simd<uint64_t, N> d; |
| return BitCast(Simd<double, N>(), BitCast(d, a) & BitCast(d, b)); |
| } |
| |
| // ------------------------------ Bitwise AND-NOT |
| |
| namespace internal { |
| // reversed_andnot returns a & ~b. |
| HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2) |
| } // namespace internal |
| |
| // Returns ~not_mask & mask. |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> AndNot(const Vec128<T, N> not_mask, |
| const Vec128<T, N> mask) { |
| return internal::reversed_andnot(mask, not_mask); |
| } |
| |
// These AndNot() overloads for float and double rely on the uint32_t and
// uint64_t reversed_andnot just defined by the
// HWY_NEON_DEF_FUNCTION_INTS_UINTS() macro.
| |
| template <size_t N> |
| HWY_INLINE Vec128<float, N> AndNot(const Vec128<float, N> not_mask, |
| const Vec128<float, N> mask) { |
| const Simd<uint32_t, N> du; |
| Vec128<uint32_t, N> ret = |
| internal::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask)); |
| return BitCast(Simd<float, N>(), ret); |
| } |
| |
| #if defined(__aarch64__) |
| template <size_t N> |
| HWY_INLINE Vec128<double, N> AndNot(const Vec128<double, N> not_mask, |
| const Vec128<double, N> mask) { |
| const Simd<uint64_t, N> du; |
| Vec128<uint64_t, N> ret = |
| internal::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask)); |
| return BitCast(Simd<double, N>(), ret); |
| } |
| #endif |
| |
| // ------------------------------ Bitwise OR |
| |
| HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2) |
| |
// These Or() overloads for float and double rely on the uint32_t and uint64_t
// Or just defined by the HWY_NEON_DEF_FUNCTION_INTS_UINTS() macro.
| template <size_t N> |
| HWY_INLINE Vec128<float, N> Or(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| const Simd<uint32_t, N> d; |
| return BitCast(Simd<float, N>(), BitCast(d, a) | BitCast(d, b)); |
| } |
| template <size_t N> |
| HWY_INLINE Vec128<double, N> Or(const Vec128<double, N> a, |
| const Vec128<double, N> b) { |
| const Simd<uint64_t, N> d; |
| return BitCast(Simd<double, N>(), BitCast(d, a) | BitCast(d, b)); |
| } |
| |
| // ------------------------------ Bitwise XOR |
| |
| HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2) |
| |
// These Xor() overloads for float and double rely on the uint32_t and uint64_t
// Xor just defined by the HWY_NEON_DEF_FUNCTION_INTS_UINTS() macro.
| template <size_t N> |
| HWY_INLINE Vec128<float, N> Xor(const Vec128<float, N> a, |
| const Vec128<float, N> b) { |
| const Simd<uint32_t, N> d; |
| return BitCast(Simd<float, N>(), BitCast(d, a) ^ BitCast(d, b)); |
| } |
| template <size_t N> |
| HWY_INLINE Vec128<double, N> Xor(const Vec128<double, N> a, |
| const Vec128<double, N> b) { |
| const Simd<uint64_t, N> d; |
| return BitCast(Simd<double, N>(), BitCast(d, a) ^ BitCast(d, b)); |
| } |
| |
| // ------------------------------ Operator overloads (internal-only if float) |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return And(a, b); |
| } |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Or(a, b); |
| } |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) { |
| return Xor(a, b); |
| } |
| |
| // ------------------------------ CopySign |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn, |
| const Vec128<T, N> sign) { |
| static_assert(IsFloat<T>(), "Only makes sense for floating-point"); |
| const auto msb = SignBit(Simd<T, N>()); |
| return Or(AndNot(msb, magn), And(msb, sign)); |
| } |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs, |
| const Vec128<T, N> sign) { |
| static_assert(IsFloat<T>(), "Only makes sense for floating-point"); |
| return Or(abs, And(SignBit(Simd<T, N>()), sign)); |
| } |
| |
| // ------------------------------ Make mask |
| |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) { |
| static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported"); |
| return (v & bit) == bit; |
| } |
| |
| // Mask and Vec are the same (true = FF..FF). |
| template <typename T, size_t N> |
| HWY_INLINE Mask128<T, N> MaskFromVec(const Vec128<T, N> v) { |
| return Mask128<T, N>(v.raw); |
| } |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> VecFromMask(const Mask128<T, N> v) { |
| return Vec128<T, N>(v.raw); |
| } |
| |
// IfThenElse(mask, yes, no)
// Returns mask ? yes : no.
| #define HWY_NEON_BUILD_TPL_HWY_IF |
| #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type, size> |
| #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \ |
| const Mask128<type, size> mask, const Vec128<type, size> yes, \ |
| const Vec128<type, size> no |
| #define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw |
| |
| HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF) |
| |
| #undef HWY_NEON_BUILD_TPL_HWY_IF |
| #undef HWY_NEON_BUILD_RET_HWY_IF |
| #undef HWY_NEON_BUILD_PARAM_HWY_IF |
| #undef HWY_NEON_BUILD_ARG_HWY_IF |
| |
| // mask ? yes : 0 |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> IfThenElseZero(const Mask128<T, N> mask, |
| const Vec128<T, N> yes) { |
| return yes & VecFromMask(mask); |
| } |
| |
| // mask ? 0 : no |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> IfThenZeroElse(const Mask128<T, N> mask, |
| const Vec128<T, N> no) { |
| return AndNot(VecFromMask(mask), no); |
| } |
| |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) { |
| const auto zero = Zero(Simd<T, N>()); |
| return Max(zero, v); |
| } |
| |
| // ================================================== MEMORY |
| |
| // ------------------------------ Load 128 |
| |
| HWY_INLINE Vec128<uint8_t> LoadU(Full128<uint8_t> /* tag */, |
| const uint8_t* HWY_RESTRICT aligned) { |
| return Vec128<uint8_t>(vld1q_u8(aligned)); |
| } |
| HWY_INLINE Vec128<uint16_t> LoadU(Full128<uint16_t> /* tag */, |
| const uint16_t* HWY_RESTRICT aligned) { |
| return Vec128<uint16_t>(vld1q_u16(aligned)); |
| } |
| HWY_INLINE Vec128<uint32_t> LoadU(Full128<uint32_t> /* tag */, |
| const uint32_t* HWY_RESTRICT aligned) { |
| return Vec128<uint32_t>(vld1q_u32(aligned)); |
| } |
| HWY_INLINE Vec128<uint64_t> LoadU(Full128<uint64_t> /* tag */, |
| const uint64_t* HWY_RESTRICT aligned) { |
| return Vec128<uint64_t>(vld1q_u64(aligned)); |
| } |
| HWY_INLINE Vec128<int8_t> LoadU(Full128<int8_t> /* tag */, |
| const int8_t* HWY_RESTRICT aligned) { |
| return Vec128<int8_t>(vld1q_s8(aligned)); |
| } |
| HWY_INLINE Vec128<int16_t> LoadU(Full128<int16_t> /* tag */, |
| const int16_t* HWY_RESTRICT aligned) { |
| return Vec128<int16_t>(vld1q_s16(aligned)); |
| } |
| HWY_INLINE Vec128<int32_t> LoadU(Full128<int32_t> /* tag */, |
| const int32_t* HWY_RESTRICT aligned) { |
| return Vec128<int32_t>(vld1q_s32(aligned)); |
| } |
| HWY_INLINE Vec128<int64_t> LoadU(Full128<int64_t> /* tag */, |
| const int64_t* HWY_RESTRICT aligned) { |
| return Vec128<int64_t>(vld1q_s64(aligned)); |
| } |
| HWY_INLINE Vec128<float> LoadU(Full128<float> /* tag */, |
| const float* HWY_RESTRICT aligned) { |
| return Vec128<float>(vld1q_f32(aligned)); |
| } |
| #if defined(__aarch64__) |
| HWY_INLINE Vec128<double> LoadU(Full128<double> /* tag */, |
| const double* HWY_RESTRICT aligned) { |
| return Vec128<double>(vld1q_f64(aligned)); |
| } |
| #endif |
| |
| // ------------------------------ Load 64 |
| |
| HWY_INLINE Vec128<uint8_t, 8> LoadU(Simd<uint8_t, 8> /* tag */, |
| const uint8_t* HWY_RESTRICT p) { |
| return Vec128<uint8_t, 8>(vld1_u8(p)); |
| } |
| HWY_INLINE Vec128<uint16_t, 4> LoadU(Simd<uint16_t, 4> /* tag */, |
| const uint16_t* HWY_RESTRICT p) { |
| return Vec128<uint16_t, 4>(vld1_u16(p)); |
| } |
| HWY_INLINE Vec128<uint32_t, 2> LoadU(Simd<uint32_t, 2> /* tag */, |
| const uint32_t* HWY_RESTRICT p) { |
| return Vec128<uint32_t, 2>(vld1_u32(p)); |
| } |
| HWY_INLINE Vec128<uint64_t, 1> LoadU(Simd<uint64_t, 1> /* tag */, |
| const uint64_t* HWY_RESTRICT p) { |
| return Vec128<uint64_t, 1>(vld1_u64(p)); |
| } |
| HWY_INLINE Vec128<int8_t, 8> LoadU(Simd<int8_t, 8> /* tag */, |
| const int8_t* HWY_RESTRICT p) { |
| return Vec128<int8_t, 8>(vld1_s8(p)); |
| } |
| HWY_INLINE Vec128<int16_t, 4> LoadU(Simd<int16_t, 4> /* tag */, |
| const int16_t* HWY_RESTRICT p) { |
| return Vec128<int16_t, 4>(vld1_s16(p)); |
| } |
| HWY_INLINE Vec128<int32_t, 2> LoadU(Simd<int32_t, 2> /* tag */, |
| const int32_t* HWY_RESTRICT p) { |
| return Vec128<int32_t, 2>(vld1_s32(p)); |
| } |
| HWY_INLINE Vec128<int64_t, 1> LoadU(Simd<int64_t, 1> /* tag */, |
| const int64_t* HWY_RESTRICT p) { |
| return Vec128<int64_t, 1>(vld1_s64(p)); |
| } |
| HWY_INLINE Vec128<float, 2> LoadU(Simd<float, 2> /* tag */, |
| const float* HWY_RESTRICT p) { |
| return Vec128<float, 2>(vld1_f32(p)); |
| } |
| #if defined(__aarch64__) |
| HWY_INLINE Vec128<double, 1> LoadU(Simd<double, 1> /* tag */, |
| const double* HWY_RESTRICT p) { |
| return Vec128<double, 1>(vld1_f64(p)); |
| } |
| #endif |
| |
| // ------------------------------ Load 32 |
| |
// In the following load functions, |a| is purposely undefined.
// It is a required parameter to the intrinsic, but we do not care what it
// contains and do not want the overhead of initializing it.
| |
| HWY_INLINE Vec128<uint8_t, 4> LoadU(Simd<uint8_t, 4> d, |
| const uint8_t* HWY_RESTRICT p) { |
| uint32x2_t a = Undefined(d).raw; |
| uint32x2_t b = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), a, 0); |
| return Vec128<uint8_t, 4>(vreinterpret_u8_u32(b)); |
| } |
| HWY_INLINE Vec128<uint16_t, 2> LoadU(Simd<uint16_t, 2> d, |
| const uint16_t* HWY_RESTRICT p) { |
| uint32x2_t a = Undefined(d).raw; |
| uint32x2_t b = vld1_lane_u32(reinterpret_cast<const uint32_t*>(p), a, 0); |
| return Vec128<uint16_t, 2>(vreinterpret_u16_u32(b)); |
| } |
| HWY_INLINE Vec128<uint32_t, 1> LoadU(Simd<uint32_t, 1> d, |
| const uint32_t* HWY_RESTRICT p) { |
| uint32x2_t a = Undefined(d).raw; |
| uint32x2_t b = vld1_lane_u32(p, a, 0); |
| return Vec128<uint32_t, 1>(b); |
| } |
| HWY_INLINE Vec128<int8_t, 4> LoadU(Simd<int8_t, 4> d, |
| const int8_t* HWY_RESTRICT p) { |
| int32x2_t a = Undefined(d).raw; |
| int32x2_t b = vld1_lane_s32(reinterpret_cast<const int32_t*>(p), a, 0); |
| return Vec128<int8_t, 4>(vreinterpret_s8_s32(b)); |
| } |
| HWY_INLINE Vec128<int16_t, 2> LoadU(Simd<int16_t, 2> d, |
| const int16_t* HWY_RESTRICT p) { |
| int32x2_t a = Undefined(d).raw; |
| int32x2_t b = vld1_lane_s32(reinterpret_cast<const int32_t*>(p), a, 0); |
| return Vec128<int16_t, 2>(vreinterpret_s16_s32(b)); |
| } |
| HWY_INLINE Vec128<int32_t, 1> LoadU(Simd<int32_t, 1> d, |
| const int32_t* HWY_RESTRICT p) { |
| int32x2_t a = Undefined(d).raw; |
| int32x2_t b = vld1_lane_s32(p, a, 0); |
| return Vec128<int32_t, 1>(b); |
| } |
| HWY_INLINE Vec128<float, 1> LoadU(Simd<float, 1> d, |
| const float* HWY_RESTRICT p) { |
| float32x2_t a = Undefined(d).raw; |
| float32x2_t b = vld1_lane_f32(p, a, 0); |
| return Vec128<float, 1>(b); |
| } |
| |
| // ------------------------------ Load 16 |
| |
| HWY_INLINE Vec128<uint8_t, 2> LoadU(Simd<uint8_t, 2> d, |
| const uint8_t* HWY_RESTRICT p) { |
| uint16x4_t a = Undefined(d).raw; |
| uint16x4_t b = vld1_lane_u16(reinterpret_cast<const uint16_t*>(p), a, 0); |
| return Vec128<uint8_t, 2>(vreinterpret_u8_u16(b)); |
| } |
| HWY_INLINE Vec128<uint16_t, 1> LoadU(Simd<uint16_t, 1> d, |
| const uint16_t* HWY_RESTRICT p) { |
| uint16x4_t a = Undefined(d).raw; |
| uint16x4_t b = vld1_lane_u16(p, a, 0); |
| return Vec128<uint16_t, 1>(b); |
| } |
| |
| HWY_INLINE Vec128<int8_t, 2> LoadU(Simd<int8_t, 2> d, |
| const int8_t* HWY_RESTRICT p) { |
| int16x4_t a = Undefined(d).raw; |
| int16x4_t b = vld1_lane_s16(reinterpret_cast<const int16_t*>(p), a, 0); |
| return Vec128<int8_t, 2>(vreinterpret_s8_s16(b)); |
| } |
| HWY_INLINE Vec128<int16_t, 1> LoadU(Simd<int16_t, 1> d, |
| const int16_t* HWY_RESTRICT p) { |
| int16x4_t a = Undefined(d).raw; |
| int16x4_t b = vld1_lane_s16(p, a, 0); |
| return Vec128<int16_t, 1>(b); |
| } |
| |
| // ------------------------------ Load 8 |
| |
| HWY_INLINE Vec128<uint8_t, 1> LoadU(Simd<uint8_t, 1> d, |
| const uint8_t* HWY_RESTRICT p) { |
| uint8x8_t a = Undefined(d).raw; |
| uint8x8_t b = vld1_lane_u8(p, a, 0); |
| return Vec128<uint8_t, 1>(b); |
| } |
| |
| HWY_INLINE Vec128<int8_t, 1> LoadU(Simd<int8_t, 1> d, |
| const int8_t* HWY_RESTRICT p) { |
| int8x8_t a = Undefined(d).raw; |
| int8x8_t b = vld1_lane_s8(p, a, 0); |
| return Vec128<int8_t, 1>(b); |
| } |
| |
| // On ARM, Load is the same as LoadU. |
| template <typename T, size_t N> |
| HWY_INLINE Vec128<T, N> Load(Simd<T, N> d, const T* HWY_RESTRICT p) { |
| return LoadU(d, p); |
| } |
| |
| // 128-bit SIMD => nothing to duplicate, same as an unaligned load. |
| template <typename T, size_t N, HWY_IF_LE128(T, N)> |
| HWY_INLINE Vec128<T, N> LoadDup128(Simd<T, N> d, |
| const T* const HWY_RESTRICT p) { |
| return LoadU(d, p); |
| } |
| |
| // ------------------------------ Store 128 |
| |
| HWY_INLINE void StoreU(const Vec128<uint8_t> v, Full128<uint8_t> /* tag */, |
| uint8_t* HWY_RESTRICT aligned) { |
| vst1q_u8(aligned, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<uint16_t> v, Full128<uint16_t> /* tag */, |
| uint16_t* HWY_RESTRICT aligned) { |
| vst1q_u16(aligned, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<uint32_t> v, Full128<uint32_t> /* tag */, |
| uint32_t* HWY_RESTRICT aligned) { |
| vst1q_u32(aligned, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<uint64_t> v, Full128<uint64_t> /* tag */, |
| uint64_t* HWY_RESTRICT aligned) { |
| vst1q_u64(aligned, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<int8_t> v, Full128<int8_t> /* tag */, |
| int8_t* HWY_RESTRICT aligned) { |
| vst1q_s8(aligned, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<int16_t> v, Full128<int16_t> /* tag */, |
| int16_t* HWY_RESTRICT aligned) { |
| vst1q_s16(aligned, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<int32_t> v, Full128<int32_t> /* tag */, |
| int32_t* HWY_RESTRICT aligned) { |
| vst1q_s32(aligned, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<int64_t> v, Full128<int64_t> /* tag */, |
| int64_t* HWY_RESTRICT aligned) { |
| vst1q_s64(aligned, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<float> v, Full128<float> /* tag */, |
| float* HWY_RESTRICT aligned) { |
| vst1q_f32(aligned, v.raw); |
| } |
| #if defined(__aarch64__) |
| HWY_INLINE void StoreU(const Vec128<double> v, Full128<double> /* tag */, |
| double* HWY_RESTRICT aligned) { |
| vst1q_f64(aligned, v.raw); |
| } |
| #endif |
| |
| // ------------------------------ Store 64 |
| |
| HWY_INLINE void StoreU(const Vec128<uint8_t, 8> v, Simd<uint8_t, 8> /* tag */, |
| uint8_t* HWY_RESTRICT p) { |
| vst1_u8(p, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<uint16_t, 4> v, Simd<uint16_t, 4> /* tag */, |
| uint16_t* HWY_RESTRICT p) { |
| vst1_u16(p, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<uint32_t, 2> v, Simd<uint32_t, 2> /* tag */, |
| uint32_t* HWY_RESTRICT p) { |
| vst1_u32(p, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<uint64_t, 1> v, Simd<uint64_t, 1> /* tag */, |
| uint64_t* HWY_RESTRICT p) { |
| vst1_u64(p, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<int8_t, 8> v, Simd<int8_t, 8> /* tag */, |
| int8_t* HWY_RESTRICT p) { |
| vst1_s8(p, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<int16_t, 4> v, Simd<int16_t, 4> /* tag */, |
| int16_t* HWY_RESTRICT p) { |
| vst1_s16(p, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<int32_t, 2> v, Simd<int32_t, 2> /* tag */, |
| int32_t* HWY_RESTRICT p) { |
| vst1_s32(p, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<int64_t, 1> v, Simd<int64_t, 1> /* tag */, |
| int64_t* HWY_RESTRICT p) { |
| vst1_s64(p, v.raw); |
| } |
| HWY_INLINE void StoreU(const Vec128<float, 2> v, Simd<float, 2> /* tag */, |
| float* HWY_RESTRICT p) { |
| vst1_f32(p, v.raw); |
| } |
| #if defined(__aarch64__) |
| HWY_INLINE void StoreU(const Vec128<double, 1> v, Simd<double, 1> /* tag */, |
| double* HWY_RESTRICT p) { |
| vst1_f64(p, v.raw); |
| } |
| #endif |
| |
| // ------------------------------ Store 32 |
| |
| HWY_INLINE void StoreU(const Vec128<uint8_t, 4> v, Simd<uint8_t, 4>, |
| uint8_t* HWY_RESTRICT p) { |
| uint32x2_t a = vreinterpret_u32_u8(v.raw); |
| vst1_lane_u32(p, a, 0); |
| } |
| HWY_INLINE void StoreU(const Vec128<uint16_t, 2> v, Simd<uint16_t, 2>, |
| uint16_t* HWY_RESTRICT p) { |
| uint32x2_t a = vreinterpret_u32_u16(v.raw); |
| vst1_lane_u32(p, a, 0); |
| } |
| HWY_INLINE void StoreU(const Vec128<uint32_t, 1> v, Simd<uint32_t, 1>, |
| uint32_t* HWY_RESTRICT p) { |
| vst1_lane_u32(p, v.raw, 0); |
| } |
| HWY_INLINE void StoreU(const Vec128<int8_t, 4> v, Simd<int8_t, 4>, |
| int8_t* HWY_RESTRICT p) { |
| int32x2_t a = vreinterpret_s32_s8(v.raw); |
| vst1_lane_s32(p, a, 0); |
| } |
| HWY_INLINE void StoreU(const Vec128<int16_t, 2> v, Simd<int16_t, 2>, |
| int16_t* HWY_RESTRICT p) { |
| int32x2_t a = vreinterpret_s32_s16(v.raw); |
| vst1_lane_s32(p, a, 0); |
| } |
| HWY_INLINE void StoreU(const Vec128<int32_t, 1> v, Simd<int32_t, 1>, |
| int32_t* HWY_RESTRICT p) { |
| vst1_lane_s32(p, v.raw, 0); |
| } |
| HWY_INLINE void StoreU(const Vec128<float, 1> v, Simd<float, 1>, |
| float* HWY_RESTRICT p) { |
| vst1_lane_f32(p, v.raw, 0); |
| } |
| |
| // ------------------------------ Store 16 |
| |
| HWY_INLINE void StoreU(const Vec128<uint8_t, 2> v, Simd<uint8_t, 2>, |
| uint8_t* HWY_RESTRICT p) { |
| uint16x4_t a = vreinterpret_u16_u8(v.raw); |
| vst1_lane_u16(p, a, 0); |
| } |
| HWY_INLINE void StoreU(const Vec128<uint16_t, 1> v, Simd<uint16_t, 1>, |
| uint16_t* HWY_RESTRICT p) { |
| vst1_lane_u16(p, v.raw, 0); |
| } |
| HWY_INLINE void StoreU(const Vec128<int8_t, 2> v, Simd<int8_t, 2>, |
| int8_t* HWY_RESTRICT p) { |
| int16x4_t a = vreinterpret_s16_s8(v.raw); |
| vst1_lane_s16(p, a, 0); |
| } |
| HWY_INLINE void StoreU(const Vec128<int16_t, 1> v, Simd<int16_t, 1>, |
| int16_t* HWY_RESTRICT p) { |
| vst1_lane_s16(p, v.raw, 0); |
| } |
| |
| // ------------------------------ Store 8 |
| |
| HWY_INLINE void StoreU(const Vec128<uint8_t, 1> v, Simd<uint8_t, 1>, |
| uint8_t* HWY_RESTRICT p) { |
| vst1_lane_u8(p, v.raw, 0); |
| } |
| HWY_INLINE void StoreU(const Vec128<int8_t, 1> v, Simd<int8_t, 1>, |
| int8_t* HWY_RESTRICT p) { |
| vst1_lane_s8(p, v.raw, 0); |
| } |
| |
| // On ARM, Store is the same as StoreU. |
| template <typename T, size_t N> |
| HWY_INLINE void Store(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT p) { |
| StoreU(v, d, p); |
| } |
| |
| // ------------------------------ Non-temporal stores |
| |
| // Same as aligned stores on non-x86. |
| |
| template <typename T, size_t N> |
| HWY_INLINE void Stream(const Vec128<T, N> v, Simd<T, N> d, |
| T* HWY_RESTRICT aligned) { |
| Store(v, d, aligned); |
| } |
| |
| // ================================================== CONVERT |
| |
| // ------------------------------ Promotions (part w/ narrow lanes -> full) |
| |
| // Unsigned: zero-extend to full vector. |
| HWY_INLINE Vec128<uint16_t> PromoteTo(Full128<uint16_t> /* tag */, |
| const Vec128<uint8_t, 8> v) { |
| return Vec128<uint16_t>(vmovl_u8(v.raw)); |
| } |
| HWY_INLINE Vec128<uint32_t> PromoteTo(Full128<uint32_t> /* tag */, |
| const Vec128<uint8_t, 4> v) { |
| uint16x8_t a = vmovl_u8(v.raw); |
| return Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))); |
| } |
| HWY_INLINE Vec128<uint32_t> PromoteTo(Full128<uint32_t> /* tag */, |
| const Vec128<uint16_t, 4> v) { |
| return Vec128<uint32_t>(vmovl_u16(v.raw)); |
| } |
| HWY_INLINE Vec128<uint64_t> PromoteTo(Full128<uint64_t> /* tag */, |
| const Vec128<uint32_t, 2> v) { |
| return Vec128<uint64_t>(vmovl_u32(v.raw)); |
| } |
| HWY_INLINE Vec128<int16_t> PromoteTo(Full128<int16_t> /* tag */, |
| const Vec128<uint8_t, 8> v) { |
  return Vec128<int16_t>(vreinterpretq_s16_u16(vmovl_u8(v.raw)));
| } |
| HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */, |
| const Vec128<uint8_t, 4> v) { |
| uint16x8_t a = vmovl_u8(v.raw); |
  return Vec128<int32_t>(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(a))));
| } |
| HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */, |
| const Vec128<uint16_t, 4> v) { |
  return Vec128<int32_t>(vreinterpretq_s32_u32(vmovl_u16(v.raw)));
| } |
| |
| // Unsigned: zero-extend to half vector. |
| template <size_t N, HWY_IF_LE64(uint16_t, N)> |
| HWY_INLINE Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N> /* tag */, |
| const Vec128<uint8_t, N> v) { |
| return Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw))); |
| } |
| template <size_t N, HWY_IF_LE64(uint32_t, N)> |
| HWY_INLINE Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */, |
| const Vec128<uint8_t, N> v) { |
| uint16x8_t a = vmovl_u8(v.raw); |
| return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(vget_low_u16(a)))); |
| } |
| template <size_t N> |
| HWY_INLINE Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */, |
| const Vec128<uint16_t, N> v) { |
| return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(v.raw))); |
| } |
| template <size_t N, HWY_IF_LE64(uint64_t, N)> |
| HWY_INLINE Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N> /* tag */, |
| const Vec128<uint32_t, N> v) { |
| return Vec128<uint64_t, N>(vget_low_u64(vmovl_u32(v.raw))); |
| } |
| template <size_t N, HWY_IF_LE64(int16_t, N)> |
| HWY_INLINE Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */, |
| const Vec128<uint8_t, N> v) { |
  return Vec128<int16_t, N>(
      vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(v.raw))));
| } |
| template <size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */, |
| const Vec128<uint8_t, N> v) { |
| uint16x8_t a = vmovl_u8(v.raw); |
| uint32x4_t b = vmovl_u16(vget_low_u16(a)); |
| return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(b))); |
| } |
| template <size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */, |
| const Vec128<uint16_t, N> v) { |
| uint32x4_t a = vmovl_u16(v.raw); |
| return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(a))); |
| } |
| |
| HWY_INLINE Vec128<uint32_t> U32FromU8(const Vec128<uint8_t> v) { |
| return Vec128<uint32_t>( |
| vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v.raw))))); |
| } |
| |
| // Signed: replicate sign bit to full vector. |
| HWY_INLINE Vec128<int16_t> PromoteTo(Full128<int16_t> /* tag */, |
| const Vec128<int8_t, 8> v) { |
| return Vec128<int16_t>(vmovl_s8(v.raw)); |
| } |
| HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */, |
| const Vec128<int8_t, 4> v) { |
| int16x8_t a = vmovl_s8(v.raw); |
| return Vec128<int32_t>(vmovl_s16(vget_low_s16(a))); |
| } |
| HWY_INLINE Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */, |
| const Vec128<int16_t, 4> v) { |
| return Vec128<int32_t>(vmovl_s16(v.raw)); |
| } |
| HWY_INLINE Vec128<int64_t> PromoteTo(Full128<int64_t> /* tag */, |
| const Vec128<int32_t, 2> v) { |
| return Vec128<int64_t>(vmovl_s32(v.raw)); |
| } |
| |
| // Signed: replicate sign bit to half vector. |
| template <size_t N> |
| HWY_INLINE Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */, |
| const Vec128<int8_t, N> v) { |
| return Vec128<int16_t, N>(vget_low_s16(vmovl_s8(v.raw))); |
| } |
| template <size_t N> |
| HWY_INLINE Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */, |
| const Vec128<int8_t, N> v) { |
| int16x8_t a = vmovl_s8(v.raw); |
| int32x4_t b = vmovl_s16(vget_low_s16(a)); |
| return Vec128<int32_t, N>(vget_low_s32(b)); |
| } |
| template <size_t N> |
| HWY_INLINE Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */, |
| const Vec128<int16_t, N> v) { |
| return Vec128<int32_t, N>(vget_low_s32(vmovl_s16(v.raw))); |
| } |
| template <size_t N> |
| HWY_INLINE Vec128<int64_t, N> PromoteTo(Simd<int64_t, N> /* tag */, |
| const Vec128<int32_t, N> v) { |
| return Vec128<int64_t, N>(vget_low_s64(vmovl_s32(v.raw))); |
| } |
| |
| #if defined(__aarch64__) |
| HWY_INLINE Vec128<double> PromoteTo(Full128<double> /* tag */, |
| const Vec128<float, 2> v) { |
| return Vec128<double>(vcvt_f64_f32(v.raw)); |
| } |
| |
| HWY_INLINE Vec128<double, 1> PromoteTo(Simd<double, 1> /* tag */, |
| const Vec128<float, 1> v) { |
| return Vec128<double, 1>(vget_low_f64(vcvt_f64_f32(v.raw))); |
| } |
| |
| HWY_INLINE Vec128<double> PromoteTo(Full128<double> /* tag */, |
| const Vec128<int32_t, 2> v) { |
| const int64x2_t i64 = vmovl_s32(v.raw); |
| return Vec128<double>(vcvtq_f64_s64(i64)); |
| } |
| |
| HWY_INLINE Vec128<double, 1> PromoteTo(Simd<double, 1> /* tag */, |
| const Vec128<int32_t, 1> v) { |
| const int64x1_t i64 = vget_low_s64(vmovl_s32(v.raw)); |
| return Vec128<double, 1>(vcvt_f64_s64(i64)); |
| } |
| |
| #endif |
| |
| // ------------------------------ Demotions (full -> part w/ narrow lanes) |
| |
| // From full vector to half or quarter |
| HWY_INLINE Vec128<uint16_t, 4> DemoteTo(Simd<uint16_t, 4> /* tag */, |
| const Vec128<int32_t> v) { |
| return Vec128<uint16_t, 4>(vqmovun_s32(v.raw)); |
| } |
| HWY_INLINE Vec128<int16_t, 4> DemoteTo(Simd<int16_t, 4> /* tag */, |
| const Vec128<int32_t> v) { |
| return Vec128<int16_t, 4>(vqmovn_s32(v.raw)); |
| } |
| HWY_INLINE Vec128<uint8_t, 4> DemoteTo(Simd<uint8_t, 4> /* tag */, |
| const Vec128<int32_t> v) { |
| const uint16x4_t a = vqmovun_s32(v.raw); |
| return Vec128<uint8_t, 4>(vqmovn_u16(vcombine_u16(a, a))); |
| } |
| HWY_INLINE Vec128<uint8_t, 8> DemoteTo(Simd<uint8_t, 8> /* tag */, |
| const Vec128<int16_t> v) { |
| return Vec128<uint8_t, 8>(vqmovun_s16(v.raw)); |
| } |
| HWY_INLINE Vec128<int8_t, 4> DemoteTo(Simd<int8_t, 4> /* tag */, |
| const Vec128<int32_t> v) { |
| const int16x4_t a = vqmovn_s32(v.raw); |
| return Vec128<int8_t, 4>(vqmovn_s16(vcombine_s16(a, a))); |
| } |
| HWY_INLINE Vec128<int8_t, 8> DemoteTo(Simd<int8_t, 8> /* tag */, |
| const Vec128<int16_t> v) { |
| return Vec128<int8_t, 8>(vqmovn_s16(v.raw)); |
| } |
| |
| // From half vector to partial half |
| template <size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N> /* tag */, |
| const Vec128<int32_t, N> v) { |
| return Vec128<uint16_t, N>(vqmovun_s32(vcombine_s32(v.raw, v.raw))); |
| } |
| template <size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<int16_t, N> DemoteTo(Simd<int16_t, N> /* tag */, |
| const Vec128<int32_t, N> v) { |
| return Vec128<int16_t, N>(vqmovn_s32(vcombine_s32(v.raw, v.raw))); |
| } |
| template <size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */, |
| const Vec128<int32_t, N> v) { |
| const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw)); |
| return Vec128<uint8_t, N>(vqmovn_u16(vcombine_u16(a, a))); |
| } |
| template <size_t N, HWY_IF_LE64(int16_t, N)> |
| HWY_INLINE Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */, |
| const Vec128<int16_t, N> v) { |
| return Vec128<uint8_t, N>(vqmovun_s16(vcombine_s16(v.raw, v.raw))); |
| } |
| template <size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */, |
| const Vec128<int32_t, N> v) { |
| const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw)); |
| return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(a, a))); |
| } |
| template <size_t N, HWY_IF_LE64(int16_t, N)> |
| HWY_INLINE Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */, |
| const Vec128<int16_t, N> v) { |
| return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(v.raw, v.raw))); |
| } |
| |
| #if defined(__aarch64__) |
| HWY_INLINE Vec128<float, 2> DemoteTo(Simd<float, 2> /* tag */, |
| const Vec128<double> v) { |
| return Vec128<float, 2>(vcvt_f32_f64(v.raw)); |
| } |
| HWY_INLINE Vec128<float, 1> DemoteTo(Simd<float, 1> /* tag */, |
| const Vec128<double, 1> v) { |
| return Vec128<float, 1>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw))); |
| } |
| |
| HWY_INLINE Vec128<int32_t, 2> DemoteTo(Simd<int32_t, 2> /* tag */, |
| const Vec128<double> v) { |
| const int64x2_t i64 = vcvtq_s64_f64(v.raw); |
| return Vec128<int32_t, 2>(vqmovn_s64(i64)); |
| } |
| HWY_INLINE Vec128<int32_t, 1> DemoteTo(Simd<int32_t, 1> /* tag */, |
| const Vec128<double, 1> v) { |
| const int64x1_t i64 = vcvt_s64_f64(v.raw); |
| // There is no i64x1 -> i32x1 narrow, so expand to int64x2_t first. |
| const int64x2_t i64x2 = vcombine_s64(i64, i64); |
| return Vec128<int32_t, 1>(vqmovn_s64(i64x2)); |
| } |
| |
| #endif |
| |
| HWY_INLINE Vec128<uint8_t, 4> U8FromU32(const Vec128<uint32_t> v) { |
| const uint8x16_t org_v = cast_to_u8(v).raw; |
| const uint8x16_t w = vuzp1q_u8(org_v, org_v); |
| return Vec128<uint8_t, 4>(vget_low_u8(vuzp1q_u8(w, w))); |
| } |
| |
// In the following DemoteTo functions, |b| is purposely left undefined.
// |a| must be widened to 128 bits so that vqmovn can be used, and leaving
// |b| undefined avoids introducing extra overhead.
| HWY_DIAGNOSTICS(push) |
| HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") |
| |
| template <size_t N> |
| HWY_INLINE Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */, |
| const Vec128<int32_t> v) { |
| Vec128<uint16_t, N> a = DemoteTo(Simd<uint16_t, N>(), v); |
| Vec128<uint16_t, N> b; |
| uint16x8_t c = vcombine_u16(a.raw, b.raw); |
| return Vec128<uint8_t, N>(vqmovn_u16(c)); |
| } |
| |
| template <size_t N> |
| HWY_INLINE Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */, |
| const Vec128<int32_t> v) { |
| Vec128<int16_t, N> a = DemoteTo(Simd<int16_t, N>(), v); |
| Vec128<int16_t, N> b; |
  int16x8_t c = vcombine_s16(a.raw, b.raw);
| return Vec128<int8_t, N>(vqmovn_s16(c)); |
| } |
| |
| HWY_DIAGNOSTICS(pop) |
| |
| // ------------------------------ Convert integer <=> floating-point |
| |
| HWY_INLINE Vec128<float> ConvertTo(Full128<float> /* tag */, |
| const Vec128<int32_t> v) { |
| return Vec128<float>(vcvtq_f32_s32(v.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<float, N> ConvertTo(Simd<float, N> /* tag */, |
| const Vec128<int32_t, N> v) { |
| return Vec128<float, N>(vcvt_f32_s32(v.raw)); |
| } |
| |
| // Truncates (rounds toward zero). |
| HWY_INLINE Vec128<int32_t> ConvertTo(Full128<int32_t> /* tag */, |
| const Vec128<float> v) { |
| return Vec128<int32_t>(vcvtq_s32_f32(v.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(float, N)> |
| HWY_INLINE Vec128<int32_t, N> ConvertTo(Simd<int32_t, N> /* tag */, |
| const Vec128<float, N> v) { |
| return Vec128<int32_t, N>(vcvt_s32_f32(v.raw)); |
| } |
| |
| #if defined(__aarch64__) |
| |
| HWY_INLINE Vec128<double> ConvertTo(Full128<double> /* tag */, |
| const Vec128<int64_t> v) { |
| return Vec128<double>(vcvtq_f64_s64(v.raw)); |
| } |
| HWY_INLINE Vec128<double, 1> ConvertTo(Simd<double, 1> /* tag */, |
| const Vec128<int64_t, 1> v) { |
| return Vec128<double, 1>(vcvt_f64_s64(v.raw)); |
| } |
| |
| // Truncates (rounds toward zero). |
| HWY_INLINE Vec128<int64_t> ConvertTo(Full128<int64_t> /* tag */, |
| const Vec128<double> v) { |
| return Vec128<int64_t>(vcvtq_s64_f64(v.raw)); |
| } |
| HWY_INLINE Vec128<int64_t, 1> ConvertTo(Simd<int64_t, 1> /* tag */, |
| const Vec128<double, 1> v) { |
| return Vec128<int64_t, 1>(vcvt_s64_f64(v.raw)); |
| } |
| |
| HWY_INLINE Vec128<int32_t> NearestInt(const Vec128<float> v) { |
| return Vec128<int32_t>(vcvtnq_s32_f32(v.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(float, N)> |
| HWY_INLINE Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) { |
| return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw)); |
| } |
| |
| #else |
| |
| template <size_t N> |
| HWY_INLINE Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) { |
| return ConvertTo(Simd<int32_t, N>(), Round(v)); |
| } |
| #endif |
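
// Usage sketch (illustrative): ConvertTo truncates toward zero, whereas
// NearestInt rounds to the nearest integer:
//   const auto v = Set(Full128<float>(), 2.7f);
//   const auto t = ConvertTo(Full128<int32_t>(), v);  // lanes = 2
//   const auto n = NearestInt(v);                     // lanes = 3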
| |
| // ================================================== SWIZZLE |
| |
| // ------------------------------ Extract lane |
| |
| HWY_INLINE uint8_t GetLane(const Vec128<uint8_t, 16> v) { |
| return vget_lane_u8(vget_low_u8(v.raw), 0); |
| } |
| template <size_t N> |
| HWY_INLINE uint8_t GetLane(const Vec128<uint8_t, N> v) { |
| return vget_lane_u8(v.raw, 0); |
| } |
| |
| HWY_INLINE int8_t GetLane(const Vec128<int8_t, 16> v) { |
| return vget_lane_s8(vget_low_s8(v.raw), 0); |
| } |
| template <size_t N> |
| HWY_INLINE int8_t GetLane(const Vec128<int8_t, N> v) { |
| return vget_lane_s8(v.raw, 0); |
| } |
| |
| HWY_INLINE uint16_t GetLane(const Vec128<uint16_t, 8> v) { |
| return vget_lane_u16(vget_low_u16(v.raw), 0); |
| } |
| template <size_t N> |
| HWY_INLINE uint16_t GetLane(const Vec128<uint16_t, N> v) { |
| return vget_lane_u16(v.raw, 0); |
| } |
| |
| HWY_INLINE int16_t GetLane(const Vec128<int16_t, 8> v) { |
| return vget_lane_s16(vget_low_s16(v.raw), 0); |
| } |
| template <size_t N> |
| HWY_INLINE int16_t GetLane(const Vec128<int16_t, N> v) { |
| return vget_lane_s16(v.raw, 0); |
| } |
| |
| HWY_INLINE uint32_t GetLane(const Vec128<uint32_t, 4> v) { |
| return vget_lane_u32(vget_low_u32(v.raw), 0); |
| } |
| template <size_t N> |
| HWY_INLINE uint32_t GetLane(const Vec128<uint32_t, N> v) { |
| return vget_lane_u32(v.raw, 0); |
| } |
| |
| HWY_INLINE int32_t GetLane(const Vec128<int32_t, 4> v) { |
| return vget_lane_s32(vget_low_s32(v.raw), 0); |
| } |
| template <size_t N> |
| HWY_INLINE int32_t GetLane(const Vec128<int32_t, N> v) { |
| return vget_lane_s32(v.raw, 0); |
| } |
| |
| HWY_INLINE uint64_t GetLane(const Vec128<uint64_t, 2> v) { |
| return vget_lane_u64(vget_low_u64(v.raw), 0); |
| } |
| HWY_INLINE uint64_t GetLane(const Vec128<uint64_t, 1> v) { |
| return vget_lane_u64(v.raw, 0); |
| } |
| HWY_INLINE int64_t GetLane(const Vec128<int64_t, 2> v) { |
| return vget_lane_s64(vget_low_s64(v.raw), 0); |
| } |
| HWY_INLINE int64_t GetLane(const Vec128<int64_t, 1> v) { |
| return vget_lane_s64(v.raw, 0); |
| } |
| |
| HWY_INLINE float GetLane(const Vec128<float, 4> v) { |
| return vget_lane_f32(vget_low_f32(v.raw), 0); |
| } |
| HWY_INLINE float GetLane(const Vec128<float, 2> v) { |
| return vget_lane_f32(v.raw, 0); |
| } |
| HWY_INLINE float GetLane(const Vec128<float, 1> v) { |
| return vget_lane_f32(v.raw, 0); |
| } |
| #if defined(__aarch64__) |
| HWY_INLINE double GetLane(const Vec128<double, 2> v) { |
| return vget_lane_f64(vget_low_f64(v.raw), 0); |
| } |
| HWY_INLINE double GetLane(const Vec128<double, 1> v) { |
| return vget_lane_f64(v.raw, 0); |
| } |
| #endif |
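
// Usage sketch (illustrative): GetLane extracts lane 0 (the least-significant
// lane):
//   const auto v = Iota(Full128<int32_t>(), 5);  // lanes 5,6,7,8
//   const int32_t first = GetLane(v);            // 5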
| |
| // ------------------------------ Extract half |
| |
| // <= 64 bit: just return different type |
| template <typename T, size_t N, HWY_IF_LE64(uint8_t, N)> |
| HWY_INLINE Vec128<T, N / 2> LowerHalf(const Vec128<T, N> v) { |
| return Vec128<T, N / 2>(v.raw); |
| } |
| |
| HWY_INLINE Vec128<uint8_t, 8> LowerHalf(const Vec128<uint8_t> v) { |
| return Vec128<uint8_t, 8>(vget_low_u8(v.raw)); |
| } |
| HWY_INLINE Vec128<uint16_t, 4> LowerHalf(const Vec128<uint16_t> v) { |
| return Vec128<uint16_t, 4>(vget_low_u16(v.raw)); |
| } |
| HWY_INLINE Vec128<uint32_t, 2> LowerHalf(const Vec128<uint32_t> v) { |
| return Vec128<uint32_t, 2>(vget_low_u32(v.raw)); |
| } |
| HWY_INLINE Vec128<uint64_t, 1> LowerHalf(const Vec128<uint64_t> v) { |
| return Vec128<uint64_t, 1>(vget_low_u64(v.raw)); |
| } |
| HWY_INLINE Vec128<int8_t, 8> LowerHalf(const Vec128<int8_t> v) { |
| return Vec128<int8_t, 8>(vget_low_s8(v.raw)); |
| } |
| HWY_INLINE Vec128<int16_t, 4> LowerHalf(const Vec128<int16_t> v) { |
| return Vec128<int16_t, 4>(vget_low_s16(v.raw)); |
| } |
| HWY_INLINE Vec128<int32_t, 2> LowerHalf(const Vec128<int32_t> v) { |
| return Vec128<int32_t, 2>(vget_low_s32(v.raw)); |
| } |
| HWY_INLINE Vec128<int64_t, 1> LowerHalf(const Vec128<int64_t> v) { |
| return Vec128<int64_t, 1>(vget_low_s64(v.raw)); |
| } |
| HWY_INLINE Vec128<float, 2> LowerHalf(const Vec128<float> v) { |
| return Vec128<float, 2>(vget_low_f32(v.raw)); |
| } |
| #if defined(__aarch64__) |
| HWY_INLINE Vec128<double, 1> LowerHalf(const Vec128<double> v) { |
| return Vec128<double, 1>(vget_low_f64(v.raw)); |
| } |
| #endif |
| |
| HWY_INLINE Vec128<uint8_t, 8> UpperHalf(const Vec128<uint8_t> v) { |
| return Vec128<uint8_t, 8>(vget_high_u8(v.raw)); |
| } |
| HWY_INLINE Vec128<uint16_t, 4> UpperHalf(const Vec128<uint16_t> v) { |
| return Vec128<uint16_t, 4>(vget_high_u16(v.raw)); |
| } |
| HWY_INLINE Vec128<uint32_t, 2> UpperHalf(const Vec128<uint32_t> v) { |
| return Vec128<uint32_t, 2>(vget_high_u32(v.raw)); |
| } |
| HWY_INLINE Vec128<uint64_t, 1> UpperHalf(const Vec128<uint64_t> v) { |
| return Vec128<uint64_t, 1>(vget_high_u64(v.raw)); |
| } |
| HWY_INLINE Vec128<int8_t, 8> UpperHalf(const Vec128<int8_t> v) { |
| return Vec128<int8_t, 8>(vget_high_s8(v.raw)); |
| } |
| HWY_INLINE Vec128<int16_t, 4> UpperHalf(const Vec128<int16_t> v) { |
| return Vec128<int16_t, 4>(vget_high_s16(v.raw)); |
| } |
| HWY_INLINE Vec128<int32_t, 2> UpperHalf(const Vec128<int32_t> v) { |
| return Vec128<int32_t, 2>(vget_high_s32(v.raw)); |
| } |
| HWY_INLINE Vec128<int64_t, 1> UpperHalf(const Vec128<int64_t> v) { |
| return Vec128<int64_t, 1>(vget_high_s64(v.raw)); |
| } |
| HWY_INLINE Vec128<float, 2> UpperHalf(const Vec128<float> v) { |
| return Vec128<float, 2>(vget_high_f32(v.raw)); |
| } |
| #if defined(__aarch64__) |
| HWY_INLINE Vec128<double, 1> UpperHalf(const Vec128<double> v) { |
| return Vec128<double, 1>(vget_high_f64(v.raw)); |
| } |
| #endif |
| |
| // ------------------------------ Extract from 2x 128-bit at constant offset |
| |
| // Extracts 128 bits from <hi, lo> by skipping the least-significant kBytes. |
| template <int kBytes, typename T> |
| HWY_INLINE Vec128<T> CombineShiftRightBytes(const Vec128<T> hi, |
| const Vec128<T> lo) { |
| static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]"); |
| const Full128<uint8_t> d8; |
| return BitCast(Full128<T>(), |
| Vec128<uint8_t>(vextq_u8(BitCast(d8, lo).raw, |
| BitCast(d8, hi).raw, kBytes))); |
| } |
| |
| // ------------------------------ Shift vector by constant #bytes |
| |
| namespace impl { |
| |
| // Need to partially specialize because CombineShiftRightBytes<16> and <0> are |
| // compile errors. |
| template <int kBytes> |
| struct ShiftLeftBytesT { |
| template <class T, size_t N> |
| HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) { |
| return CombineShiftRightBytes<16 - kBytes>(v, Zero(Full128<T>())); |
| } |
| }; |
| template <> |
| struct ShiftLeftBytesT<0> { |
| template <class T, size_t N> |
| HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) { |
| return v; |
| } |
| }; |
| |
| template <int kBytes> |
| struct ShiftRightBytesT { |
| template <class T, size_t N> |
| HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) { |
| return CombineShiftRightBytes<kBytes>(Zero(Full128<T>()), v); |
| } |
| }; |
| template <> |
| struct ShiftRightBytesT<0> { |
| template <class T, size_t N> |
| HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) { |
| return v; |
| } |
| }; |
| |
| } // namespace impl |
| |
| // 0x01..0F, kBytes = 1 => 0x02..0F00 |
| template <int kBytes, typename T, size_t N> |
| HWY_INLINE Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) { |
| return impl::ShiftLeftBytesT<kBytes>()(v); |
| } |
| |
| template <int kLanes, typename T, size_t N> |
| HWY_INLINE Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) { |
| const Simd<uint8_t, N * sizeof(T)> d8; |
| const Simd<T, N> d; |
| return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v))); |
| } |
| |
| // 0x01..0F, kBytes = 1 => 0x0001..0E |
| template <int kBytes, typename T, size_t N> |
| HWY_INLINE Vec128<T, N> ShiftRightBytes(const Vec128<T, N> v) { |
| return impl::ShiftRightBytesT<kBytes>()(v); |
| } |
| |
| template <int kLanes, typename T, size_t N> |
| HWY_INLINE Vec128<T, N> ShiftRightLanes(const Vec128<T, N> v) { |
| const Simd<uint8_t, N * sizeof(T)> d8; |
| const Simd<T, N> d; |
| return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v))); |
| } |
| |
| // ------------------------------ Broadcast/splat any lane |
| |
| #if defined(__aarch64__) |
| // Unsigned |
| template <int kLane> |
| HWY_INLINE Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) { |
| static_assert(0 <= kLane && kLane < 8, "Invalid lane"); |
| return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane)); |
| } |
| template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)> |
| HWY_INLINE Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane)); |
| } |
| template <int kLane> |
| HWY_INLINE Vec128<uint32_t> Broadcast(const Vec128<uint32_t> v) { |
| static_assert(0 <= kLane && kLane < 4, "Invalid lane"); |
| return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane)); |
| } |
| template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)> |
| HWY_INLINE Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane)); |
| } |
| template <int kLane> |
| HWY_INLINE Vec128<uint64_t> Broadcast(const Vec128<uint64_t> v) { |
| static_assert(0 <= kLane && kLane < 2, "Invalid lane"); |
| return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane)); |
| } |
| // Vec128<uint64_t, 1> is defined below. |
| |
| // Signed |
| template <int kLane> |
| HWY_INLINE Vec128<int16_t> Broadcast(const Vec128<int16_t> v) { |
| static_assert(0 <= kLane && kLane < 8, "Invalid lane"); |
| return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane)); |
| } |
| template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)> |
| HWY_INLINE Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane)); |
| } |
| template <int kLane> |
| HWY_INLINE Vec128<int32_t> Broadcast(const Vec128<int32_t> v) { |
| static_assert(0 <= kLane && kLane < 4, "Invalid lane"); |
| return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane)); |
| } |
| template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane)); |
| } |
| template <int kLane> |
| HWY_INLINE Vec128<int64_t> Broadcast(const Vec128<int64_t> v) { |
| static_assert(0 <= kLane && kLane < 2, "Invalid lane"); |
| return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane)); |
| } |
| // Vec128<int64_t, 1> is defined below. |
| |
| // Float |
| template <int kLane> |
| HWY_INLINE Vec128<float> Broadcast(const Vec128<float> v) { |
| static_assert(0 <= kLane && kLane < 4, "Invalid lane"); |
| return Vec128<float>(vdupq_laneq_f32(v.raw, kLane)); |
| } |
| template <int kLane, size_t N, HWY_IF_LE64(float, N)> |
| HWY_INLINE Vec128<float, N> Broadcast(const Vec128<float, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<float, N>(vdup_lane_f32(v.raw, kLane)); |
| } |
| template <int kLane> |
| HWY_INLINE Vec128<double> Broadcast(const Vec128<double> v) { |
| static_assert(0 <= kLane && kLane < 2, "Invalid lane"); |
| return Vec128<double>(vdupq_laneq_f64(v.raw, kLane)); |
| } |
| template <int kLane> |
| HWY_INLINE Vec128<double, 1> Broadcast(const Vec128<double, 1> v) { |
| static_assert(0 <= kLane && kLane < 1, "Invalid lane"); |
| return v; |
| } |
| |
| #else |
| // No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*. |
| |
| // Unsigned |
| template <int kLane> |
| HWY_INLINE Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) { |
| static_assert(0 <= kLane && kLane < 8, "Invalid lane"); |
| return Vec128<uint16_t>(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane))); |
| } |
| template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)> |
| HWY_INLINE Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane)); |
| } |
| template <int kLane> |
| HWY_INLINE Vec128<uint32_t> Broadcast(const Vec128<uint32_t> v) { |
| static_assert(0 <= kLane && kLane < 4, "Invalid lane"); |
| return Vec128<uint32_t>(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane))); |
| } |
| template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)> |
| HWY_INLINE Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane)); |
| } |
| template <int kLane> |
| HWY_INLINE Vec128<uint64_t> Broadcast(const Vec128<uint64_t> v) { |
| static_assert(0 <= kLane && kLane < 2, "Invalid lane"); |
| return Vec128<uint64_t>(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane))); |
| } |
| // Vec128<uint64_t, 1> is defined below. |
| |
| // Signed |
| template <int kLane> |
| HWY_INLINE Vec128<int16_t> Broadcast(const Vec128<int16_t> v) { |
| static_assert(0 <= kLane && kLane < 8, "Invalid lane"); |
| return Vec128<int16_t>(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane))); |
| } |
| template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)> |
| HWY_INLINE Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane)); |
| } |
| template <int kLane> |
| HWY_INLINE Vec128<int32_t> Broadcast(const Vec128<int32_t> v) { |
| static_assert(0 <= kLane && kLane < 4, "Invalid lane"); |
| return Vec128<int32_t>(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane))); |
| } |
| template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane)); |
| } |
| template <int kLane> |
| HWY_INLINE Vec128<int64_t> Broadcast(const Vec128<int64_t> v) { |
| static_assert(0 <= kLane && kLane < 2, "Invalid lane"); |
| return Vec128<int64_t>(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane))); |
| } |
| // Vec128<int64_t, 1> is defined below. |
| |
| // Float |
| template <int kLane> |
| HWY_INLINE Vec128<float> Broadcast(const Vec128<float> v) { |
| static_assert(0 <= kLane && kLane < 4, "Invalid lane"); |
| return Vec128<float>(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane))); |
| } |
| template <int kLane, size_t N, HWY_IF_LE64(float, N)> |
| HWY_INLINE Vec128<float, N> Broadcast(const Vec128<float, N> v) { |
| static_assert(0 <= kLane && kLane < N, "Invalid lane"); |
| return Vec128<float, N>(vdup_lane_f32(v.raw, kLane)); |
| } |
| |
| #endif |
| |
| template <int kLane> |
| HWY_INLINE Vec128<uint64_t, 1> Broadcast(const Vec128<uint64_t, 1> v) { |
| static_assert(0 <= kLane && kLane < 1, "Invalid lane"); |
| return v; |
| } |
| template <int kLane> |
| HWY_INLINE Vec128<int64_t, 1> Broadcast(const Vec128<int64_t, 1> v) { |
| static_assert(0 <= kLane && kLane < 1, "Invalid lane"); |
| return v; |
| } |
| |
| // ------------------------------ Shuffle bytes with variable indices |
| |
| // Returns vector of bytes[from[i]]. "from" is also interpreted as bytes: |
| // either valid indices in [0, 16) or >= 0x80 to zero the i-th output byte. |
| template <typename T, typename TI> |
| HWY_INLINE Vec128<T> TableLookupBytes(const Vec128<T> bytes, |
| const Vec128<TI> from) { |
| const Full128<uint8_t> d8; |
| #if defined(__aarch64__) |
| return BitCast(Full128<T>(), |
| Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw, |
| BitCast(d8, from).raw))); |
| #else |
| uint8x16_t table0 = BitCast(d8, bytes).raw; |
| uint8x8x2_t table; |
| table.val[0] = vget_low_u8(table0); |
| table.val[1] = vget_high_u8(table0); |
| uint8x16_t idx = BitCast(d8, from).raw; |
| uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx)); |
| uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx)); |
| return BitCast(Full128<T>(), Vec128<uint8_t>(vcombine_u8(low, hi))); |
| #endif |
| } |
| |
| // ------------------------------ Hard-coded shuffles |
| |
| // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant). |
| // Shuffle0321 rotates one lane to the right (the previous least-significant |
| // lane is now most-significant). These could also be implemented via |
| // CombineShiftRightBytes but the shuffle_abcd notation is more convenient. |
| |
| // Swap 32-bit halves in 64-bits |
| HWY_INLINE Vec128<uint32_t, 2> Shuffle2301(const Vec128<uint32_t, 2> v) { |
| return Vec128<uint32_t, 2>(vrev64_u32(v.raw)); |
| } |
| HWY_INLINE Vec128<int32_t, 2> Shuffle2301(const Vec128<int32_t, 2> v) { |
| return Vec128<int32_t, 2>(vrev64_s32(v.raw)); |
| } |
| HWY_INLINE Vec128<float, 2> Shuffle2301(const Vec128<float, 2> v) { |
| return Vec128<float, 2>(vrev64_f32(v.raw)); |
| } |
| HWY_INLINE Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) { |
| return Vec128<uint32_t>(vrev64q_u32(v.raw)); |
| } |
| HWY_INLINE Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) { |
| return Vec128<int32_t>(vrev64q_s32(v.raw)); |
| } |
| HWY_INLINE Vec128<float> Shuffle2301(const Vec128<float> v) { |
| return Vec128<float>(vrev64q_f32(v.raw)); |
| } |
| |
| // Swap 64-bit halves |
| template <typename T> |
| HWY_INLINE Vec128<T> Shuffle1032(const Vec128<T> v) { |
| return CombineShiftRightBytes<8>(v, v); |
| } |
| template <typename T> |
| HWY_INLINE Vec128<T> Shuffle01(const Vec128<T> v) { |
| return CombineShiftRightBytes<8>(v, v); |
| } |
| |
| // Rotate right 32 bits |
| template <typename T> |
| HWY_INLINE Vec128<T> Shuffle0321(const Vec128<T> v) { |
| return CombineShiftRightBytes<4>(v, v); |
| } |
| |
| // Rotate left 32 bits |
| template <typename T> |
| HWY_INLINE Vec128<T> Shuffle2103(const Vec128<T> v) { |
| return CombineShiftRightBytes<12>(v, v); |
| } |
| |
| // Reverse |
| template <typename T> |
| HWY_INLINE Vec128<T> Shuffle0123(const Vec128<T> v) { |
| static_assert(sizeof(T) == 4, |
| "Shuffle0123 should only be applied to 32-bit types"); |
  // TODO(janwas): more efficient implementation? It is possible to use two
  // instructions (vrev64q_u32 and vcombine_u32 of the high/low parts) instead
  // of the extra memory and load.
| static constexpr uint8_t bytes[16] = {12, 13, 14, 15, 8, 9, 10, 11, |
| 4, 5, 6, 7, 0, 1, 2, 3}; |
| return TableLookupBytes(v, Load(Full128<uint8_t>(), bytes)); |
| } |
| |
| // ------------------------------ Permute (runtime variable) |
| |
| // Returned by SetTableIndices for use by TableLookupLanes. |
| template <typename T> |
| struct Permute128 { |
| uint8x16_t raw; |
| }; |
| |
| template <typename T> |
| HWY_INLINE Permute128<T> SetTableIndices(const Full128<T>, const int32_t* idx) { |
| #if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) |
| const size_t N = 16 / sizeof(T); |
| for (size_t i = 0; i < N; ++i) { |
| HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N)); |
| } |
| #endif |
| |
| const Full128<uint8_t> d8; |
| alignas(16) uint8_t control[16]; |
| for (size_t idx_byte = 0; idx_byte < 16; ++idx_byte) { |
| const size_t idx_lane = idx_byte / sizeof(T); |
| const size_t mod = idx_byte % sizeof(T); |
| control[idx_byte] = idx[idx_lane] * sizeof(T) + mod; |
| } |
| return Permute128<T>{Load(d8, control).raw}; |
| } |
| |
| HWY_INLINE Vec128<uint32_t> TableLookupLanes(const Vec128<uint32_t> v, |
| const Permute128<uint32_t> idx) { |
| return TableLookupBytes(v, Vec128<uint8_t>(idx.raw)); |
| } |
| HWY_INLINE Vec128<int32_t> TableLookupLanes(const Vec128<int32_t> v, |
| const Permute128<int32_t> idx) { |
| return TableLookupBytes(v, Vec128<uint8_t>(idx.raw)); |
| } |
| HWY_INLINE Vec128<float> TableLookupLanes(const Vec128<float> v, |
| const Permute128<float> idx) { |
| const Full128<int32_t> di; |
| const Full128<float> df; |
| return BitCast(df, |
| TableLookupBytes(BitCast(di, v), Vec128<uint8_t>(idx.raw))); |
| } |
| |
| // ------------------------------ Interleave lanes |
| |
| // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides |
| // the least-significant lane) and "b". To concatenate two half-width integers |
| // into one, use ZipLower/Upper instead (also works with scalar). |
| HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveLower, vzip1, _, 2) |
| HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveLower, vzip1, _, 2) |
| |
| HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2) |
| HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2) |
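
// Example (illustrative): for 32-bit lanes a = {a0,a1,a2,a3} and
// b = {b0,b1,b2,b3} (lane 0 listed first), InterleaveLower(a, b) is
// {a0,b0,a1,b1} and InterleaveUpper(a, b) is {a2,b2,a3,b3}.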
| |
| #if defined(__aarch64__) |
// For 64-bit types, we only define the "q" version of the function, since
// interleaving 64-bit registers that each hold a single 64-bit lane makes no
// sense.
| HWY_INLINE Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a, |
| const Vec128<uint64_t> b) { |
| return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw)); |
| } |
| HWY_INLINE Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a, |
| const Vec128<int64_t> b) { |
| return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw)); |
| } |
| |
| HWY_INLINE Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a, |
| const Vec128<uint64_t> b) { |
| return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw)); |
| } |
| HWY_INLINE Vec128<int64_t> InterleaveUpper(const Vec128<int64_t> a, |
| const Vec128<int64_t> b) { |
| return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw)); |
| } |
| #else |
| // ARMv7 emulation. |
| HWY_INLINE Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a, |
| const Vec128<uint64_t> b) { |
| auto flip = CombineShiftRightBytes<8>(a, a); |
| return CombineShiftRightBytes<8>(b, flip); |
| } |
| HWY_INLINE Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a, |
| const Vec128<int64_t> b) { |
| auto flip = CombineShiftRightBytes<8>(a, a); |
| return CombineShiftRightBytes<8>(b, flip); |
| } |
| |
| HWY_INLINE Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a, |
| const Vec128<uint64_t> b) { |
| auto flip = CombineShiftRightBytes<8>(b, b); |
| return CombineShiftRightBytes<8>(flip, a); |
| } |
| HWY_INLINE Vec128<int64_t> InterleaveUpper(const Vec128<int64_t> a, |
| const Vec128<int64_t> b) { |
| auto flip = CombineShiftRightBytes<8>(b, b); |
| return CombineShiftRightBytes<8>(flip, a); |
| } |
| #endif |
| |
| // Floats |
| HWY_INLINE Vec128<float> InterleaveLower(const Vec128<float> a, |
| const Vec128<float> b) { |
| return Vec128<float>(vzip1q_f32(a.raw, b.raw)); |
| } |
| #if defined(__aarch64__) |
| HWY_INLINE Vec128<double> InterleaveLower(const Vec128<double> a, |
| const Vec128<double> b) { |
| return Vec128<double>(vzip1q_f64(a.raw, b.raw)); |
| } |
| #endif |
| |
| HWY_INLINE Vec128<float> InterleaveUpper(const Vec128<float> a, |
| const Vec128<float> b) { |
| return Vec128<float>(vzip2q_f32(a.raw, b.raw)); |
| } |
| #if defined(__aarch64__) |
| HWY_INLINE Vec128<double> InterleaveUpper(const Vec128<double> a, |
| const Vec128<double> b) { |
  return Vec128<double>(vzip2q_f64(a.raw, b.raw));
| } |
| #endif |
| |
| // ------------------------------ Zip lanes |
| |
// Same as InterleaveLower/Upper, except that the return lanes are
// double-width integers; this is necessary because the single-lane scalar
// cannot return two values.
| |
| // Full vectors |
| HWY_INLINE Vec128<uint16_t> ZipLower(const Vec128<uint8_t> a, |
| const Vec128<uint8_t> b) { |
| return Vec128<uint16_t>(vzip1q_u8(a.raw, b.raw)); |
| } |
| HWY_INLINE Vec128<uint32_t> ZipLower(const Vec128<uint16_t> a, |
| const Vec128<uint16_t> b) { |
| return Vec128<uint32_t>(vzip1q_u16(a.raw, b.raw)); |
| } |
| HWY_INLINE Vec128<uint64_t> ZipLower(const Vec128<uint32_t> a, |
| const Vec128<uint32_t> b) { |
| return Vec128<uint64_t>(vzip1q_u32(a.raw, b.raw)); |
| } |
| |
| HWY_INLINE Vec128<int16_t> ZipLower(const Vec128<int8_t> a, |
| const Vec128<int8_t> b) { |
| return Vec128<int16_t>(vzip1q_s8(a.raw, b.raw)); |
| } |
| HWY_INLINE Vec128<int32_t> ZipLower(const Vec128<int16_t> a, |
| const Vec128<int16_t> b) { |
| return Vec128<int32_t>(vzip1q_s16(a.raw, b.raw)); |
| } |
| HWY_INLINE Vec128<int64_t> ZipLower(const Vec128<int32_t> a, |
| const Vec128<int32_t> b) { |
| return Vec128<int64_t>(vzip1q_s32(a.raw, b.raw)); |
| } |
| |
| HWY_INLINE Vec128<uint16_t> ZipUpper(const Vec128<uint8_t> a, |
| const Vec128<uint8_t> b) { |
| return Vec128<uint16_t>(vzip2q_u8(a.raw, b.raw)); |
| } |
| HWY_INLINE Vec128<uint32_t> ZipUpper(const Vec128<uint16_t> a, |
| const Vec128<uint16_t> b) { |
| return Vec128<uint32_t>(vzip2q_u16(a.raw, b.raw)); |
| } |
| HWY_INLINE Vec128<uint64_t> ZipUpper(const Vec128<uint32_t> a, |
| const Vec128<uint32_t> b) { |
| return Vec128<uint64_t>(vzip2q_u32(a.raw, b.raw)); |
| } |
| |
| HWY_INLINE Vec128<int16_t> ZipUpper(const Vec128<int8_t> a, |
| const Vec128<int8_t> b) { |
| return Vec128<int16_t>(vzip2q_s8(a.raw, b.raw)); |
| } |
| HWY_INLINE Vec128<int32_t> ZipUpper(const Vec128<int16_t> a, |
| const Vec128<int16_t> b) { |
| return Vec128<int32_t>(vzip2q_s16(a.raw, b.raw)); |
| } |
| HWY_INLINE Vec128<int64_t> ZipUpper(const Vec128<int32_t> a, |
| const Vec128<int32_t> b) { |
| return Vec128<int64_t>(vzip2q_s32(a.raw, b.raw)); |
| } |
| |
| // Half vectors or less |
| template <size_t N, HWY_IF_LE64(uint8_t, N)> |
| HWY_INLINE Vec128<uint16_t, (N + 1) / 2> ZipLower(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Vec128<uint16_t, (N + 1) / 2>(vzip1_u8(a.raw, b.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(uint16_t, N)> |
| HWY_INLINE Vec128<uint32_t, (N + 1) / 2> ZipLower(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Vec128<uint32_t, (N + 1) / 2>(vzip1_u16(a.raw, b.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(uint32_t, N)> |
| HWY_INLINE Vec128<uint64_t, (N + 1) / 2> ZipLower(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| return Vec128<uint64_t, (N + 1) / 2>(vzip1_u32(a.raw, b.raw)); |
| } |
| |
| template <size_t N, HWY_IF_LE64(int8_t, N)> |
| HWY_INLINE Vec128<int16_t, (N + 1) / 2> ZipLower(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| return Vec128<int16_t, (N + 1) / 2>(vzip1_s8(a.raw, b.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(int16_t, N)> |
| HWY_INLINE Vec128<int32_t, (N + 1) / 2> ZipLower(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int32_t, (N + 1) / 2>(vzip1_s16(a.raw, b.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<int64_t, (N + 1) / 2> ZipLower(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| return Vec128<int64_t, (N + 1) / 2>(vzip1_s32(a.raw, b.raw)); |
| } |
| |
| template <size_t N, HWY_IF_LE64(uint8_t, N)> |
| HWY_INLINE Vec128<uint16_t, N / 2> ZipUpper(const Vec128<uint8_t, N> a, |
| const Vec128<uint8_t, N> b) { |
| return Vec128<uint16_t, N / 2>(vzip2_u8(a.raw, b.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(uint16_t, N)> |
| HWY_INLINE Vec128<uint32_t, N / 2> ZipUpper(const Vec128<uint16_t, N> a, |
| const Vec128<uint16_t, N> b) { |
| return Vec128<uint32_t, N / 2>(vzip2_u16(a.raw, b.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(uint32_t, N)> |
| HWY_INLINE Vec128<uint64_t, N / 2> ZipUpper(const Vec128<uint32_t, N> a, |
| const Vec128<uint32_t, N> b) { |
| return Vec128<uint64_t, N / 2>(vzip2_u32(a.raw, b.raw)); |
| } |
| |
| template <size_t N, HWY_IF_LE64(int8_t, N)> |
| HWY_INLINE Vec128<int16_t, N / 2> ZipUpper(const Vec128<int8_t, N> a, |
| const Vec128<int8_t, N> b) { |
| return Vec128<int16_t, N / 2>(vzip2_s8(a.raw, b.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(int16_t, N)> |
| HWY_INLINE Vec128<int32_t, N / 2> ZipUpper(const Vec128<int16_t, N> a, |
| const Vec128<int16_t, N> b) { |
| return Vec128<int32_t, N / 2>(vzip2_s16(a.raw, b.raw)); |
| } |
| template <size_t N, HWY_IF_LE64(int32_t, N)> |
| HWY_INLINE Vec128<int64_t, N / 2> ZipUpper(const Vec128<int32_t, N> a, |
| const Vec128<int32_t, N> b) { |
| return Vec128<int64_t, N / 2>(vzip2_s32(a.raw, b.raw)); |
| } |
| |
| // ------------------------------ Blocks |
| |
| // hiH,hiL loH,loL |-> hiL,loL (= lower halves) |
| template <typename T> |
| HWY_INLINE Vec128<T> ConcatLowerLower(const Vec128<T> hi, const Vec128<T> lo) { |
| const Full128<uint64_t> d64; |
| return BitCast(Full128<T>(), |
| InterleaveLower(BitCast(d64, lo), BitCast(d64, hi))); |
| } |
| |
| // hiH,hiL loH,loL |-> hiH,loH (= upper halves) |
| template <typename T> |
| HWY_INLINE Vec128<T> ConcatUpperUpper(const Vec128<T> hi, const Vec128<T> lo) { |
| const Full128<uint64_t> d64; |
| return BitCast(Full128<T>(), |
| InterleaveUpper(BitCast(d64, lo), BitCast(d64, hi))); |
| } |
| |
| // hiH,hiL loH,loL |-> hiL,loH (= inner halves) |
| template <typename T> |
| HWY_INLINE Vec128<T> ConcatLowerUpper(const Vec128<T> hi, const Vec128<T> lo) { |
| return CombineShiftRightBytes<8>(hi, lo); |
| } |
| |
| // hiH,hiL loH,loL |-> hiH,loL (= outer halves) |
| template <typename T> |
| HWY_INLINE Vec128<T> ConcatUpperLower(const Vec128<T> hi, const Vec128<T> lo) { |
| // TODO(janwas): more efficient implementation? |
| alignas(16) const uint8_t kBytes[16] = { |
| 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0}; |
| const auto vec = BitCast(Full128<T>(), Load(Full128<uint8_t>(), kBytes)); |
| return IfThenElse(MaskFromVec(vec), lo, hi); |
| } |
| |
| // ------------------------------ Odd/even lanes |
| |
| template <typename T> |
| HWY_INLINE Vec128<T> OddEven(const Vec128<T> a, const Vec128<T> b) { |
| alignas(16) constexpr uint8_t kBytes[16] = { |
| ((0 / sizeof(T)) & 1) ? 0 : 0xFF, ((1 / sizeof(T)) & 1) ? 0 : 0xFF, |
| ((2 / sizeof(T)) & 1) ? 0 : 0xFF, ((3 / sizeof(T)) & 1) ? 0 : 0xFF, |
| ((4 / sizeof(T)) & 1) ? 0 : 0xFF, ((5 / sizeof(T)) & 1) ? 0 : 0xFF, |
| ((6 / sizeof(T)) & 1) ? 0 : 0xFF, ((7 / sizeof(T)) & 1) ? 0 : 0xFF, |
| ((8 / sizeof(T)) & 1) ? 0 : 0xFF, ((9 / sizeof(T)) & 1) ? 0 : 0xFF, |
| ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF, |
| ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF, |
| ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF, |
| }; |
| const auto vec = BitCast(Full128<T>(), Load(Full128<uint8_t>(), kBytes)); |
| return IfThenElse(MaskFromVec(vec), b, a); |
| } |
| |
| // ================================================== MISC |
| |
| // Returns a vector with lane i=[0, N) set to "first" + i. |
| template <typename T, size_t N, typename T2> |
| Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) { |
| HWY_ALIGN T lanes[16 / sizeof(T)]; |
| for (size_t i = 0; i < 16 / sizeof(T); ++i) { |
| lanes[i] = static_cast<T>(first + static_cast<T2>(i)); |
| } |
| return Load(d, lanes); |
| } |
| |
| // ------------------------------ Gather (requires GetLane) |
| |
| template <typename T, size_t N, typename Offset> |
| HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d, |
| const T* HWY_RESTRICT base, |
| const Vec128<Offset, N> offset) { |
| static_assert(N == 1, "NEON does not support full gather"); |
| static_assert(sizeof(T) == sizeof(Offset), "T must match Offset"); |
| const uintptr_t address = reinterpret_cast<uintptr_t>(base) + GetLane(offset); |
| T val; |
| CopyBytes<sizeof(T)>(reinterpret_cast<const T*>(address), &val); |
| return Set(d, val); |
| } |
| |
| template <typename T, size_t N, typename Index> |
| HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base, |
| const Vec128<Index, N> index) { |
| static_assert(N == 1, "NEON does not support full gather"); |
| static_assert(sizeof(T) == sizeof(Index), "T must match Index"); |
| return Set(d, base[GetLane(index)]); |
| } |
| |
| // ------------------------------ ARMv7 int64 equality (requires Shuffle2301) |
| |
| #if !defined(__aarch64__) |
| |
| template <size_t N> |
| HWY_INLINE Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a, |
| const Vec128<int64_t, N> b) { |
| const Simd<int32_t, N * 2> d32; |
| const Simd<int64_t, N> d64; |
| const auto cmp32 = VecFromMask(BitCast(d32, a) == BitCast(d32, b)); |
| const auto cmp64 = cmp32 & Shuffle2301(cmp32); |
| return MaskFromVec(BitCast(d64, cmp64)); |
| } |
| |
| template <size_t N> |
| HWY_INLINE Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a, |
| const Vec128<uint64_t, N> b) { |
| const Simd<uint32_t, N * 2> d32; |
| const Simd<uint64_t, N> d64; |
| const auto cmp32 = VecFromMask(BitCast(d32, a) == BitCast(d32, b)); |
| const auto cmp64 = cmp32 & Shuffle2301(cmp32); |
| return MaskFromVec(BitCast(d64, cmp64)); |
| } |
| |
| #endif |
| |
| // ------------------------------ Horizontal sum (reduction) |
| |
| // Returns 64-bit sums of 8-byte groups. |
| HWY_INLINE Vec128<uint64_t> SumsOfU8x8(const Vec128<uint8_t> v) { |
| uint16x8_t a = vpaddlq_u8(v.raw); |
| uint32x4_t b = vpaddlq_u16(a); |
| return Vec128<uint64_t>(vpaddlq_u32(b)); |
| } |
| |
| HWY_INLINE Vec128<uint64_t, 1> SumsOfU8x8(const Vec128<uint8_t, 8> v) { |
| uint16x4_t a = vpaddl_u8(v.raw); |
| uint32x2_t b = vpaddl_u16(a); |
| return Vec128<uint64_t, 1>(vpaddl_u32(b)); |
| } |
| |
| #if defined(__aarch64__) |
| // Supported for 32b and 64b vector types. Returns the sum in each lane. |
| HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) { |
| return Vec128<uint32_t>(vdupq_n_u32(vaddvq_u32(v.raw))); |
| } |
| HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) { |
| return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(v.raw))); |
| } |
| HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) { |
| return Vec128<float>(vdupq_n_f32(vaddvq_f32(v.raw))); |
| } |
| HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) { |
| return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(v.raw))); |
| } |
| HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) { |
| return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(v.raw))); |
| } |
| HWY_INLINE Vec128<double> SumOfLanes(const Vec128<double> v) { |
| return Vec128<double>(vdupq_n_f64(vaddvq_f64(v.raw))); |
| } |
| #else |
| // ARMv7 version for everything except doubles. |
| HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) { |
| uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw); |
| uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]); |
| uint32x4x2_t v1 = vuzpq_u32(c0, c0); |
| return Vec128<uint32_t>(vaddq_u32(v1.val[0], v1.val[1])); |
| } |
| HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) { |
| int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw); |
| int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]); |
| int32x4x2_t v1 = vuzpq_s32(c0, c0); |
| return Vec128<int32_t>(vaddq_s32(v1.val[0], v1.val[1])); |
| } |
| HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) { |
| float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw); |
| float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]); |
| float32x4x2_t v1 = vuzpq_f32(c0, c0); |
| return Vec128<float>(vaddq_f32(v1.val[0], v1.val[1])); |
| } |
| HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) { |
| return v + CombineShiftRightBytes<8>(v, v); |
| } |
| HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) { |
| return v + CombineShiftRightBytes<8>(v, v); |
| } |
| #endif |
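
// Usage sketch (illustrative): SumOfLanes broadcasts the total to every lane:
//   const auto v = Iota(Full128<uint32_t>(), 1);  // lanes 1,2,3,4
//   const auto sum = SumOfLanes(v);               // 10 in every lane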
| |
| namespace detail { |
| |
| // For u32/i32/f32. |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<4> /* tag */, |
| const Vec128<T, N> v3210) { |
| const Vec128<T> v1032 = Shuffle1032(v3210); |
| const Vec128<T> v31_20_31_20 = Min(v3210, v1032); |
| const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20); |
| return Min(v20_31_20_31, v31_20_31_20); |
| } |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<4> /* tag */, |
| const Vec128<T, N> v3210) { |
| const Vec128<T> v1032 = Shuffle1032(v3210); |
| const Vec128<T> v31_20_31_20 = Max(v3210, v1032); |
| const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20); |
| return Max(v20_31_20_31, v31_20_31_20); |
| } |
| |
| // For u64/i64 (and f64, where supported).
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<8> /* tag */, |
| const Vec128<T, N> v10) { |
| const Vec128<T> v01 = Shuffle01(v10); |
| return Min(v10, v01); |
| } |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<8> /* tag */, |
| const Vec128<T, N> v10) { |
| const Vec128<T> v01 = Shuffle01(v10); |
| return Max(v10, v01); |
| } |
| |
| } // namespace detail |
| |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> MinOfLanes(const Vec128<T, N> v) { |
| return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v); |
| } |
| template <typename T, size_t N> |
| HWY_API Vec128<T, N> MaxOfLanes(const Vec128<T, N> v) { |
| return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v); |
| } |
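| // As with SumOfLanes, the result is broadcast to all lanes. Sketch
| // (hypothetical u32 input): for v = {3, 1, 4, 1}, MinOfLanes(v) returns
| // {1, 1, 1, 1} and MaxOfLanes(v) returns {4, 4, 4, 4}.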
| |
| // ------------------------------ Mask |
| |
| template <typename T> |
| HWY_INLINE bool AllFalse(const Mask128<T> v) { |
| const auto v64 = BitCast(Full128<uint64_t>(), VecFromMask(v)); |
| uint32x2_t a = vqmovn_u64(v64.raw); |
| return vreinterpret_u64_u32(a)[0] == 0; |
| } |
| |
| template <typename T> |
| HWY_INLINE bool AllTrue(const Mask128<T> v) { |
| return AllFalse(VecFromMask(v) == Zero(Full128<T>())); |
| } |
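| // Usage sketch (hypothetical d and v): branching on whether any lane passed:
| //   const auto m = (v > Zero(d));
| //   if (!AllFalse(m)) { /* at least one lane was positive */ }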
| |
| namespace impl { |
| |
| template <typename T> |
| HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, |
| const Mask128<T> mask) { |
| constexpr uint8x16_t kCollapseMask = { |
| 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, |
| }; |
| const Full128<uint8_t> du; |
| const uint8x16_t values = BitCast(du, VecFromMask(mask)).raw & kCollapseMask; |
| |
| #if defined(__aarch64__) |
| // Can't use vaddv: we need the low and high bytes (16 mask bits)
| // separately rather than a single sum.
| const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values, values)); |
| const uint8x8_t x4 = vpadd_u8(x2, x2); |
| const uint8x8_t x8 = vpadd_u8(x4, x4); |
| return vreinterpret_u16_u8(x8)[0]; |
| #else |
| // Don't have vpaddq, so keep doubling lane size. |
| const uint16x8_t x2 = vpaddlq_u8(values); |
| const uint32x4_t x4 = vpaddlq_u16(x2); |
| const uint64x2_t x8 = vpaddlq_u32(x4); |
| return (uint64_t(x8[1]) << 8) | x8[0]; |
| #endif |
| } |
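| // Sketch of the packing above (hypothetical mask): if only lanes 0 and 9 are
| // set, ANDing with kCollapseMask leaves 1 in lane 0 and 2 in lane 9; the
| // pairwise adds collapse each half into one byte, giving 0x01 (low) and
| // 0x02 (high), i.e. BitsFromMask == 0x0201.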
| |
| template <typename T> |
| HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, |
| const Mask128<T> mask) { |
| constexpr uint16x8_t kCollapseMask = {1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80}; |
| const Full128<uint16_t> du; |
| const uint16x8_t values = BitCast(du, VecFromMask(mask)).raw & kCollapseMask; |
| #if defined(__aarch64__) |
| return vaddvq_u16(values); |
| #else |
| const uint32x4_t x2 = vpaddlq_u16(values); |
| const uint64x2_t x4 = vpaddlq_u32(x2); |
| return x4[0] + x4[1]; |
| #endif |
| } |
| |
| template <typename T> |
| HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, |
| const Mask128<T> mask) { |
| constexpr uint32x4_t kCollapseMask = {1, 2, 4, 8}; |
| const Full128<uint32_t> du; |
| const uint32x4_t values = BitCast(du, VecFromMask(mask)).raw & kCollapseMask; |
| #if defined(__aarch64__) |
| return vaddvq_u32(values); |
| #else |
| const uint64x2_t x2 = vpaddlq_u32(values); |
| return x2[0] + x2[1]; |
| #endif |
| } |
| |
| template <typename T> |
| HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, const Mask128<T> v) { |
| constexpr uint64x2_t kCollapseMask = {1, 2}; |
| const Full128<uint64_t> du; |
| const uint64x2_t values = BitCast(du, VecFromMask(v)).raw & kCollapseMask; |
| #if defined(__aarch64__) |
| return vaddvq_u64(values); |
| #else |
| return values[0] + values[1]; |
| #endif |
| } |
| |
| // Returns the number of lanes whose mask is set.
| //
| // Mask lanes are either all-ones (FF..FF) or all-zeros. There is no
| // reduce-sub op ("vsubv"), and ANDing with 1 would require a constant.
| // Instead, we reinterpret each lane as signed (-1 if set, 0 otherwise) and
| // negate, which yields 1 or 0 per lane; those are then summed.
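| // For example (sketch): a u8 mask with three lanes set has 0xFF (= -1 when
| // signed) in those lanes; negating gives 1 there and 0 elsewhere, and the
| // lane sum is 3.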
| |
| template <typename T> |
| HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> mask) { |
| const Full128<int8_t> di; |
| const int8x16_t ones = vnegq_s8(BitCast(di, VecFromMask(mask)).raw); |
| |
| #if defined(__aarch64__) |
| return vaddvq_s8(ones); |
| #else |
| const int16x8_t x2 = vpaddlq_s8(ones); |
| const int32x4_t x4 = vpaddlq_s16(x2); |
| const int64x2_t x8 = vpaddlq_s32(x4); |
| return x8[0] + x8[1]; |
| #endif |
| } |
| template <typename T> |
| HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> mask) { |
| const Full128<int16_t> di; |
| const int16x8_t ones = vnegq_s16(BitCast(di, VecFromMask(mask)).raw); |
| |
| #if defined(__aarch64__) |
| return vaddvq_s16(ones); |
| #else |
| const int32x4_t x2 = vpaddlq_s16(ones); |
| const int64x2_t x4 = vpaddlq_s32(x2); |
| return x4[0] + x4[1]; |
| #endif |
| } |
| |
| template <typename T> |
| HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> mask) { |
| const Full128<int32_t> di; |
| const int32x4_t ones = vnegq_s32(BitCast(di, VecFromMask(mask)).raw); |
| |
| #if defined(__aarch64__) |
| return vaddvq_s32(ones); |
| #else |
| const int64x2_t x2 = vpaddlq_s32(ones); |
| return x2[0] + x2[1]; |
| #endif |
| } |
| |
| template <typename T> |
| HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> mask) { |
| #if defined(__aarch64__) |
| const Full128<int64_t> di; |
| const int64x2_t ones = vnegq_s64(BitCast(di, VecFromMask(mask)).raw); |
| return vaddvq_s64(ones); |
| #else |
| // vnegq_s64 is unavailable on ARMv7; shifting the sign bit down likewise
| // yields 1 (mask set) or 0 per lane.
| const Full128<uint64_t> du;
| const uint64x2_t ones = vshrq_n_u64(BitCast(du, VecFromMask(mask)).raw, 63);
| return ones[0] + ones[1];
| #endif |
| } |
| |
| } // namespace impl |
| |
| template <typename T> |
| HWY_INLINE uint64_t BitsFromMask(const Mask128<T> mask) { |
| return impl::BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask); |
| } |
| |
| template <typename T> |
| HWY_INLINE size_t CountTrue(const Mask128<T> mask) { |
| return impl::CountTrue(hwy::SizeTag<sizeof(T)>(), mask); |
| } |
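| // Usage sketch (hypothetical a, b): counting lanes that passed a comparison:
| //   const auto m = (a < b);
| //   const size_t num = CountTrue(m);
| //   const uint64_t bits = BitsFromMask(m);  // lane i set => bit i of result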
| |
| #if !defined(__aarch64__) |
| #undef vuzp1_s8 |
| #undef vuzp1_u8 |
| #undef vuzp1_s16 |
| #undef vuzp1_u16 |
| #undef vuzp1_s32 |
| #undef vuzp1_u32 |
| #undef vuzp1_f32 |
| #undef vuzp1q_s8 |
| #undef vuzp1q_u8 |
| #undef vuzp1q_s16 |
| #undef vuzp1q_u16 |
| #undef vuzp1q_s32 |
| #undef vuzp1q_u32 |
| #undef vuzp1q_f32 |
| #undef vuzp2_s8 |
| #undef vuzp2_u8 |
| #undef vuzp2_s16 |
| #undef vuzp2_u16 |
| #undef vuzp2_s32 |
| #undef vuzp2_u32 |
| #undef vuzp2_f32 |
| #undef vuzp2q_s8 |
| #undef vuzp2q_u8 |
| #undef vuzp2q_s16 |
| #undef vuzp2q_u16 |
| #undef vuzp2q_s32 |
| #undef vuzp2q_u32 |
| #undef vuzp2q_f32 |
| #undef vzip1_s8 |
| #undef vzip1_u8 |
| #undef vzip1_s16 |
| #undef vzip1_u16 |
| #undef vzip1_s32 |
| #undef vzip1_u32 |
| #undef vzip1_f32 |
| #undef vzip1q_s8 |
| #undef vzip1q_u8 |
| #undef vzip1q_s16 |
| #undef vzip1q_u16 |
| #undef vzip1q_s32 |
| #undef vzip1q_u32 |
| #undef vzip1q_f32 |
| #undef vzip2_s8 |
| #undef vzip2_u8 |
| #undef vzip2_s16 |
| #undef vzip2_u16 |
| #undef vzip2_s32 |
| #undef vzip2_u32 |
| #undef vzip2_f32 |
| #undef vzip2q_s8 |
| #undef vzip2q_u8 |
| #undef vzip2q_s16 |
| #undef vzip2q_u16 |
| #undef vzip2q_s32 |
| #undef vzip2q_u32 |
| #undef vzip2q_f32 |
| #endif |
| |
| #undef HWY_NEON_BUILD_ARG_1 |
| #undef HWY_NEON_BUILD_ARG_2 |
| #undef HWY_NEON_BUILD_ARG_3 |
| #undef HWY_NEON_BUILD_PARAM_1 |
| #undef HWY_NEON_BUILD_PARAM_2 |
| #undef HWY_NEON_BUILD_PARAM_3 |
| #undef HWY_NEON_BUILD_RET_1 |
| #undef HWY_NEON_BUILD_RET_2 |
| #undef HWY_NEON_BUILD_RET_3 |
| #undef HWY_NEON_BUILD_TPL_1 |
| #undef HWY_NEON_BUILD_TPL_2 |
| #undef HWY_NEON_BUILD_TPL_3 |
| #undef HWY_NEON_DEF_FUNCTION |
| #undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS |
| #undef HWY_NEON_DEF_FUNCTION_ALL_TYPES |
| #undef HWY_NEON_DEF_FUNCTION_INT_8 |
| #undef HWY_NEON_DEF_FUNCTION_INT_16 |
| #undef HWY_NEON_DEF_FUNCTION_INT_32 |
| #undef HWY_NEON_DEF_FUNCTION_INT_8_16_32 |
| #undef HWY_NEON_DEF_FUNCTION_INTS |
| #undef HWY_NEON_DEF_FUNCTION_INTS_UINTS |
| #undef HWY_NEON_DEF_FUNCTION_TPL |
| #undef HWY_NEON_DEF_FUNCTION_UINT_8 |
| #undef HWY_NEON_DEF_FUNCTION_UINT_16 |
| #undef HWY_NEON_DEF_FUNCTION_UINT_32 |
| #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32 |
| #undef HWY_NEON_DEF_FUNCTION_UINTS |
| #undef HWY_NEON_EVAL |
| |
| // NOLINTNEXTLINE(google-readability-namespace-comments) |
| } // namespace HWY_NAMESPACE |
| } // namespace hwy |
| HWY_AFTER_NAMESPACE(); |