// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// 128-bit WASM vectors and operations.
// External include guard in highway.h - see comment there.
#include <stddef.h>
#include <stdint.h>
#include <wasm_simd128.h>
#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
#ifdef HWY_WASM_OLD_NAMES
#define wasm_i8x16_shuffle wasm_v8x16_shuffle
#define wasm_i16x8_shuffle wasm_v16x8_shuffle
#define wasm_i32x4_shuffle wasm_v32x4_shuffle
#define wasm_i64x2_shuffle wasm_v64x2_shuffle
#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;
template <typename T>
using Full64 = Simd<T, 8 / sizeof(T), 0>;
namespace detail {
template <typename T>
struct Raw128 {
using type = __v128_u;
};
template <>
struct Raw128<float> {
using type = __f32x4;
};
} // namespace detail
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
using Raw = typename detail::Raw128<T>::type;
public:
// Compound assignment. Only usable if there is a corresponding non-member
// binary operator overload. For example, only f32 and f64 support division.
HWY_INLINE Vec128& operator*=(const Vec128 other) {
return *this = (*this * other);
}
HWY_INLINE Vec128& operator/=(const Vec128 other) {
return *this = (*this / other);
}
HWY_INLINE Vec128& operator+=(const Vec128 other) {
return *this = (*this + other);
}
HWY_INLINE Vec128& operator-=(const Vec128 other) {
return *this = (*this - other);
}
HWY_INLINE Vec128& operator&=(const Vec128 other) {
return *this = (*this & other);
}
HWY_INLINE Vec128& operator|=(const Vec128 other) {
return *this = (*this | other);
}
HWY_INLINE Vec128& operator^=(const Vec128 other) {
return *this = (*this ^ other);
}
Raw raw;
};
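// Illustrative usage sketch (not part of the API): the compound operators
// above forward to the non-member operators defined later in this header, e.g.
//   Vec128<float> v = Set(Full128<float>(), 2.0f);
//   v += Set(Full128<float>(), 1.0f);  // each lane is now 3.0f
//   v /= Set(Full128<float>(), 2.0f);  // OK: operator/ exists for float here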
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;
// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
typename detail::Raw128<T>::type raw;
};
namespace detail {
// Deduce Simd<T, N, 0> from Vec128<T, N>
struct DeduceD {
template <typename T, size_t N>
Simd<T, N, 0> operator()(Vec128<T, N>) const {
return Simd<T, N, 0>();
}
};
} // namespace detail
template <class V>
using DFromV = decltype(detail::DeduceD()(V()));
template <class V>
using TFromV = TFromD<DFromV<V>>;
// ------------------------------ BitCast
namespace detail {
HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; }
HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) {
return static_cast<__v128_u>(v);
}
HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) {
return static_cast<__v128_u>(v);
}
template <typename T, size_t N>
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
}
// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger128 {
HWY_INLINE __v128_u operator()(__v128_u v) { return v; }
};
template <>
struct BitCastFromInteger128<float> {
HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); }
};
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N, 0> /* tag */,
Vec128<uint8_t, N * sizeof(T)> v) {
return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
}
} // namespace detail
template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}
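// Usage sketch (only ops from this header): reinterpret lane bits without any
// value conversion, e.g. to inspect the IEEE-754 encoding of float lanes:
//   const Full128<float> df;
//   const Full128<uint32_t> du;
//   const auto bits = BitCast(du, Set(df, 1.0f));  // each lane: 0x3F800000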
// ------------------------------ Zero
// Returns an all-zero vector/part.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
return Vec128<T, N>{wasm_i32x4_splat(0)};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
return Vec128<float, N>{wasm_f32x4_splat(0.0f)};
}
template <class D>
using VFromD = decltype(Zero(D()));
// ------------------------------ Set
// Returns a vector/part with all lanes set to "t".
template <size_t N, HWY_IF_LE128(uint8_t, N)>
HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */, const uint8_t t) {
return Vec128<uint8_t, N>{wasm_i8x16_splat(static_cast<int8_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
const uint16_t t) {
return Vec128<uint16_t, N>{wasm_i16x8_splat(static_cast<int16_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
const uint32_t t) {
return Vec128<uint32_t, N>{wasm_i32x4_splat(static_cast<int32_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
const uint64_t t) {
return Vec128<uint64_t, N>{wasm_i64x2_splat(static_cast<int64_t>(t))};
}
template <size_t N, HWY_IF_LE128(int8_t, N)>
HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
return Vec128<int8_t, N>{wasm_i8x16_splat(t)};
}
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */, const int16_t t) {
return Vec128<int16_t, N>{wasm_i16x8_splat(t)};
}
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */, const int32_t t) {
return Vec128<int32_t, N>{wasm_i32x4_splat(t)};
}
template <size_t N, HWY_IF_LE128(int64_t, N)>
HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */, const int64_t t) {
return Vec128<int64_t, N>{wasm_i64x2_splat(t)};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
return Vec128<float, N>{wasm_f32x4_splat(t)};
}
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
// Returns a vector with uninitialized elements.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
return Zero(d);
}
HWY_DIAGNOSTICS(pop)
// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, size_t N, typename T2>
Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
HWY_ALIGN T lanes[16 / sizeof(T)];
for (size_t i = 0; i < 16 / sizeof(T); ++i) {
lanes[i] = static_cast<T>(first + static_cast<T2>(i));
}
return Load(d, lanes);
}
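// Usage sketch (lane values shown for a full 128-bit vector of int32):
//   const Full128<int32_t> d;
//   const auto v = Iota(d, 1) + Set(d, 10);  // lanes: 11, 12, 13, 14
//   const auto z = Zero(d);                  // lanes: 0, 0, 0, 0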
// ================================================== ARITHMETIC
// ------------------------------ Addition
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
const Vec128<uint64_t, N> b) {
return Vec128<uint64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Vec128<int32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
const Vec128<int64_t, N> b) {
return Vec128<int64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
}
// Float
template <size_t N>
HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Vec128<float, N>{wasm_f32x4_add(a.raw, b.raw)};
}
// ------------------------------ Subtraction
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
const Vec128<uint64_t, N> b) {
return Vec128<uint64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Vec128<int32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
const Vec128<int64_t, N> b) {
return Vec128<int64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
}
// Float
template <size_t N>
HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Vec128<float, N>{wasm_f32x4_sub(a.raw, b.raw)};
}
// ------------------------------ SaturatedAdd
// Returns a + b clamped to the destination range.
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
}
// ------------------------------ SaturatedSub
// Returns a - b clamped to the destination range.
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
}
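// Saturation sketch for uint8 lanes (results clamp to [0, 255]):
//   const Full128<uint8_t> d;
//   SaturatedAdd(Set(d, 200), Set(d, 100));  // lanes: 255, not (300 & 0xFF) = 44
//   SaturatedSub(Set(d, 10), Set(d, 20));    // lanes: 0, not wrapped 246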
// ------------------------------ AverageRound
// Returns (a + b + 1) / 2
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_u8x16_avgr(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)};
}
// ------------------------------ Absolute value
// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
template <size_t N>
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
return Vec128<int8_t, N>{wasm_i8x16_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
return Vec128<int16_t, N>{wasm_i16x8_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
return Vec128<int64_t, N>{wasm_i64x2_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
return Vec128<float, N>{wasm_f32x4_abs(v.raw)};
}
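// Example of the wrap-around noted above, for int8 lanes:
//   const Full128<int8_t> d;
//   Abs(Set(d, int8_t{-128}));  // lanes stay -128 (+128 is not representable)
//   Abs(Set(d, int8_t{-5}));    // lanes: 5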
// ------------------------------ Shift lanes by constant #bits
// Unsigned
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, kBits)};
}
// Signed
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, kBits)};
}
// 8-bit
template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
const DFromV<decltype(v)> d8;
// Use raw instead of BitCast to support N=1.
const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
return kBits == 1
? (v + v)
: (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}
template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
const DFromV<decltype(v)> d8;
// Use raw instead of BitCast to support N=1.
const Vec128<uint8_t, N> shifted{
ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
return shifted & Set(d8, 0xFF >> kBits);
}
template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
const DFromV<decltype(v)> di;
const RebindToUnsigned<decltype(di)> du;
const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
return (shifted ^ shifted_sign) - shifted_sign;
}
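// Sketch of the 8-bit shifts above: WASM has no 8-bit shift, so the vector is
// reinterpreted as 16-bit lanes, shifted, and then masked to clear the bits
// that crossed over from the neighboring byte. E.g. with kBits = 3, unsigned
// left shift keeps (lane << 3) & 0xF8 and unsigned right shift keeps
// (lane >> 3) & 0x1F; the signed right shift restores the sign afterwards via
// (x ^ sign) - sign with sign = 0x80 >> kBits.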
// ------------------------------ RotateRight (ShiftRight, Or)
template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
constexpr size_t kSizeInBits = sizeof(T) * 8;
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
if (kBits == 0) return v;
return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
}
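// Usage sketch:
//   const Full128<uint32_t> d;
//   RotateRight<8>(Set(d, 0x12345678u));  // each lane: 0x78123456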
// ------------------------------ Shift lanes by same variable #bits
// After https://reviews.llvm.org/D108415, the shift argument became unsigned.
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
// Unsigned
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
const int bits) {
return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
const int bits) {
return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
const int bits) {
return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
const int bits) {
return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
const int bits) {
return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
const int bits) {
return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, bits)};
}
// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
const int bits) {
return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
const int bits) {
return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
const int bits) {
return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
const int bits) {
return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
const int bits) {
return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
const int bits) {
return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, bits)};
}
// 8-bit
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
const DFromV<decltype(v)> d8;
// Use raw instead of BitCast to support N=1.
const Vec128<T, N> shifted{
ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
}
template <size_t N>
HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
const int bits) {
const DFromV<decltype(v)> d8;
// Use raw instead of BitCast to support N=1.
const Vec128<uint8_t, N> shifted{
ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
return shifted & Set(d8, 0xFF >> bits);
}
template <size_t N>
HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
const DFromV<decltype(v)> di;
const RebindToUnsigned<decltype(di)> du;
const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
return (shifted ^ shifted_sign) - shifted_sign;
}
HWY_DIAGNOSTICS(pop)  // -Wsign-conversion
// ------------------------------ Minimum
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
// Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)};
return Vec128<uint64_t, N>{wasm_v128_load(min)};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
alignas(16) int64_t min[2];
min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
wasm_i64x2_extract_lane(b.raw, 0));
min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
wasm_i64x2_extract_lane(b.raw, 1));
return Vec128<int64_t, N>{wasm_v128_load(min)};
}
// Float
template <size_t N>
HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) {
return Vec128<float, N>{wasm_f32x4_min(a.raw, b.raw)};
}
// ------------------------------ Maximum
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
// Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)};
return Vec128<uint64_t, N>{wasm_v128_load(max)};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{wasm_i16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
return Vec128<int32_t, N>{wasm_i32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
alignas(16) int64_t max[2];
max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
wasm_i64x2_extract_lane(b.raw, 0));
max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
wasm_i64x2_extract_lane(b.raw, 1));
return Vec128<int64_t, N>{wasm_v128_load(max)};
}
// Float
template <size_t N>
HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) {
return Vec128<float, N>{wasm_f32x4_max(a.raw, b.raw)};
}
// ------------------------------ Integer multiplication
// Unsigned
template <size_t N>
HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
}
// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
}
// Returns the upper 16 bits of a * b in each lane.
template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
// TODO(eustas): replace, when implemented in WASM.
const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
const auto l = wasm_i32x4_mul(al, bl);
const auto h = wasm_i32x4_mul(ah, bh);
// TODO(eustas): shift-right + narrow?
return Vec128<uint16_t, N>{
wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
// TODO(eustas): replace, when implemented in WASM.
const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
const auto l = wasm_i32x4_mul(al, bl);
const auto h = wasm_i32x4_mul(ah, bh);
// TODO(eustas): shift-right + narrow?
return Vec128<int16_t, N>{
wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
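// MulHigh sketch: the products are formed in widened 32-bit lanes and the
// shuffle gathers their upper 16-bit halves (the odd 16-bit lanes). E.g.:
//   const Full128<uint16_t> d;
//   MulHigh(Set(d, uint16_t{0xFFFF}), Set(d, uint16_t{0xFFFF}));
//   // lanes: 0xFFFE, the upper half of the full product 0xFFFE0001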
// Multiplies even lanes (0, 2 ..) and returns the double-width result.
template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
// TODO(eustas): replace, when implemented in WASM.
const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
const auto ae = wasm_v128_and(a.raw, kEvenMask);
const auto be = wasm_v128_and(b.raw, kEvenMask);
return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
// TODO(eustas): replace, when implemented in WASM.
const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
const auto ae = wasm_v128_and(a.raw, kEvenMask);
const auto be = wasm_v128_and(b.raw, kEvenMask);
return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}
// ------------------------------ Negate
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
return Xor(v, SignBit(DFromV<decltype(v)>()));
}
template <size_t N>
HWY_API Vec128<int8_t, N> Neg(const Vec128<int8_t, N> v) {
return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Neg(const Vec128<int16_t, N> v) {
return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Neg(const Vec128<int32_t, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Neg(const Vec128<int64_t, N> v) {
return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)};
}
// ------------------------------ Floating-point mul / div
template <size_t N>
HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
return Vec128<float, N>{wasm_f32x4_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
}
// Approximate reciprocal
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
return one / v;
}
// Absolute value of difference.
template <size_t N>
HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Abs(a - b);
}
// ------------------------------ Floating-point multiply-add variants
// Returns mul * x + add
template <size_t N>
HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
const Vec128<float, N> x,
const Vec128<float, N> add) {
// TODO(eustas): replace, when implemented in WASM.
// TODO(eustas): is it wasm_f32x4_qfma?
return mul * x + add;
}
// Returns add - mul * x
template <size_t N>
HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
const Vec128<float, N> x,
const Vec128<float, N> add) {
// TODO(eustas): replace, when implemented in WASM.
return add - mul * x;
}
// Returns mul * x - sub
template <size_t N>
HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
const Vec128<float, N> x,
const Vec128<float, N> sub) {
// TODO(eustas): replace, when implemented in WASM.
// TODO(eustas): is it wasm_f32x4_qfms?
return mul * x - sub;
}
// Returns -mul * x - sub
template <size_t N>
HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
const Vec128<float, N> x,
const Vec128<float, N> sub) {
// TODO(eustas): replace, when implemented in WASM.
return Neg(mul) * x - sub;
}
// ------------------------------ Floating-point square root
// Full precision square root
template <size_t N>
HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)};
}
// Approximate reciprocal square root
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
  // TODO(eustas): find a cheaper way to calculate this.
const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
return one / Sqrt(v);
}
// ------------------------------ Floating-point rounding
// Toward nearest integer, ties to even
template <size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
return Vec128<float, N>{wasm_f32x4_nearest(v.raw)};
}
// Toward zero, aka truncate
template <size_t N>
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
return Vec128<float, N>{wasm_f32x4_trunc(v.raw)};
}
// Toward +infinity, aka ceiling
template <size_t N>
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
return Vec128<float, N>{wasm_f32x4_ceil(v.raw)};
}
// Toward -infinity, aka floor
template <size_t N>
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
return Vec128<float, N>{wasm_f32x4_floor(v.raw)};
}
// ================================================== COMPARE
// Comparisons fill a lane with 1-bits if the condition is true, else 0.
template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
Mask128<TFrom, N> m) {
static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
return Mask128<TTo, N>{m.raw};
}
template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
return (v & bit) == bit;
}
// ------------------------------ Equality
// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Mask128<uint8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Mask128<uint16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Mask128<uint32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
const Vec128<uint64_t, N> b) {
return Mask128<uint64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
}
// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Mask128<int8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
Vec128<int16_t, N> b) {
return Mask128<int16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Mask128<int32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
const Vec128<int64_t, N> b) {
return Mask128<int64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
}
// Float
template <size_t N>
HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Mask128<float, N>{wasm_f32x4_eq(a.raw, b.raw)};
}
// ------------------------------ Inequality
// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator!=(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator!=(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator!=(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator!=(const Vec128<uint64_t, N> a,
const Vec128<uint64_t, N> b) {
return Mask128<uint64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
}
// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator!=(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator!=(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator!=(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator!=(const Vec128<int64_t, N> a,
const Vec128<int64_t, N> b) {
return Mask128<int64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
}
// Float
template <size_t N>
HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)};
}
// ------------------------------ Strict inequality
template <size_t N>
HWY_API Mask128<int8_t, N> operator>(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Mask128<int8_t, N>{wasm_i8x16_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Mask128<int16_t, N>{wasm_i16x8_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Mask128<int32_t, N>{wasm_i32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
const Vec128<int64_t, N> b) {
return Mask128<int64_t, N>{wasm_i64x2_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint8_t, N> operator>(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Mask128<uint8_t, N>{wasm_u8x16_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Mask128<uint16_t, N>{wasm_u16x8_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Mask128<uint32_t, N>{wasm_u32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>(const Vec128<uint64_t, N> a,
const Vec128<uint64_t, N> b) {
const DFromV<decltype(a)> d;
const Repartition<uint32_t, decltype(d)> d32;
const auto a32 = BitCast(d32, a);
const auto b32 = BitCast(d32, b);
// If the upper halves are not equal, this is the answer.
const auto m_gt = a32 > b32;
// Otherwise, the lower half decides.
const auto m_eq = a32 == b32;
const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
const auto lo_gt = And(m_eq, MaskFromVec(VFromD<decltype(d32)>{lo_in_hi}));
const auto gt = Or(lo_gt, m_gt);
// Copy result in upper 32 bits to lower 32 bits.
return Mask128<uint64_t, N>{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)};
}
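// Worked example for the 64-bit comparison above (lanes written as hi:lo):
//   a = 1:0, b = 0:5 -> upper halves differ, so m_gt alone decides: a > b.
//   a = 7:2, b = 7:9 -> upper halves equal, so the lower-half comparison
//                       (2 > 9 is false) decides: a > b is false.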
template <size_t N>
HWY_API Mask128<float, N> operator>(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Mask128<float, N>{wasm_f32x4_gt(a.raw, b.raw)};
}
template <typename T, size_t N>
HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
return operator>(b, a);
}
// ------------------------------ Weak inequality
// Float <= >=
template <size_t N>
HWY_API Mask128<float, N> operator<=(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Mask128<float, N>{wasm_f32x4_le(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
}
// ------------------------------ FirstN (Iota, Lt)
template <typename T, size_t N>
HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper.
return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
}
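// Usage sketch (4 x int32 lanes):
//   const Full128<int32_t> d;
//   const auto m = FirstN(d, 2);  // lanes: true, true, false, false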
// ================================================== LOGICAL
// ------------------------------ Not
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
return Vec128<T, N>{wasm_v128_not(v.raw)};
}
// ------------------------------ And
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
}
// ------------------------------ AndNot
// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
}
// ------------------------------ Or
template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
}
// ------------------------------ Xor
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
}
// ------------------------------ OrAnd
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
return Or(o, And(a1, a2));
}
// ------------------------------ IfVecThenElse
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
Vec128<T, N> no) {
return IfThenElse(MaskFromVec(mask), yes, no);
}
// ------------------------------ Operator overloads (internal-only if float)
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
return And(a, b);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
return Or(a, b);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
return Xor(a, b);
}
// ------------------------------ CopySign
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
const Vec128<T, N> sign) {
static_assert(IsFloat<T>(), "Only makes sense for floating-point");
const auto msb = SignBit(DFromV<decltype(magn)>());
return Or(AndNot(msb, magn), And(msb, sign));
}
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
const Vec128<T, N> sign) {
static_assert(IsFloat<T>(), "Only makes sense for floating-point");
return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
}
// ------------------------------ BroadcastSignBit (compare)
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
return ShiftRight<sizeof(T) * 8 - 1>(v);
}
template <size_t N>
HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
const DFromV<decltype(v)> d;
return VecFromMask(d, v < Zero(d));
}
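// Sketch: every bit of each lane becomes a copy of that lane's sign bit.
//   const Full128<int32_t> d;
//   BroadcastSignBit(Set(d, -5));  // lanes: 0xFFFFFFFF (all bits set)
//   BroadcastSignBit(Set(d, 5));   // lanes: 0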
// ------------------------------ Mask
// Mask and Vec are the same (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
return Mask128<T, N>{v.raw};
}
template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */, Mask128<T, N> v) {
return Vec128<T, N>{v.raw};
}
// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
Vec128<T, N> no) {
return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
}
// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}
// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
Vec128<T, N> no) {
static_assert(IsSigned<T>(), "Only works for signed/float");
const DFromV<decltype(v)> d;
const RebindToSigned<decltype(d)> di;
v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
return IfThenElse(MaskFromVec(v), yes, no);
}
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
const DFromV<decltype(v)> d;
const auto zero = Zero(d);
return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
}
// ------------------------------ Mask logical
template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
// The x86 multiply-by-Pow2() trick will not work because WASM saturates
// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
// scalar count operand, per-lane shifts would require an extract_lane for each
// lane and hope that the rebuilding shuffle maps to a native instruction.
// Using non-vector shifts would incur a store-load forwarding
// stall when loading the result vector. We instead test bits of the shift
// count to "predicate" a shift of the entire vector by a constant.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
const DFromV<decltype(v)> d;
Mask128<T, N> mask;
// Need a signed type for BroadcastSignBit.
auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
// Move the highest valid bit of the shift count into the sign bit.
test = ShiftLeft<12>(test);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftLeft<8>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftLeft<4>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftLeft<2>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
return IfThenElse(mask, ShiftLeft<1>(v), v);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
const DFromV<decltype(v)> d;
Mask128<T, N> mask;
// Need a signed type for BroadcastSignBit.
auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
// Move the highest valid bit of the shift count into the sign bit.
test = ShiftLeft<27>(test);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftLeft<16>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftLeft<8>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftLeft<4>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftLeft<2>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
return IfThenElse(mask, ShiftLeft<1>(v), v);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
const DFromV<decltype(v)> d;
alignas(16) T lanes[2];
alignas(16) T bits_lanes[2];
Store(v, d, lanes);
Store(bits, d, bits_lanes);
lanes[0] <<= bits_lanes[0];
lanes[1] <<= bits_lanes[1];
return Load(d, lanes);
}
// ------------------------------ Shr (BroadcastSignBit, IfThenElse)
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
const DFromV<decltype(v)> d;
Mask128<T, N> mask;
// Need a signed type for BroadcastSignBit.
auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
// Move the highest valid bit of the shift count into the sign bit.
test = ShiftLeft<12>(test);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftRight<8>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftRight<4>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftRight<2>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
return IfThenElse(mask, ShiftRight<1>(v), v);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
const DFromV<decltype(v)> d;
Mask128<T, N> mask;
// Need a signed type for BroadcastSignBit.
auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
// Move the highest valid bit of the shift count into the sign bit.
test = ShiftLeft<27>(test);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftRight<16>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftRight<8>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftRight<4>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftRight<2>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
return IfThenElse(mask, ShiftRight<1>(v), v);
}
// ================================================== MEMORY
// ------------------------------ Load
template <typename T>
HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
return Vec128<T>{wasm_v128_load(aligned)};
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
const T* HWY_RESTRICT aligned) {
return IfThenElseZero(m, Load(d, aligned));
}
// Partial load.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
Vec128<T, N> v;
CopyBytes<sizeof(T) * N>(p, &v);
return v;
}
// LoadU == Load.
template <typename T, size_t N>
HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
return Load(d, p);
}
// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
return Load(d, p);
}
// ------------------------------ Store
template <typename T>
HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
wasm_v128_store(aligned, v.raw);
}
// Partial store.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
CopyBytes<sizeof(T) * N>(&v, p);
}
HWY_API void Store(const Vec128<float, 1> v, Simd<float, 1, 0> /* tag */,
float* HWY_RESTRICT p) {
*p = wasm_f32x4_extract_lane(v.raw, 0);
}
// StoreU == Store.
template <typename T, size_t N>
HWY_API void StoreU(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
Store(v, d, p);
}
// ------------------------------ Non-temporal stores
// Same as aligned stores on non-x86.
template <typename T, size_t N>
HWY_API void Stream(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
T* HWY_RESTRICT aligned) {
wasm_v128_store(aligned, v.raw);
}
// ------------------------------ Scatter (Store)
template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
T* HWY_RESTRICT base,
const Vec128<Offset, N> offset) {
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
alignas(16) T lanes[N];
Store(v, d, lanes);
alignas(16) Offset offset_lanes[N];
Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
for (size_t i = 0; i < N; ++i) {
CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
}
}
template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
const Vec128<Index, N> index) {
static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
alignas(16) T lanes[N];
Store(v, d, lanes);
alignas(16) Index index_lanes[N];
Store(index, Rebind<Index, decltype(d)>(), index_lanes);
for (size_t i = 0; i < N; ++i) {
base[index_lanes[i]] = lanes[i];
}
}
// ------------------------------ Gather (Load/Store)
template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
const T* HWY_RESTRICT base,
const Vec128<Offset, N> offset) {
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
alignas(16) Offset offset_lanes[N];
Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
alignas(16) T lanes[N];
const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
for (size_t i = 0; i < N; ++i) {
CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
}
return Load(d, lanes);
}
template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
const T* HWY_RESTRICT base,
const Vec128<Index, N> index) {
static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
alignas(16) Index index_lanes[N];
Store(index, Rebind<Index, decltype(d)>(), index_lanes);
alignas(16) T lanes[N];
for (size_t i = 0; i < N; ++i) {
lanes[i] = base[index_lanes[i]];
}
return Load(d, lanes);
}
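// Usage sketch with a hypothetical lookup table (indices are int32 here):
//   const Full128<int32_t> d;
//   alignas(16) const int32_t table[8] = {0, 10, 20, 30, 40, 50, 60, 70};
//   const auto idx = ShiftLeft<1>(Iota(d, 0));   // lanes: 0, 2, 4, 6
//   const auto v = GatherIndex(d, table, idx);   // lanes: 0, 20, 40, 60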
// ================================================== SWIZZLE
// ------------------------------ Extract lane
// Returns the value of lane 0 (the single value for a 1-lane vector/part).
template <size_t N>
HWY_API uint8_t GetLane(const Vec128<uint8_t, N> v) {
return static_cast<uint8_t>(wasm_i8x16_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API int8_t GetLane(const Vec128<int8_t, N> v) {
return static_cast<int8_t>(wasm_i8x16_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API uint16_t GetLane(const Vec128<uint16_t, N> v) {
return static_cast<uint16_t>(wasm_i16x8_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API int16_t GetLane(const Vec128<int16_t, N> v) {
return static_cast<int16_t>(wasm_i16x8_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API uint32_t GetLane(const Vec128<uint32_t, N> v) {
return static_cast<uint32_t>(wasm_i32x4_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API int32_t GetLane(const Vec128<int32_t, N> v) {
return static_cast<int32_t>(wasm_i32x4_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
return static_cast<uint64_t>(wasm_i64x2_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
return static_cast<int64_t>(wasm_i64x2_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API float GetLane(const Vec128<float, N> v) {
return wasm_f32x4_extract_lane(v.raw, 0);
}
// ------------------------------ LowerHalf
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
Vec128<T, N> v) {
return Vec128<T, N / 2>{v.raw};
}
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
return LowerHalf(Simd<T, N / 2, 0>(), v);
}
// ------------------------------ ShiftLeftBytes
// 0x01..0F, kBytes = 1 => 0x02..0F00
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
const __i8x16 zero = wasm_i8x16_splat(0);
switch (kBytes) {
case 0:
return v;
case 1:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14)};
case 2:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13)};
case 3:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};
case 4:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};
case 5:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};
case 6:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
case 7:
return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
case 8:
return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
case 9:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 16, 16, 16, 0, 1, 2, 3, 4, 5,
6)};
case 10:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 0, 1, 2, 3, 4,
5)};
case 11:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 0, 1, 2, 3,
4)};
case 12:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 0, 1,
2, 3)};
case 13:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 0,
1, 2)};
case 14:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
0, 1)};
case 15:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 0)};
}
return Vec128<T, N>{zero};
}
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(), v);
}
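// Usage sketch (illustrative comment, not library code; assumes the Iota op
// defined elsewhere in this header): byte shifts move data toward higher
// byte indices and shift in zeros, e.g.
//   const Full128<uint8_t> d;
//   const auto v = Iota(d, 1);               // {1, 2, ..., 16}
//   const auto r = ShiftLeftBytes<1>(d, v);  // {0, 1, 2, ..., 15}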
// ------------------------------ ShiftLeftLanes
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
const Repartition<uint8_t, decltype(d)> d8;
return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}
// ------------------------------ ShiftRightBytes
namespace detail {
// Helper function; the caller zeroes invalid (upper) lanes of partial
// vectors first, so that only zeros are shifted in.
template <int kBytes, typename T, size_t N>
HWY_API __i8x16 ShrBytes(const Vec128<T, N> v) {
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
const __i8x16 zero = wasm_i8x16_splat(0);
switch (kBytes) {
case 0:
return v.raw;
case 1:
return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16);
case 2:
return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 16);
case 3:
return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 16, 16);
case 4:
return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 16, 16, 16);
case 5:
return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 16, 16, 16, 16);
case 6:
return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 16, 16, 16, 16, 16);
case 7:
return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 16, 16, 16, 16, 16, 16);
case 8:
return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
16, 16, 16, 16, 16, 16, 16);
case 9:
return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
16, 16, 16, 16, 16, 16, 16);
case 10:
return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16);
case 11:
return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16);
case 12:
return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16);
case 13:
return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16);
case 14:
return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16);
case 15:
return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16);
case 16:
return zero;
}
}
} // namespace detail
// 0x01..0F, kBytes = 1 => 0x0001..0E
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
// For partial vectors, clear upper lanes so we shift in zeros.
if (N != 16 / sizeof(T)) {
const Vec128<T> vfull{v.raw};
v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
}
return Vec128<T, N>{detail::ShrBytes<kBytes>(v)};
}
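// Usage sketch (illustrative comment, not library code): for partial vectors
// the unused upper bytes are zeroed first, so only zeros are shifted in:
//   const Simd<uint8_t, 4, 0> d;              // 4-byte vector
//   const auto v = Iota(d, 1);                // {1, 2, 3, 4}
//   const auto r = ShiftRightBytes<1>(d, v);  // {2, 3, 4, 0}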
// ------------------------------ ShiftRightLanes
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
const Repartition<uint8_t, decltype(d)> d8;
return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
}
// ------------------------------ UpperHalf (ShiftRightBytes)
// Full input: copy hi into lo (smaller instruction encoding than shifts).
template <typename T>
HWY_API Vec64<T> UpperHalf(Full64<T> /* tag */, const Vec128<T> v) {
return Vec64<T>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
}
HWY_API Vec64<float> UpperHalf(Full64<float> /* tag */, const Vec128<float> v) {
return Vec64<float>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
}
// Partial
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
Vec128<T, N> v) {
const DFromV<decltype(v)> d;
const RebindToUnsigned<decltype(d)> du;
const auto vu = BitCast(du, v);
const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
return Vec128<T, (N + 1) / 2>{upper.raw};
}
// ------------------------------ CombineShiftRightBytes
template <int kBytes, typename T, class V = Vec128<T>>
HWY_API V CombineShiftRightBytes(Full128<T> /* tag */, V hi, V lo) {
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
switch (kBytes) {
case 0:
return lo;
case 1:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16)};
case 2:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17)};
case 3:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18)};
case 4:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19)};
case 5:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20)};
case 6:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21)};
case 7:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22)};
case 8:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23)};
case 9:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24)};
case 10:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25)};
case 11:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26)};
case 12:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
19, 20, 21, 22, 23, 24, 25, 26, 27)};
case 13:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28)};
case 14:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26, 27, 28, 29)};
case 15:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28, 29, 30)};
}
return hi;
}
template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
class V = Vec128<T, N>>
HWY_API V CombineShiftRightBytes(Simd<T, N, 0> d, V hi, V lo) {
constexpr size_t kSize = N * sizeof(T);
static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
const Repartition<uint8_t, decltype(d)> d8;
const Full128<uint8_t> d_full8;
using V8 = VFromD<decltype(d_full8)>;
const V8 hi8{BitCast(d8, hi).raw};
// Move into most-significant bytes
const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
return V{BitCast(Full128<T>(), r).raw};
}
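// Worked example of the partial-vector path above (explanatory comment):
// for kSize == 8 and kBytes == 2, lo is first moved into the upper 8 bytes
// of a full vector (ShiftLeftBytes<8>), then the full-vector
// CombineShiftRightBytes<10> produces {lo[2..7], hi[0..1]} in its lower
// 8 bytes, which is exactly the 64-bit result.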
// ------------------------------ Broadcast/splat any lane
template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
kLane, kLane, kLane, kLane, kLane)};
}
template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<T, N>{
wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}
template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<T, N>{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)};
}
// ------------------------------ TableLookupBytes
// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
// lane indices in [0, 16).
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
const Vec128<TI, NI> from) {
// Not yet available in all engines, see
// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
// V8 implementation of this had a bug, fixed on 2021-04-03:
// https://chromium-review.googlesource.com/c/v8/v8/+/2822951
#if 0
return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
#else
alignas(16) uint8_t control[16];
alignas(16) uint8_t input[16];
alignas(16) uint8_t output[16];
wasm_v128_store(control, from.raw);
wasm_v128_store(input, bytes.raw);
for (size_t i = 0; i < 16; ++i) {
output[i] = control[i] < 16 ? input[control[i]] : 0;
}
return Vec128<TI, NI>{wasm_v128_load(output)};
#endif
}
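// Usage sketch (illustrative comment, not library code; assumes Iota/Load
// from elsewhere in this header): reversing bytes via an index vector:
//   const Full128<uint8_t> d;
//   alignas(16) static constexpr uint8_t kRev[16] = {
//       15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
//   const auto bytes = Iota(d, 0);                            // {0, ..., 15}
//   const auto rev = TableLookupBytes(bytes, Load(d, kRev));  // {15, ..., 0}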
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> bytes,
const Vec128<TI, NI> from) {
const Simd<TI, NI, 0> d;
// Mask size must match vector type, so cast everything to this type.
Repartition<int8_t, decltype(d)> di8;
Repartition<int8_t, Simd<T, N, 0>> d_bytes8;
const auto msb = BitCast(di8, from) < Zero(di8);
const auto lookup =
TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
return BitCast(d, IfThenZeroElse(msb, lookup));
}
// ------------------------------ Hard-coded shuffles
// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
// Swap 32-bit halves in 64-bit halves.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
static_assert(N == 2 || N == 4, "Does not make sense for N=1");
return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}
// Swap 64-bit halves
template <typename T>
HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
static_assert(sizeof(T) == 8, "Only for 64-bit lanes");
return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}
template <typename T>
HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}
// Rotate right 32 bits
template <typename T>
HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
}
// Rotate left 32 bits
template <typename T>
HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
}
// Reverse
template <typename T>
HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
}
// ------------------------------ TableLookupLanes
// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T, size_t N>
struct Indices128 {
__v128_u raw;
};
template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
const Rebind<TI, decltype(d)> di;
HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
#endif
const Repartition<uint8_t, decltype(d)> d8;
using V8 = VFromD<decltype(d8)>;
const Repartition<uint16_t, decltype(d)> d16;
// Broadcast each lane index to all bytes of T and shift to bytes
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
if (sizeof(T) == 4) {
alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
const V8 lane_indices =
TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
const V8 byte_indices =
BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
0, 1, 2, 3, 0, 1, 2, 3};
return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
} else {
alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
const V8 lane_indices =
TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
const V8 byte_indices =
BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices)));
alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
0, 1, 2, 3, 4, 5, 6, 7};
return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
}
}
template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
const Rebind<TI, decltype(d)> di;
return IndicesFromVec(d, LoadU(di, idx));
}
template <typename T, size_t N>
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
using TI = MakeSigned<T>;
const DFromV<decltype(v)> d;
const Rebind<TI, decltype(d)> di;
return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128<TI, N>{idx.raw}));
}
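// Worked example of the index encoding above (explanatory comment): for
// 32-bit lanes, IndicesFromVec turns lane index 3 into byte indices
// {12, 13, 14, 15}: the index is broadcast to all four bytes of its lane,
// multiplied by sizeof(T) via ShiftLeft<2>, and per-byte offsets 0..3 are
// added. A hypothetical permutation then looks like:
//   const Full128<uint32_t> d;
//   constexpr int32_t kIdx[4] = {3, 2, 1, 0};
//   const auto r = TableLookupLanes(Iota(d, 0), SetTableIndices(d, kIdx));
//   // r = {3, 2, 1, 0}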
// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
// Single lane: no change
template <typename T>
HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
return v;
}
// Two lanes: shuffle
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, 2> Reverse(Simd<T, 2, 0> /* tag */, const Vec128<T, 2> v) {
return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
return Shuffle01(v);
}
// Four lanes: shuffle
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
return Shuffle0123(v);
}
// 16-bit
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
}
// ------------------------------ Reverse2
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
return BitCast(d, RotateRight<16>(BitCast(du32, v)));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
return Shuffle2301(v);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
return Shuffle01(v);
}
// ------------------------------ Reverse4
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
return BitCast(d, Vec128<uint16_t, N>{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2,
1, 0, 7, 6, 5, 4)});
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
return Shuffle0123(v);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N>) {
  HWY_ASSERT(0);  // don't have 4 u64 lanes in 128 bits
}
// ------------------------------ Reverse8
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
return Reverse(d, v);
}
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0>, const Vec128<T, N>) {
HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
}
// ------------------------------ InterleaveLower
template <size_t N>
HWY_API Vec128<uint8_t, N> InterleaveLower(Vec128<uint8_t, N> a,
Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_i8x16_shuffle(
a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> InterleaveLower(Vec128<uint16_t, N> a,
Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{
wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> InterleaveLower(Vec128<uint32_t, N> a,
Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> InterleaveLower(Vec128<uint64_t, N> a,
Vec128<uint64_t, N> b) {
return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
}
template <size_t N>
HWY_API Vec128<int8_t, N> InterleaveLower(Vec128<int8_t, N> a,
Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_shuffle(
a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> InterleaveLower(Vec128<int16_t, N> a,
Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{
wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> InterleaveLower(Vec128<int32_t, N> a,
Vec128<int32_t, N> b) {
return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> InterleaveLower(Vec128<int64_t, N> a,
Vec128<int64_t, N> b) {
return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
}
template <size_t N>
HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
Vec128<float, N> b) {
return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}
// Additional overload for the optional tag.
template <class V>
HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
return InterleaveLower(a, b);
}
// ------------------------------ InterleaveUpper (UpperHalf)
// All functions inside detail lack the required D parameter.
namespace detail {
template <size_t N>
HWY_API Vec128<uint8_t, N> InterleaveUpper(Vec128<uint8_t, N> a,
Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
26, 11, 27, 12, 28, 13, 29, 14,
30, 15, 31)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> InterleaveUpper(Vec128<uint16_t, N> a,
Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{
wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> InterleaveUpper(Vec128<uint32_t, N> a,
Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> InterleaveUpper(Vec128<uint64_t, N> a,
Vec128<uint64_t, N> b) {
return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
}
template <size_t N>
HWY_API Vec128<int8_t, N> InterleaveUpper(Vec128<int8_t, N> a,
Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
26, 11, 27, 12, 28, 13, 29, 14,
30, 15, 31)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> InterleaveUpper(Vec128<int16_t, N> a,
Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{
wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> InterleaveUpper(Vec128<int32_t, N> a,
Vec128<int32_t, N> b) {
return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> InterleaveUpper(Vec128<int64_t, N> a,
Vec128<int64_t, N> b) {
return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
}
template <size_t N>
HWY_API Vec128<float, N> InterleaveUpper(Vec128<float, N> a,
Vec128<float, N> b) {
return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}
} // namespace detail
// Full
template <typename T, class V = Vec128<T>>
HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
return detail::InterleaveUpper(a, b);
}
// Partial
template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
const Half<decltype(d)> d2;
return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
}
// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
// Same as Interleave*, except that the return lanes are double-width integers;
// this is necessary because the single-lane scalar cannot return two values.
template <class V, class DW = RepartitionToWide<DFromV<V>>>
HWY_API VFromD<DW> ZipLower(V a, V b) {
return BitCast(DW(), InterleaveLower(a, b));
}
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
return BitCast(dw, InterleaveLower(D(), a, b));
}
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
return BitCast(dw, InterleaveUpper(D(), a, b));
}
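// Usage sketch (illustrative comment, not library code): zipping two u8
// vectors yields u16 lanes whose lower byte comes from a and upper byte
// from b (lanes are little-endian):
//   const Full128<uint8_t> d8;
//   const RepartitionToWide<decltype(d8)> d16;
//   const auto z = ZipLower(d16, Set(d8, 1), Set(d8, 2));  // each u16 == 0x0201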
// ================================================== COMBINE
// ------------------------------ Combine (InterleaveLower)
// N = N/2 + N/2 (upper half undefined)
template <typename T, size_t N>
HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
Vec128<T, N / 2> lo_half) {
const Half<decltype(d)> d2;
const RebindToUnsigned<decltype(d2)> du2;
// Treat half-width input as one lane, and expand to two lanes.
using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
const VU lo{BitCast(du2, lo_half).raw};
const VU hi{BitCast(du2, hi_half).raw};
return BitCast(d, InterleaveLower(lo, hi));
}
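// Worked example of the trick above (explanatory comment): each half-width
// input is viewed as a single wide lane, so InterleaveLower(lo, hi) places
// lo_half in the lower half and hi_half in the upper half of the result:
//   const Full128<uint32_t> d;
//   const Half<decltype(d)> dh;
//   const auto v = Combine(d, Set(dh, 2u), Set(dh, 1u));  // {1, 1, 2, 2}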
// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
template <typename T, size_t N>
HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
}
// ------------------------------ ConcatLowerLower
// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
template <typename T>
HWY_API Vec128<T> ConcatLowerLower(Full128<T> /* tag */, const Vec128<T> hi,
const Vec128<T> lo) {
return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
const Vec128<T, N> lo) {
const Half<decltype(d)> d2;
return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
}
// ------------------------------ ConcatUpperUpper
template <typename T>
HWY_API Vec128<T> ConcatUpperUpper(Full128<T> /* tag */, const Vec128<T> hi,
const Vec128<T> lo) {
return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
const Vec128<T, N> lo) {
const Half<decltype(d)> d2;
return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
}
// ------------------------------ ConcatLowerUpper
template <typename T>
HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
const Vec128<T> lo) {
return CombineShiftRightBytes<8>(d, hi, lo);
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
const Vec128<T, N> lo) {
const Half<decltype(d)> d2;
return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
}
// ------------------------------ ConcatUpperLower
template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
const Vec128<T, N> lo) {
return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
}
// ------------------------------ ConcatOdd
// 32-bit full
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
}
// 32-bit partial
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> /* tag */, Vec128<T, 2> hi,
Vec128<T, 2> lo) {
return InterleaveUpper(Simd<T, 2, 0>(), lo, hi);
}
// 64-bit full - no partial because we need at least two inputs to have
// even/odd.
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
return InterleaveUpper(Full128<T>(), lo, hi);
}
// ------------------------------ ConcatEven (InterleaveLower)
// 32-bit full
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
}
// 32-bit partial
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> /* tag */, Vec128<T, 2> hi,
Vec128<T, 2> lo) {
return InterleaveLower(Simd<T, 2, 0>(), lo, hi);
}
// 64-bit full - no partial because we need at least two inputs to have
// even/odd.
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
return InterleaveLower(Full128<T>(), lo, hi);
}
// ------------------------------ DupEven (InterleaveLower)
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
return InterleaveLower(DFromV<decltype(v)>(), v, v);
}
// ------------------------------ DupOdd (InterleaveUpper)
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
return InterleaveUpper(DFromV<decltype(v)>(), v, v);
}
// ------------------------------ OddEven
namespace detail {
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
const Vec128<T, N> b) {
const DFromV<decltype(a)> d;
const Repartition<uint8_t, decltype(d)> d8;
alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
const Vec128<T, N> b) {
return Vec128<T, N>{
wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
const Vec128<T, N> b) {
return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
const Vec128<T, N> b) {
return Vec128<T, N>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
}
} // namespace detail
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
}
template <size_t N>
HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
}
// ------------------------------ OddEvenBlocks
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
return even;
}
// ------------------------------ SwapAdjacentBlocks
template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
return v;
}
// ------------------------------ ReverseBlocks
// Single block: no change
template <typename T>
HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
return v;
}
// ================================================== CONVERT
// ------------------------------ Promotions (part w/ narrow lanes -> full)
// Unsigned: zero-extend.
template <size_t N>
HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<uint32_t, N>{
wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
}
template <size_t N>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<int32_t, N>{
wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
const Vec128<uint16_t, N> v) {
return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
const Vec128<uint32_t, N> v) {
return Vec128<uint64_t, N>{wasm_u64x2_extend_low_u32x4(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<uint16_t, N> v) {
return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
}
// Signed: replicate sign bit.
template <size_t N>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
const Vec128<int8_t, N> v) {
return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<int8_t, N> v) {
return Vec128<int32_t, N>{
wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<int16_t, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<int64_t, N>{wasm_i64x2_extend_low_i32x4(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<double, N>{wasm_f64x2_convert_low_i32x4(v.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
const Vec128<float16_t, N> v) {
const RebindToSigned<decltype(df32)> di32;
const RebindToUnsigned<decltype(df32)> du32;
// Expand to u32 so we can shift.
const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
const auto sign = ShiftRight<15>(bits16);
const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
const auto mantissa = bits16 & Set(du32, 0x3FF);
const auto subnormal =
BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
Set(df32, 1.0f / 16384 / 1024));
const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
return BitCast(df32, ShiftLeft<31>(sign) | bits32);
}
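// Worked bit-level example of the f16 -> f32 expansion above (explanatory
// comment): 0x3C00 (f16 1.0) has sign 0, biased_exp 15 and mantissa 0; the
// normal path rebiases the exponent (15 -> 127), widens the mantissa from 10
// to 23 bits and reassembles 0x3F800000, i.e. f32 1.0. Subnormal inputs
// (biased_exp == 0) are instead converted as mantissa * 2^-24.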
template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
const Vec128<bfloat16_t, N> v) {
const Rebind<uint16_t, decltype(df32)> du16;
const RebindToSigned<decltype(df32)> di32;
return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}
// ------------------------------ Demotions (full -> part w/ narrow lanes)
template <size_t N>
HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<uint16_t, N>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<int16_t, N>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
}
template <size_t N>
HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
return Vec128<uint8_t, N>{
wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
}
template <size_t N>
HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
const Vec128<int16_t, N> v) {
return Vec128<uint8_t, N>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
}
template <size_t N>
HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
}
template <size_t N>
HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
const Vec128<int16_t, N> v) {
return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* di */,
const Vec128<double, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
}
template <size_t N>
HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
const Vec128<float, N> v) {
const RebindToUnsigned<decltype(df16)> du16;
const Rebind<uint32_t, decltype(du16)> du;
const RebindToSigned<decltype(du)> di;
const auto bits32 = BitCast(du, v);
const auto sign = ShiftRight<31>(bits32);
const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
const auto k15 = Set(di, 15);
const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
const auto is_tiny = exp < Set(di, -24);
const auto is_subnormal = exp < Set(di, -14);
const auto biased_exp16 =
BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
(mantissa32 >> (Set(du, 13) + sub_exp));
const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
ShiftRight<13>(mantissa32)); // <1024
const auto sign16 = ShiftLeft<15>(sign);
const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
return Vec128<float16_t, N>{DemoteTo(du16, bits16).raw};
}
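// Worked bit-level example of the f32 -> f16 demotion above (explanatory
// comment): f32 1.0 = 0x3F800000 has biased_exp32 127, so exp == 0 and
// biased_exp16 == 15; the 23-bit mantissa is truncated to 10 bits, giving
// 0x3C00 (f16 1.0). Inputs with exp < -24 are flushed to zero (is_tiny),
// and exponents in [-24, -15] take the subnormal path, which re-inserts the
// implicit leading 1 into the mantissa.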
template <size_t N>
HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
const Vec128<float, N> v) {
const Rebind<int32_t, decltype(dbf16)> di32;
const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
const Rebind<uint16_t, decltype(dbf16)> du16;
const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
return BitCast(dbf16, DemoteTo(du16, bits_in_32));
}
template <size_t N>
HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
const RebindToUnsigned<decltype(dbf16)> du16;
const Repartition<uint32_t, decltype(dbf16)> du32;
const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
// For already range-limited input [0, 255].
template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
return Vec128<uint8_t, N>{
wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
}
// ------------------------------ Convert i32 <=> f32 (Round)
template <size_t N>
HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<float, N>{wasm_f32x4_convert_i32x4(v.raw)};
}
// Truncates (rounds toward zero).
template <size_t N>
HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<float, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
return ConvertTo(Simd<int32_t, N, 0>(), Round(v));
}
// ================================================== MISC
// ------------------------------ SumsOf8 (ShiftRight, Add)
template <size_t N>
HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
const DFromV<decltype(v)> du8;
const RepartitionToWide<decltype(du8)> du16;
const RepartitionToWide<decltype(du16)> du32;
const RepartitionToWide<decltype(du32)> du64;
using VU16 = VFromD<decltype(du16)>;
const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF));
const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
const VU16 szz_FE_zz_BA_zz_76_zz_32 =
BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF));
}
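// Worked example of the reduction above (explanatory comment): adjacent
// bytes are first summed into u16 lanes, then pairs of u16 and pairs of u32
// are folded together via logical right shifts, so each u64 lane ends up
// holding the sum of its 8 bytes in its low 16 bits. For bytes all equal to
// 1, the result is {8, 8}.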
// ------------------------------ LoadMaskBits (TestBit)
namespace detail {
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
const RebindToUnsigned<decltype(d)> du;
  // Easier than Set(), which would require a lane type wider than 8 bits;
  // that would not compile for T=uint8_t, N=1.
const Vec128<T, N> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
// Replicate bytes 8x such that each byte contains the bit that governs it.
alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1};
const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128};
return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
const RebindToUnsigned<decltype(d)> du;
alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
return RebindMask(
d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit)));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
const RebindToUnsigned<decltype(d)> du;
alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
return RebindMask(
d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit)));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
const RebindToUnsigned<decltype(d)> du;
alignas(16) constexpr uint64_t kBit[8] = {1, 2};
return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
}
} // namespace detail
// `bits` points to at least 8 readable bytes, not all of which need be valid.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
const uint8_t* HWY_RESTRICT bits) {
uint64_t mask_bits = 0;
CopyBytes<(N + 7) / 8>(bits, &mask_bits);
return detail::LoadMaskBits(d, mask_bits);
}
// ------------------------------ Mask
namespace detail {
// Full
template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
const Mask128<T> mask) {
alignas(16) uint64_t lanes[2];
wasm_v128_store(lanes, mask.raw);
constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
const uint64_t lo = ((lanes[0] * kMagic) >> 56);
const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
return (hi + lo);
}
// 64-bit
template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
const Mask128<T, 8> mask) {
constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
return (static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)) *
kMagic) >>
56;
}
// 32-bit or less: need masking
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
const Mask128<T, N> mask) {
uint64_t bytes = static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
// Clear potentially undefined bytes.
bytes &= (1ULL << (N * 8)) - 1;
constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
return (bytes * kMagic) >> 56;
}
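// Why the multiply works (explanatory comment): mask bytes are 0x00 or 0xFF,
// and 0xFF * kMagic == 0x0102040810204080, so a set mask byte i contributes
// exactly bit i to the top byte of the 64-bit product; the >> 56 (or >> 48
// plus 0xFF00 masking for the upper half) then extracts the 8 mask bits.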
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
const Mask128<T, N> mask) {
// Remove useless lower half of each u16 while preserving the sign bit.
const __i16x8 zero = wasm_i16x8_splat(0);
const Mask128<uint8_t, N> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
return BitsFromMask(hwy::SizeTag<1>(), mask8);
}
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
const Mask128<T, N> mask) {
const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
alignas(16) uint32_t lanes[4];
wasm_v128_store(lanes, sliced_mask);
return lanes[0] | lanes[1] | lanes[2] | lanes[3];
}
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
const Mask128<T, N> mask) {
const __i64x2 mask_i = static_cast<__i64x2>(mask.raw);
const __i64x2 slice = wasm_i64x2_make(1, 2);
const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
alignas(16) uint64_t lanes[2];
wasm_v128_store(lanes, sliced_mask);
return lanes[0] | lanes[1];
}
// Returns only the lowest N bits of the BitsFromMask result.
template <typename T, size_t N>
constexpr uint64_t OnlyActive(uint64_t bits) {
return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
}
// Returns 0xFF for bytes with index >= N, otherwise 0.
template <size_t N>
constexpr __i8x16 BytesAbove() {
return /**/
(N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
: (N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
: (N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
: (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
: (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
: (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
: (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
: (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
: (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
: (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1)
: (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1)
: (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1)
: (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
-1, -1, -1)
: (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
-1, -1, -1)
: (N == 11)
? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
: (N == 13)
? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
: wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
}
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) {
return PopCount(BitsFromMask(tag, m));
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) {
return PopCount(BitsFromMask(tag, m));
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
alignas(16) uint64_t lanes[2];
wasm_v128_store(lanes, shifted_bits);
return PopCount(lanes[0] | lanes[1]);
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
alignas(16) int64_t lanes[2];
wasm_v128_store(lanes, m.raw);
return static_cast<size_t>(-(lanes[0] + lanes[1]));
}
} // namespace detail
// `bits` points to at least 8 writable bytes.
template <typename T, size_t N>
HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask, uint8_t* bits) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
const size_t kNumBytes = (N + 7) / 8;
CopyBytes<kNumBytes>(&mask_bits, bits);
return kNumBytes;
}
template <typename T, size_t N>
HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), m);
}
// Partial vector
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API size_t CountTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
// Ensure all undefined bytes are 0.
const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
return CountTrue(d, Mask128<T>{AndNot(mask, m).raw});
}
// Full vector
template <typename T>
HWY_API bool AllFalse(const Full128<T> d, const Mask128<T> m) {
#if 0
// Casting followed by wasm_i8x16_any_true results in wasm error:
// i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(d, m));
return !wasm_i8x16_any_true(v8.raw);
#else
(void)d;
return (wasm_i64x2_extract_lane(m.raw, 0) |
wasm_i64x2_extract_lane(m.raw, 1)) == 0;
#endif
}
// Full vector
namespace detail {
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
return wasm_i8x16_all_true(m.raw);
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
return wasm_i16x8_all_true(m.raw);
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
return wasm_i32x4_all_true(m.raw);
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
return wasm_i64x2_all_true(m.raw);
}
} // namespace detail
template <typename T, size_t N>
HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
}
// Partial vectors
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API bool AllFalse(Simd<T, N, 0> /* tag */, const Mask128<T, N> m) {
// Ensure all undefined bytes are 0.
const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
return AllFalse(Full128<T>(), Mask128<T>{AndNot(mask, m).raw});
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API bool AllTrue(const Simd<T, N, 0> /* d */, const Mask128<T, N> m) {
// Ensure all undefined bytes are FF.
const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw});
}
template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
const uint64_t bits = detail::BitsFromMask(mask);
return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1;
}
// ------------------------------ Compress
namespace detail {
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 256);
const Simd<T, N, 0> d;
const Rebind<uint8_t, decltype(d)> d8;
const Simd<uint16_t, N, 0> du;
// We need byte indices for TableLookupBytes (one vector's worth for each of
// 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
// can instead store lane indices and convert to byte indices (2*lane + 0..1),
// with the doubling baked into the table. Unpacking nibbles is likely more
// costly than the higher cache footprint from storing bytes.
alignas(16) constexpr uint8_t table[256 * 8] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
return BitCast(d, pairs + Set(du, 0x0100));
}
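// Worked example of the lookup above (explanatory comment): mask_bits ==
// 0b101 selects 16-bit lanes 0 and 2, so the table row is {0, 4, 0, ...}
// (lane indices with the doubling already baked in). ZipLower duplicates
// each byte into a u16, and adding 0x0100 turns each pair {b, b} into
// {b, b + 1}, i.e. the two byte indices of the selected lane, ready for
// TableLookupBytes.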
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 16);
// There are only 4 lanes, so we can afford to load the index vector directly.
alignas(16) constexpr uint8_t packed_array[16 * 16] = {
0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, //
8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, //
12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, //
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const Simd<T, N, 0> d;
const Repartition<uint8_t, decltype(d)> d8;
return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Idx64x2FromBits(const uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 4);
// There are only 2 lanes, so we can afford to load the index vector directly.
alignas(16) constexpr uint8_t packed_array[4 * 16] = {
0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const Simd<T, N, 0> d;
const Repartition<uint8_t, decltype(d)> d8;
return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
}
// Helper functions called by both Compress and CompressStore - avoids a
// redundant BitsFromMask in the latter.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<2> /*tag*/, Vec128<T, N> v,
const uint64_t mask_bits) {
const auto idx = detail::Idx16x8FromBits<T, N>(mask_bits);
const DFromV<decltype(v)> d;
const RebindToSigned<decltype(d)> di;
return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<4> /*tag*/, Vec128<T, N> v,
const uint64_t mask_bits) {
const auto idx = detail::Idx32x4FromBits<T, N>(mask_bits);
const DFromV<decltype(v)> d;
const RebindToSigned<decltype(d)> di;
return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<8> /*tag*/, Vec128<T, N> v,
const uint64_t mask_bits) {
const auto idx = detail::Idx64x2FromBits<T, N>(mask_bits);
const DFromV<decltype(v)> d;
const RebindToSigned<decltype(d)> di;
return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}
} // namespace detail
template <typename T, size_t N>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
}
// ------------------------------ CompressBits
template <typename T, size_t N>
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
const uint8_t* HWY_RESTRICT bits) {
uint64_t mask_bits = 0;
constexpr size_t kNumBytes = (N + 7) / 8;
CopyBytes<kNumBytes>(bits, &mask_bits);
if (N < 8) {
mask_bits &= (1ull << N) - 1;
}
return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
}
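// Example (sketch): `bits` holds one mask bit per lane, LSB-first; for
// N = 4 u32 lanes, bits[0] = 5 (0b0101) selects lanes 0 and 2. The vector v
// and its values are hypothetical.
//   const uint8_t bits[1] = {0x5};
//   const auto packed = CompressBits(v, bits);  // lanes 0, 2, then unspecified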
// ------------------------------ CompressStore
template <typename T, size_t N>
HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
StoreU(c, d, unaligned);
return PopCount(mask_bits);
}
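// Note: like Compress, this writes all N lanes (lanes after the first
// PopCount(mask_bits) are unspecified), so `unaligned` must have room for a
// full vector. Example (sketch; v, m, d and out are hypothetical):
//   uint32_t out[4];
//   const size_t num = CompressStore(v, m, d, out);  // out[0..num-1] are valid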
// ------------------------------ CompressBlendedStore
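// Unlike CompressStore, lanes at or beyond the result count keep their
// previous contents in memory: the destination vector is loaded, blended with
// the compressed result under FirstN(count), and stored back, so all N lanes
// at `unaligned` must be readable as well as writable.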
template <typename T, size_t N>
HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
Simd<T, N, 0> d,
T* HWY_RESTRICT unaligned) {
const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
using TU = TFromD<decltype(du)>;
const uint64_t mask_bits = detail::BitsFromMask(m);
const size_t count = PopCount(mask_bits);
const Mask128<TU, N> store_mask = FirstN(du, count);
const Vec128<TU, N> compressed =
detail::Compress(hwy::SizeTag<sizeof(T)>(), BitCast(du, v), mask_bits);
const Vec128<TU, N> prev = BitCast(du, LoadU(d, unaligned));
StoreU(BitCast(d, IfThenElse(store_mask, compressed, prev)), d, unaligned);
return count;
}
// ------------------------------ CompressBitsStore
template <typename T, size_t N>
HWY_API size_t CompressBitsStore(Vec128<T, N> v,
const uint8_t* HWY_RESTRICT bits,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
uint64_t mask_bits = 0;
constexpr size_t kNumBytes = (N + 7) / 8;
CopyBytes<kNumBytes>(bits, &mask_bits);
if (N < 8) {
mask_bits &= (1ull << N) - 1;
}
const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
StoreU(c, d, unaligned);
return PopCount(mask_bits);
}
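// Note (sketch): CompressBitsStore is equivalent to CompressBits followed by
// StoreU plus a PopCount of the mask bits; like CompressStore, it writes all
// N lanes.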
// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
// TableLookupBytes)
// 128 bits
HWY_API void StoreInterleaved3(const Vec128<uint8_t> a, const Vec128<uint8_t> b,
const Vec128<uint8_t> c, Full128<uint8_t> d,
uint8_t* HWY_RESTRICT unaligned) {
const auto k5 = Set(d, 5);
const auto k6 = Set(d, 6);
// Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
// 0x80 so lanes to be filled from other vectors are 0 for blending.
alignas(16) static constexpr uint8_t tbl_r0[16] = {
0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
3, 0x80, 0x80, 4, 0x80, 0x80, 5};
alignas(16) static constexpr uint8_t tbl_g0[16] = {
0x80, 0, 0x80, 0x80, 1, 0x80, //
0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
const auto shuf_r0 = Load(d, tbl_r0);
const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB
const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0
const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0.
const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0..
const auto int0 = r0 | g0 | b0;
StoreU(int0, d, unaligned + 0 * 16);
// Second vector: g10,r10, bgr[9:6], b5,g5
const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
const auto r1 = TableLookupBytes(a, shuf_r1);
const auto g1 = TableLookupBytes(b, shuf_g1);
const auto b1 = TableLookupBytes(c, shuf_b1);
const auto int1 = r1 | g1 | b1;
StoreU(int1, d, unaligned + 1 * 16);
// Third vector: bgr[15:11], b10
const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
const auto r2 = TableLookupBytes(a, shuf_r2);
const auto g2 = TableLookupBytes(b, shuf_g2);
const auto b2 = TableLookupBytes(c, shuf_b2);
const auto int2 = r2 | g2 | b2;
StoreU(int2, d, unaligned + 2 * 16);
}
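// Example (sketch; r, g, b are hypothetical planar u8 vectors): writes the
// 48 bytes r0 g0 b0 r1 g1 b1 .. r15 g15 b15.
//   alignas(16) uint8_t interleaved[48];
//   StoreInterleaved3(r, g, b, Full128<uint8_t>(), interleaved);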
// 64 bits
HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> a,
const Vec128<uint8_t, 8> b,
const Vec128<uint8_t, 8> c, Full64<uint8_t> d,
uint8_t* HWY_RESTRICT unaligned) {
// Use full vectors for the shuffles and first result.
const Full128<uint8_t> d_full;
const auto k5 = Set(d_full, 5);
const auto k6 = Set(d_full, 6);
const Vec128<uint8_t> full_a{a.raw};
const Vec128<uint8_t> full_b{b.raw};
const Vec128<uint8_t> full_c{c.raw};
// Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
// 0x80 so lanes to be filled from other vectors are 0 for blending.
alignas(16) static constexpr uint8_t tbl_r0[16] = {
0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
3, 0x80, 0x80, 4, 0x80, 0x80, 5};
alignas(16) static constexpr uint8_t tbl_g0[16] = {
0x80, 0, 0x80, 0x80, 1, 0x80, //
0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
const auto shuf_r0 = Load(d_full, tbl_r0);
const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB
const auto shuf_b0 = CombineShiftRightBytes<15>(d_full, shuf_g0, shuf_g0);
const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0
const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0.
const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0..
const auto int0 = r0 | g0 | b0;
StoreU(int0, d_full, unaligned + 0 * 16);
// Second (HALF) vector: bgr[7:6], b5,g5
const auto shuf_r1 = shuf_b0 + k6; // ..7..6..
const auto shuf_g1 = shuf_r0 + k5; // .7..6..5
const auto shuf_b1 = shuf_g0 + k5; // 7..6..5.
const auto r1 = TableLookupBytes(full_a, shuf_r1);
const auto g1 = TableLookupBytes(full_b, shuf_g1);
const auto b1 = TableLookupBytes(full_c, shuf_b1);
const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
StoreU(int1, d, unaligned + 1 * 16);
}
// <= 32 bits
template <size_t N, HWY_IF_LE32(uint8_t, N)>
HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b,
const Vec128<uint8_t, N> c,
Simd<uint8_t, N, 0> /*tag*/,
uint8_t* HWY_RESTRICT unaligned) {
// Use full vectors for the shuffles and result.
const Full128<uint8_t> d_full;
const Vec128<uint8_t> full_a{a.raw};
const Vec128<uint8_t> full_b{b.raw};
const Vec128<uint8_t> full_c{c.raw};
// Shuffle (a,b,c) vector bytes to bgr[3:0].
// 0x80 so lanes to be filled from other vectors are 0 for blending.
alignas(16) static constexpr uint8_t tbl_r0[16] = {
0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, //
0x80, 0x80, 0x80, 0x80};
const auto shuf_r0 = Load(d_full, tbl_r0);
const auto shuf_g0 = CombineShiftRightBytes<15>(d_full, shuf_r0, shuf_r0);
const auto shuf_b0 = CombineShiftRightBytes<14>(d_full, shuf_r0, shuf_r0);
const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0
const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0.
const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0..
const auto int0 = r0 | g0 | b0;
alignas(16) uint8_t buf[16];
StoreU(int0, d_full, buf);
CopyBytes<N * 3>(buf, unaligned);
}
// ------------------------------ StoreInterleaved4
// 128 bits
HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
const Vec128<uint8_t> v1,
const Vec128<uint8_t> v2,
const Vec128<uint8_t> v3, Full128<uint8_t> d8,
uint8_t* HWY_RESTRICT unaligned) {
const RepartitionToWide<decltype(d8)> d16;
const RepartitionToWide<decltype(d16)> d32;
// let a,b,c,d denote v0..3.
const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
const auto ba8 = ZipUpper(d16, v0, v1);
const auto dc8 = ZipUpper(d16, v2, v3);
const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4
const auto dcba_8 = ZipLower(d32, ba8, dc8); // d..aB d..a8
const auto dcba_C = ZipUpper(d32, ba8, dc8); // d..aF d..aC
StoreU(BitCast(d8, dcba_0), d8, unaligned + 0 * 16);
StoreU(BitCast(d8, dcba_4), d8, unaligned + 1 * 16);
StoreU(BitCast(d8, dcba_8), d8, unaligned + 2 * 16);
StoreU(BitCast(d8, dcba_C), d8, unaligned + 3 * 16);
}
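// Example (sketch; r, g, b, a are hypothetical planar u8 vectors): writes the
// 64 bytes r0 g0 b0 a0 r1 g1 b1 a1 .. a15 (e.g. RGBA for 16 pixels).
//   alignas(16) uint8_t interleaved[64];
//   StoreInterleaved4(r, g, b, a, Full128<uint8_t>(), interleaved);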
// 64 bits
HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> in0,
const Vec128<uint8_t, 8> in1,
const Vec128<uint8_t, 8> in2,
const Vec128<uint8_t, 8> in3,
Full64<uint8_t> /* tag */,
uint8_t* HWY_RESTRICT unaligned) {
// Use full vectors to reduce the number of stores.
const Full128<uint8_t> d_full8;
const RepartitionToWide<decltype(d_full8)> d16;
const RepartitionToWide<decltype(d16)> d32;
const Vec128<uint8_t> v0{in0.raw};
const Vec128<uint8_t> v1{in1.raw};
const Vec128<uint8_t> v2{in2.raw};
const Vec128<uint8_t> v3{in3.raw};
// let a,b,c,d denote v0..3.
const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4
StoreU(BitCast(d_full8, dcba_0), d_full8, unaligned + 0 * 16);
StoreU(BitCast(d_full8, dcba_4), d_full8, unaligned + 1 * 16);
}
// <= 32 bits
template <size_t N, HWY_IF_LE32(uint8_t, N)>
HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
const Vec128<uint8_t, N> in1,
const Vec128<uint8_t, N> in2,
const Vec128<uint8_t, N> in3,
Simd<uint8_t, N, 0> /*tag*/,
uint8_t* HWY_RESTRICT unaligned) {
// Use full vectors to reduce the number of stores.
const Full128<uint8_t> d_full8;
const RepartitionToWide<decltype(d_full8)> d16;
const RepartitionToWide<decltype(d16)> d32;
const Vec128<uint8_t> v0{in0.raw};
const Vec128<uint8_t> v1{in1.raw};
const Vec128<uint8_t> v2{in2.raw};
const Vec128<uint8_t> v3{in3.raw};
// let a,b,c,d denote v0..3.
const auto ba0 = ZipLower(d16, v0, v1); // b3 a3 .. b0 a0
const auto dc0 = ZipLower(d16, v2, v3); // d3 c3 .. d0 c0
const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
alignas(16) uint8_t buf[16];
StoreU(BitCast(d_full8, dcba_0), d_full8, buf);
CopyBytes<4 * N>(buf, unaligned);
}
// ------------------------------ MulEven/Odd (Load)
HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
const Vec128<uint64_t> b) {
alignas(16) uint64_t mul[2];
mul[0] =
Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
return Load(Full128<uint64_t>(), mul);
}
HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
const Vec128<uint64_t> b) {
alignas(16) uint64_t mul[2];
mul[0] =
Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
return Load(Full128<uint64_t>(), mul);
}
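// Example (sketch): MulEven returns the full 128-bit product of the lower
// (lane 0) u64 of a and b, with the low half in lane 0 and the high half in
// lane 1; MulOdd does the same for lane 1 of the inputs.
//   const auto p = MulEven(a, b);  // lane0 = low64(a0*b0), lane1 = high64(a0*b0)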
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
template <size_t N>
HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
Vec128<bfloat16_t, 2 * N> a,
Vec128<bfloat16_t, 2 * N> b,
const Vec128<float, N> sum0,
Vec128<float, N>& sum1) {
const Repartition<uint16_t, decltype(df32)> du16;
const RebindToUnsigned<decltype(df32)> du32;
const Vec128<uint16_t, 2 * N> zero = Zero(du16);
const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
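// Example (sketch; a and b are hypothetical bf16 vectors of 2*N lanes):
// accumulate a dot product. The lanes are processed in a reordered fashion,
// so add the two accumulators before reducing.
//   auto sum1 = Zero(df32);
//   const auto sum0 = ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
//   const auto total = Add(sum0, sum1);  // then e.g. SumOfLanes(df32, total)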
// ------------------------------ Reductions
namespace detail {
// N=1 for any T: no-op
template <typename T>
HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
const Vec128<T, 1> v) {
return v;
}
template <typename T>
HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
const Vec128<T, 1> v) {
return v;
}
template <typename T>
HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
const Vec128<T, 1> v) {
return v;
}
// u32/i32/f32:
// N=2
template <typename T>
HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T, 2> v10) {
return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
}
template <typename T>
HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T, 2> v10) {
return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
}
template <typename T>
HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T, 2> v10) {
return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
}
// N=4 (full)
template <typename T>
HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = v3210 + v1032;
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return v20_31_20_31 + v31_20_31_20;
}
template <typename T>
HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return Min(v20_31_20_31, v31_20_31_20);
}
template <typename T>
HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return Max(v20_31_20_31, v31_20_31_20);
}
// u64/i64/f64:
// N=2 (full)
template <typename T>
HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<T> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return v10 + v01;
}
template <typename T>
HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<T> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return Min(v10, v01);
}
template <typename T>
HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<T> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return Max(v10, v01);
}
// u16/i16
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const DFromV<decltype(v)> d;
const Repartition<int32_t, decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
// Also broadcast into odd lanes.
return BitCast(d, Or(min, ShiftLeft<16>(min)));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const DFromV<decltype(v)> d;
const Repartition<int32_t, decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto max = MaxOfLanes(d32, Max(even, odd));
// Also broadcast into odd lanes.
return BitCast(d, Or(max, ShiftLeft<16>(max)));
}
} // namespace detail
// Supported for u/i/f 32/64; Min/MaxOfLanes also support u16/i16 (>= 2 lanes).
// Returns the same value in each lane.
template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
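// Example (sketch; v is a hypothetical f32 vector): the reduction is
// broadcast to every lane, so extract any one of them for a scalar result.
//   const Full128<float> d;
//   const float total = GetLane(SumOfLanes(d, v));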
// ------------------------------ Lt128
namespace detail {
template <size_t kLanes, typename T, size_t N>
Mask128<T, N> ShiftMaskLeft(Mask128<T, N> m) {
return MaskFromVec(ShiftLeftLanes<kLanes>(VecFromMask(Simd<T, N, 0>(), m)));
}
} // namespace detail
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
// Truth table of Eq and Lt for Hi and Lo u64.
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
// =H =L cH cL | out = cH | (=H & cL)
// 0 0 0 0 | 0
// 0 0 0 1 | 0
// 0 0 1 0 | 1
// 0 0 1 1 | 1
// 0 1 0 0 | 0
// 0 1 0 1 | 0
// 0 1 1 0 | 1
// 1 0 0 0 | 0
// 1 0 0 1 | 1
// 1 1 0 0 | 0
const Mask128<T, N> eqHL = Eq(a, b);
const Mask128<T, N> ltHL = Lt(a, b);
// We need to bring cL to the upper lane/bit corresponding to cH. Comparing
// the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
// comparison result leftwards requires only 4.
const Mask128<T, N> ltLx = detail::ShiftMaskLeft<1>(ltHL);
const Mask128<T, N> outHx = Or(ltHL, And(eqHL, ltLx));
const Vec128<T, N> vecHx = VecFromMask(d, outHx);
return MaskFromVec(InterleaveUpper(d, vecHx, vecHx));
}
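// Example (sketch; a, b are hypothetical u64 vectors): each pair of u64 lanes
// is treated as one 128-bit number with the upper lane as its high half. The
// resulting mask is identical in both lanes of a pair, so it can directly
// drive IfThenElse on the whole pair.
//   const Full128<uint64_t> d;
//   const auto m = Lt128(d, a, b);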
// ------------------------------ Min128, Max128 (Lt128)
// Without a native OddEven, it seems infeasible to go faster than Lt128.
template <class D>
HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
return IfThenElse(Lt128(d, a, b), a, b);
}
template <class D>
HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
return IfThenElse(Lt128(d, a, b), b, a);
}
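// Example (sketch): per 128-bit pair, select the smaller/larger operand.
//   const auto smaller = Min128(d, a, b);  // == IfThenElse(Lt128(d, a, b), a, b)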
// ================================================== Operator wrapper
template <class V>
HWY_API V Add(V a, V b) {
return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
return a - b;
}
template <class V>
HWY_API V Mul(V a, V b) {
return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
return a / b;
}
template <class V>
HWY_API V Shl(V a, V b) {
return a << b;
}
template <class V>
HWY_API V Shr(V a, V b) {
return a >> b;
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
return a <= b;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();