blob: 01af085bd4217b767b6820aacd97af15771e8ae0 [file] [log] [blame]
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// 128-bit WASM vectors and operations.
// External include guard in highway.h - see comment there.
#include <stddef.h>
#include <stdint.h>
#include <wasm_simd128.h>
#include <cmath>
#include "hwy/ops/shared-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Maps a lane type T to the raw vector type that Vec128/Mask128 store.
// All lane types use __v128_u unless a specialization below overrides it.
template <typename T>
struct Raw128 {
using type = __v128_u;
};
// float lanes are stored as __f32x4 instead of the generic __v128_u.
template <>
struct Raw128<float> {
using type = __f32x4;
};
// Tag type describing a full vector: 16 / sizeof(T) lanes of type T.
template <typename T>
using Full128 = Simd<T, 16 / sizeof(T)>;
// Vector wrapper holding N lanes of type T in a single raw vector. N defaults
// to the number of T that fit into 16 bytes.
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename Raw128<T>::type;

 public:
  // Compound assignment. Each operator forwards to the matching non-member
  // binary operator and therefore only compiles when that overload exists
  // (for example, division is only provided for f32 and f64).
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    *this = *this * other;
    return *this;
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    *this = *this / other;
    return *this;
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    *this = *this + other;
    return *this;
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    *this = *this - other;
    return *this;
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    *this = *this & other;
    return *this;
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    *this = *this | other;
    return *this;
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    *this = *this ^ other;
    return *this;
  }

  // Underlying raw vector; public so that free functions can access it.
  Raw raw;
};
// Mask type with the same raw representation as Vec128<T, N>.
// Integer: FF..FF or 0. Float: MSB, all other bits undefined - see README.
template <typename T, size_t N = 16 / sizeof(T)>
class Mask128 {
using Raw = typename Raw128<T>::type;
public:
// Raw mask bits, stored in the same vector type as the matching Vec128.
Raw raw;
};
// ------------------------------ Cast
// Converts any supported raw vector type to the generic integer vector type.
// Integer input: already in the target representation.
HWY_API __v128_u BitCastToInteger(__v128_u v) {
  return v;
}
// f32 input.
HWY_API __v128_u BitCastToInteger(__f32x4 v) {
  const __v128_u as_integer = static_cast<__v128_u>(v);
  return as_integer;
}
// f64 input.
HWY_API __v128_u BitCastToInteger(__f64x2 v) {
  const __v128_u as_integer = static_cast<__v128_u>(v);
  return as_integer;
}
// Reinterprets a vector of N lanes of T as N * sizeof(T) uint8_t lanes.
template <typename T, size_t N>
HWY_API Vec128<uint8_t, N * sizeof(T)> cast_to_u8(Vec128<T, N> v) {
  using Bytes = Vec128<uint8_t, N * sizeof(T)>;
  return Bytes{BitCastToInteger(v.raw)};
}
// Functor converting the generic integer vector to the raw type used for
// lane type T. A functor is used because overloads cannot differ only in
// their return type.
template <typename T>
struct BitCastFromInteger128 {
  // Integer lanes share the generic representation: pass through.
  HWY_INLINE __v128_u operator()(__v128_u v) {
    return v;
  }
};
template <>
struct BitCastFromInteger128<float> {
  // float lanes use __f32x4.
  HWY_INLINE __f32x4 operator()(__v128_u v) {
    const __f32x4 as_float = static_cast<__f32x4>(v);
    return as_float;
  }
};
// Reinterprets N * sizeof(T) uint8_t lanes as N lanes of T; the tag argument
// only selects the destination type.
template <typename T, size_t N>
HWY_API Vec128<T, N> cast_u8_to(Simd<T, N> /* tag */,
                                Vec128<uint8_t, N * sizeof(T)> v) {
  BitCastFromInteger128<T> from_integer;
  return Vec128<T, N>{from_integer(v.raw)};
}
// Reinterprets a vector of FromT lanes as the same number of bytes of T
// lanes, going through a uint8_t vector as the common intermediate.
template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N> d,
                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
  const auto bytes = cast_to_u8(v);
  return cast_u8_to(d, bytes);
}
// ------------------------------ Set
// Returns a vector/part whose lanes are all zero.
// Generic lane types: splat an integer zero.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Zero(Simd<T, N> /* tag */) {
  const auto zeros = wasm_i32x4_splat(0);
  return Vec128<T, N>{zeros};
}
// float lanes: splat 0.0f.
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Zero(Simd<float, N> /* tag */) {
  const auto zeros = wasm_f32x4_splat(0.0f);
  return Vec128<float, N>{zeros};
}
// Broadcast: returns a vector/part in which every lane equals "t".
// One overload per supported lane type.
template <size_t N, HWY_IF_LE128(uint8_t, N)>
HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N> /* tag */, const uint8_t t) {
  const auto splat = wasm_i8x16_splat(t);
  return Vec128<uint8_t, N>{splat};
}
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N> /* tag */, const uint16_t t) {
  const auto splat = wasm_i16x8_splat(t);
  return Vec128<uint16_t, N>{splat};
}
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N> /* tag */, const uint32_t t) {
  const auto splat = wasm_i32x4_splat(t);
  return Vec128<uint32_t, N>{splat};
}
template <size_t N, HWY_IF_LE128(int8_t, N)>
HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N> /* tag */, const int8_t t) {
  const auto splat = wasm_i8x16_splat(t);
  return Vec128<int8_t, N>{splat};
}
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N> /* tag */, const int16_t t) {
  const auto splat = wasm_i16x8_splat(t);
  return Vec128<int16_t, N>{splat};
}
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N> /* tag */, const int32_t t) {
  const auto splat = wasm_i32x4_splat(t);
  return Vec128<int32_t, N>{splat};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Set(Simd<float, N> /* tag */, const float t) {
  const auto splat = wasm_f32x4_splat(t);
  return Vec128<float, N>{splat};
}
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
// Returns a vector whose lanes are intentionally left uninitialized. The
// surrounding diagnostics push/pop disables the "uninitialized" warning that
// reading the local below would otherwise trigger.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Undefined(Simd<T, N> /* tag */) {
  __v128_u uninitialized;
  return Vec128<T, N>{uninitialized};
}
// float lanes use the float raw type.
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Undefined(Simd<float, N> /* tag */) {
  __f32x4 uninitialized;
  return Vec128<float, N>{uninitialized};
}
HWY_DIAGNOSTICS(pop)
// ================================================== ARITHMETIC
// ------------------------------ Addition
// Lane-wise a + b, one overload per supported lane type.
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
                                     const Vec128<uint8_t, N> b) {
  const auto sum = wasm_i8x16_add(a.raw, b.raw);
  return Vec128<uint8_t, N>{sum};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  const auto sum = wasm_i16x8_add(a.raw, b.raw);
  return Vec128<uint16_t, N>{sum};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  const auto sum = wasm_i32x4_add(a.raw, b.raw);
  return Vec128<uint32_t, N>{sum};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
                                    const Vec128<int8_t, N> b) {
  const auto sum = wasm_i8x16_add(a.raw, b.raw);
  return Vec128<int8_t, N>{sum};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  const auto sum = wasm_i16x8_add(a.raw, b.raw);
  return Vec128<int16_t, N>{sum};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  const auto sum = wasm_i32x4_add(a.raw, b.raw);
  return Vec128<int32_t, N>{sum};
}
// Float
template <size_t N>
HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  const auto sum = wasm_f32x4_add(a.raw, b.raw);
  return Vec128<float, N>{sum};
}
// ------------------------------ Subtraction
// Lane-wise a - b, one overload per supported lane type.
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
                                     const Vec128<uint8_t, N> b) {
  const auto difference = wasm_i8x16_sub(a.raw, b.raw);
  return Vec128<uint8_t, N>{difference};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
                                      Vec128<uint16_t, N> b) {
  const auto difference = wasm_i16x8_sub(a.raw, b.raw);
  return Vec128<uint16_t, N>{difference};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  const auto difference = wasm_i32x4_sub(a.raw, b.raw);
  return Vec128<uint32_t, N>{difference};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
                                    const Vec128<int8_t, N> b) {
  const auto difference = wasm_i8x16_sub(a.raw, b.raw);
  return Vec128<int8_t, N>{difference};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  const auto difference = wasm_i16x8_sub(a.raw, b.raw);
  return Vec128<int16_t, N>{difference};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  const auto difference = wasm_i32x4_sub(a.raw, b.raw);
  return Vec128<int32_t, N>{difference};
}
// Float
template <size_t N>
HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  const auto difference = wasm_f32x4_sub(a.raw, b.raw);
  return Vec128<float, N>{difference};
}
// ------------------------------ Saturating addition
// Saturating add: a + b, clamped to the range of the lane type.
// Only 8- and 16-bit lanes are provided.
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  const auto clamped_sum = wasm_u8x16_add_saturate(a.raw, b.raw);
  return Vec128<uint8_t, N>{clamped_sum};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  const auto clamped_sum = wasm_u16x8_add_saturate(a.raw, b.raw);
  return Vec128<uint16_t, N>{clamped_sum};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  const auto clamped_sum = wasm_i8x16_add_saturate(a.raw, b.raw);
  return Vec128<int8_t, N>{clamped_sum};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  const auto clamped_sum = wasm_i16x8_add_saturate(a.raw, b.raw);
  return Vec128<int16_t, N>{clamped_sum};
}
// ------------------------------ Saturating subtraction
// Saturating subtract: a - b, clamped to the range of the lane type.
// Only 8- and 16-bit lanes are provided.
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  const auto clamped_diff = wasm_u8x16_sub_saturate(a.raw, b.raw);
  return Vec128<uint8_t, N>{clamped_diff};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  const auto clamped_diff = wasm_u16x8_sub_saturate(a.raw, b.raw);
  return Vec128<uint16_t, N>{clamped_diff};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
                                       const Vec128<int8_t, N> b) {
  const auto clamped_diff = wasm_i8x16_sub_saturate(a.raw, b.raw);
  return Vec128<int8_t, N>{clamped_diff};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  const auto clamped_diff = wasm_i16x8_sub_saturate(a.raw, b.raw);
  return Vec128<int16_t, N>{clamped_diff};
}
// ------------------------------ Average
// Rounding average of unsigned lanes: (a + b + 1) / 2.
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
                                        const Vec128<uint8_t, N> b) {
  const auto average = wasm_u8x16_avgr(a.raw, b.raw);
  return Vec128<uint8_t, N>{average};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  const auto average = wasm_u16x8_avgr(a.raw, b.raw);
  return Vec128<uint16_t, N>{average};
}
// ------------------------------ Absolute value
// Lane-wise absolute value. For integers, LimitsMin() maps to
// LimitsMax() + 1.
template <size_t N>
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
  const auto magnitude = wasm_i8x16_abs(v.raw);
  return Vec128<int8_t, N>{magnitude};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
  const auto magnitude = wasm_i16x8_abs(v.raw);
  return Vec128<int16_t, N>{magnitude};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
  const auto magnitude = wasm_i32x4_abs(v.raw);
  return Vec128<int32_t, N>{magnitude};
}
template <size_t N>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
  const auto magnitude = wasm_f32x4_abs(v.raw);
  return Vec128<float, N>{magnitude};
}
// ------------------------------ Shift lanes by constant #bits
// Shifts every lane by the compile-time constant kBits. Provided for 16- and
// 32-bit lanes; right shifts use the unsigned or signed shift intrinsic
// according to the lane type.
// Unsigned
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
  const auto shifted = wasm_i16x8_shl(v.raw, kBits);
  return Vec128<uint16_t, N>{shifted};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
  const auto shifted = wasm_u16x8_shr(v.raw, kBits);
  return Vec128<uint16_t, N>{shifted};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
  const auto shifted = wasm_i32x4_shl(v.raw, kBits);
  return Vec128<uint32_t, N>{shifted};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
  const auto shifted = wasm_u32x4_shr(v.raw, kBits);
  return Vec128<uint32_t, N>{shifted};
}
// Signed
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
  const auto shifted = wasm_i16x8_shl(v.raw, kBits);
  return Vec128<int16_t, N>{shifted};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
  const auto shifted = wasm_i16x8_shr(v.raw, kBits);
  return Vec128<int16_t, N>{shifted};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
  const auto shifted = wasm_i32x4_shl(v.raw, kBits);
  return Vec128<int32_t, N>{shifted};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
  const auto shifted = wasm_i32x4_shr(v.raw, kBits);
  return Vec128<int32_t, N>{shifted};
}
// ------------------------------ Shift lanes by same variable #bits
// Shifts every lane by the same run-time count "bits". Provided for 16- and
// 32-bit lanes only.
// Unsigned (no u8)
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
                                          const int bits) {
  const auto shifted = wasm_i16x8_shl(v.raw, bits);
  return Vec128<uint16_t, N>{shifted};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
                                           const int bits) {
  const auto shifted = wasm_u16x8_shr(v.raw, bits);
  return Vec128<uint16_t, N>{shifted};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
                                          const int bits) {
  const auto shifted = wasm_i32x4_shl(v.raw, bits);
  return Vec128<uint32_t, N>{shifted};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
                                           const int bits) {
  const auto shifted = wasm_u32x4_shr(v.raw, bits);
  return Vec128<uint32_t, N>{shifted};
}
// Signed (no i8)
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
                                         const int bits) {
  const auto shifted = wasm_i16x8_shl(v.raw, bits);
  return Vec128<int16_t, N>{shifted};
}
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
                                          const int bits) {
  const auto shifted = wasm_i16x8_shr(v.raw, bits);
  return Vec128<int16_t, N>{shifted};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
                                         const int bits) {
  const auto shifted = wasm_i32x4_shl(v.raw, bits);
  return Vec128<int32_t, N>{shifted};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
                                          const int bits) {
  const auto shifted = wasm_i32x4_shr(v.raw, bits);
  return Vec128<int32_t, N>{shifted};
}
// ------------------------------ Shift lanes by independent variable #bits
// Per-lane variable right shift. Only single-lane vectors are accepted, so
// the shift count is simply taken from the one lane of "bits".
template <typename T, size_t N>
HWY_API Vec128<T, N> operator>>(const Vec128<T, N> v, const Vec128<T, N> bits) {
  static_assert(N == 1, "Wasm does not support full variable shift");
  const auto count = GetLane(bits);
  return ShiftRightSame(v, count);
}
// Per-lane variable left shift. Only single-lane vectors are accepted, so
// the shift count is simply taken from the one lane of "bits".
template <typename T, size_t N>
HWY_API Vec128<T, N> operator<<(const Vec128<T, N> v, const Vec128<T, N> bits) {
  static_assert(N == 1, "Wasm does not support full variable shift");
  const auto count = GetLane(bits);
  return ShiftLeftSame(v, count);
}
// ------------------------------ Minimum
// Lane-wise minimum of a and b, one overload per supported lane type.
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Min(const Vec128<uint8_t, N> a,
                               const Vec128<uint8_t, N> b) {
  const auto smaller = wasm_u8x16_min(a.raw, b.raw);
  return Vec128<uint8_t, N>{smaller};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Min(const Vec128<uint16_t, N> a,
                                const Vec128<uint16_t, N> b) {
  const auto smaller = wasm_u16x8_min(a.raw, b.raw);
  return Vec128<uint16_t, N>{smaller};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Min(const Vec128<uint32_t, N> a,
                                const Vec128<uint32_t, N> b) {
  const auto smaller = wasm_u32x4_min(a.raw, b.raw);
  return Vec128<uint32_t, N>{smaller};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Min(const Vec128<int8_t, N> a,
                              const Vec128<int8_t, N> b) {
  const auto smaller = wasm_i8x16_min(a.raw, b.raw);
  return Vec128<int8_t, N>{smaller};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Min(const Vec128<int16_t, N> a,
                               const Vec128<int16_t, N> b) {
  const auto smaller = wasm_i16x8_min(a.raw, b.raw);
  return Vec128<int16_t, N>{smaller};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Min(const Vec128<int32_t, N> a,
                               const Vec128<int32_t, N> b) {
  const auto smaller = wasm_i32x4_min(a.raw, b.raw);
  return Vec128<int32_t, N>{smaller};
}
// Float
template <size_t N>
HWY_API Vec128<float, N> Min(const Vec128<float, N> a,
                             const Vec128<float, N> b) {
  const auto smaller = wasm_f32x4_min(a.raw, b.raw);
  return Vec128<float, N>{smaller};
}
// ------------------------------ Maximum
// Lane-wise maximum of a and b, one overload per supported lane type.
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Max(const Vec128<uint8_t, N> a,
                               const Vec128<uint8_t, N> b) {
  const auto larger = wasm_u8x16_max(a.raw, b.raw);
  return Vec128<uint8_t, N>{larger};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Max(const Vec128<uint16_t, N> a,
                                const Vec128<uint16_t, N> b) {
  const auto larger = wasm_u16x8_max(a.raw, b.raw);
  return Vec128<uint16_t, N>{larger};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Max(const Vec128<uint32_t, N> a,
                                const Vec128<uint32_t, N> b) {
  const auto larger = wasm_u32x4_max(a.raw, b.raw);
  return Vec128<uint32_t, N>{larger};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Max(const Vec128<int8_t, N> a,
                              const Vec128<int8_t, N> b) {
  const auto larger = wasm_i8x16_max(a.raw, b.raw);
  return Vec128<int8_t, N>{larger};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Max(const Vec128<int16_t, N> a,
                               const Vec128<int16_t, N> b) {
  const auto larger = wasm_i16x8_max(a.raw, b.raw);
  return Vec128<int16_t, N>{larger};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Max(const Vec128<int32_t, N> a,
                               const Vec128<int32_t, N> b) {
  const auto larger = wasm_i32x4_max(a.raw, b.raw);
  return Vec128<int32_t, N>{larger};
}
// Float
template <size_t N>
HWY_API Vec128<float, N> Max(const Vec128<float, N> a,
                             const Vec128<float, N> b) {
  const auto larger = wasm_f32x4_max(a.raw, b.raw);
  return Vec128<float, N>{larger};
}
// ------------------------------ Integer multiplication
// Lane-wise integer product for 16- and 32-bit lanes; the result has the
// same lane width as the inputs.
// Unsigned
template <size_t N>
HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
                                      const Vec128<uint16_t, N> b) {
  const auto product = wasm_i16x8_mul(a.raw, b.raw);
  return Vec128<uint16_t, N>{product};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
  const auto product = wasm_i32x4_mul(a.raw, b.raw);
  return Vec128<uint32_t, N>{product};
}
// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
                                     const Vec128<int16_t, N> b) {
  const auto product = wasm_i16x8_mul(a.raw, b.raw);
  return Vec128<int16_t, N>{product};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
                                     const Vec128<int32_t, N> b) {
  const auto product = wasm_i32x4_mul(a.raw, b.raw);
  return Vec128<int32_t, N>{product};
}
// Returns the upper 16 bits of a * b in each lane.
// Both halves of the inputs are widened to 32-bit lanes and multiplied; the
// shuffle then picks the odd 16-bit lanes of the two product vectors.
template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
                                    const Vec128<uint16_t, N> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto a_lo = wasm_i32x4_widen_low_u16x8(a.raw);
  const auto b_lo = wasm_i32x4_widen_low_u16x8(b.raw);
  const auto a_hi = wasm_i32x4_widen_high_u16x8(a.raw);
  const auto b_hi = wasm_i32x4_widen_high_u16x8(b.raw);
  const auto prod_lo = wasm_i32x4_mul(a_lo, b_lo);
  const auto prod_hi = wasm_i32x4_mul(a_hi, b_hi);
  // TODO(eustas): shift-right + narrow?
  const auto upper_halves =
      wasm_v16x8_shuffle(prod_lo, prod_hi, 1, 3, 5, 7, 9, 11, 13, 15);
  return Vec128<uint16_t, N>{upper_halves};
}
// Signed variant: identical except that the widening is signed.
template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
                                   const Vec128<int16_t, N> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto a_lo = wasm_i32x4_widen_low_i16x8(a.raw);
  const auto b_lo = wasm_i32x4_widen_low_i16x8(b.raw);
  const auto a_hi = wasm_i32x4_widen_high_i16x8(a.raw);
  const auto b_hi = wasm_i32x4_widen_high_i16x8(b.raw);
  const auto prod_lo = wasm_i32x4_mul(a_lo, b_lo);
  const auto prod_hi = wasm_i32x4_mul(a_hi, b_hi);
  // TODO(eustas): shift-right + narrow?
  const auto upper_halves =
      wasm_v16x8_shuffle(prod_lo, prod_hi, 1, 3, 5, 7, 9, 11, 13, 15);
  return Vec128<int16_t, N>{upper_halves};
}
// Multiplies even lanes (0, 2 ..) and returns the double-width result.
// Signed variant: each even 32-bit lane must be SIGN-extended to 64 bits
// before the 64-bit multiply. Merely masking off the odd lanes zero-extends,
// which yields a wrong product whenever an input lane is negative
// (e.g. -1 * -1 would not give 1).
template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  // TODO(eustas): replace, when implemented in WASM.
  // Move each even lane into the upper half of its 64-bit lane, then shift
  // back with the signed (arithmetic) right shift to replicate the sign bit.
  const auto ae = wasm_i64x2_shr(wasm_i64x2_shl(a.raw, 32), 32);
  const auto be = wasm_i64x2_shr(wasm_i64x2_shl(b.raw, 32), 32);
  return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}
// Unsigned even-lane multiply with double-width result: the odd 32-bit lanes
// are cleared so each 64-bit lane holds a zero-extended even lane, then the
// 64-bit lanes are multiplied.
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
                                              const Vec128<uint32_t, N> b) {
  // TODO(eustas): replace, when implemented in WASM.
  const auto keep_even = wasm_i32x4_make(0xFFFFFFFF, 0, 0xFFFFFFFF, 0);
  const auto a_even = wasm_v128_and(a.raw, keep_even);
  const auto b_even = wasm_v128_and(b.raw, keep_even);
  const auto product = wasm_i64x2_mul(a_even, b_even);
  return Vec128<uint64_t, (N + 1) / 2>{product};
}
// ------------------------------ Negate
// Lane-wise negation.
// Floating-point: flip the sign bit via XOR.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
  const auto sign = SignBit(Simd<T, N>());
  return Xor(v, sign);
}
// Signed integers: use the dedicated negate intrinsic for each lane width.
template <size_t N>
HWY_API Vec128<int8_t, N> Neg(const Vec128<int8_t, N> v) {
  const auto negated = wasm_i8x16_neg(v.raw);
  return Vec128<int8_t, N>{negated};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Neg(const Vec128<int16_t, N> v) {
  const auto negated = wasm_i16x8_neg(v.raw);
  return Vec128<int16_t, N>{negated};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Neg(const Vec128<int32_t, N> v) {
  const auto negated = wasm_i32x4_neg(v.raw);
  return Vec128<int32_t, N>{negated};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Neg(const Vec128<int64_t, N> v) {
  const auto negated = wasm_i64x2_neg(v.raw);
  return Vec128<int64_t, N>{negated};
}
// ------------------------------ Floating-point mul / div
// Lane-wise float product a * b.
template <size_t N>
HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
  const auto product = wasm_f32x4_mul(a.raw, b.raw);
  return Vec128<float, N>{product};
}
// Lane-wise float quotient a / b.
template <size_t N>
HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
                                   const Vec128<float, N> b) {
  const auto quotient = wasm_f32x4_div(a.raw, b.raw);
  return Vec128<float, N>{quotient};
}
// Reciprocal 1 / v. Currently computed with a real division rather than an
// approximation.
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
  // TODO(eustas): replace, when implemented in WASM.
  const Vec128<float, N> ones{wasm_f32x4_splat(1.0f)};
  return ones / v;
}
// Returns |a - b| per lane.
template <size_t N>
HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
                                 const Vec128<float, N> b) {
  const Vec128<float, N> difference = a - b;
  return Abs(difference);
}
// ------------------------------ Floating-point multiply-add variants
// Returns mul * x + add, computed as a separate multiply followed by an add.
template <size_t N>
HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> add) {
  // TODO(eustas): replace, when implemented in WASM.
  // TODO(eustas): is it wasm_f32x4_qfma?
  const Vec128<float, N> product = mul * x;
  return product + add;
}
// Returns add - mul * x, computed as a separate multiply followed by a
// subtract.
template <size_t N>
HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> add) {
  // TODO(eustas): replace, when implemented in WASM.
  const Vec128<float, N> product = mul * x;
  return add - product;
}
// Returns mul * x - sub, computed as a separate multiply followed by a
// subtract.
template <size_t N>
HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
                                const Vec128<float, N> x,
                                const Vec128<float, N> sub) {
  // TODO(eustas): replace, when implemented in WASM.
  // TODO(eustas): is it wasm_f32x4_qfms?
  const Vec128<float, N> product = mul * x;
  return product - sub;
}
// Returns -mul * x - sub: negate mul, multiply by x, then subtract sub.
template <size_t N>
HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
                                   const Vec128<float, N> x,
                                   const Vec128<float, N> sub) {
  // TODO(eustas): replace, when implemented in WASM.
  const Vec128<float, N> negated_mul = Neg(mul);
  const Vec128<float, N> product = negated_mul * x;
  return product - sub;
}
// ------------------------------ Floating-point square root
// Lane-wise square root (full precision).
template <size_t N>
HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
  const auto root = wasm_f32x4_sqrt(v.raw);
  return Vec128<float, N>{root};
}
// Reciprocal square root 1 / sqrt(v). Currently computed with a real square
// root and division rather than an approximation.
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
  // TODO(eustas): find a cheaper way to calculate this.
  const Vec128<float, N> ones{wasm_f32x4_splat(1.0f)};
  const Vec128<float, N> root = Sqrt(v);
  return ones / root;
}
// ------------------------------ Floating-point rounding
// Toward nearest integer, ties to even.
// Scalar fallback: the lanes are spilled to memory, rounded one by one and
// reloaded. std::nearbyint is used rather than std::round because std::round
// rounds halfway cases away from zero (2.5 -> 3), whereas std::nearbyint
// follows the current rounding mode, which is round-to-nearest-even by
// default (2.5 -> 2), matching the documented contract.
template <size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
  // TODO(eustas): is it f32x4.nearest? (not implemented yet)
  alignas(16) float input[4];
  alignas(16) float output[4];
  wasm_v128_store(input, v.raw);
  for (size_t i = 0; i < 4; ++i) {
    output[i] = std::nearbyint(input[i]);
  }
  return Vec128<float, N>{wasm_v128_load(output)};
}
// Rounds each lane toward zero (truncation).
// Scalar fallback: spill all four lanes, apply std::trunc, reload.
template <size_t N>
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
  // TODO(eustas): is it f32x4.trunc? (not implemented yet)
  alignas(16) float lanes_in[4];
  alignas(16) float lanes_out[4];
  wasm_v128_store(lanes_in, v.raw);
  for (size_t lane = 0; lane != 4; ++lane) {
    const float truncated = std::trunc(lanes_in[lane]);
    lanes_out[lane] = truncated;
  }
  const auto reloaded = wasm_v128_load(lanes_out);
  return Vec128<float, N>{reloaded};
}
// Rounds each lane toward +infinity (ceiling).
// Scalar fallback: spill all four lanes, apply std::ceil, reload.
template <size_t N>
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
  // TODO(eustas): is it f32x4.ceil? (not implemented yet)
  alignas(16) float lanes_in[4];
  alignas(16) float lanes_out[4];
  wasm_v128_store(lanes_in, v.raw);
  for (size_t lane = 0; lane != 4; ++lane) {
    const float rounded_up = std::ceil(lanes_in[lane]);
    lanes_out[lane] = rounded_up;
  }
  const auto reloaded = wasm_v128_load(lanes_out);
  return Vec128<float, N>{reloaded};
}
// Rounds each lane toward -infinity (floor).
// Scalar fallback: spill all four lanes, apply std::floor, reload.
template <size_t N>
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
  // TODO(eustas): is it f32x4.floor? (not implemented yet)
  alignas(16) float lanes_in[4];
  alignas(16) float lanes_out[4];
  wasm_v128_store(lanes_in, v.raw);
  for (size_t lane = 0; lane != 4; ++lane) {
    const float rounded_down = std::floor(lanes_in[lane]);
    lanes_out[lane] = rounded_down;
  }
  const auto reloaded = wasm_v128_load(lanes_out);
  return Vec128<float, N>{reloaded};
}
// ================================================== COMPARE
// Comparisons fill a lane with 1-bits if the condition is true, else 0.
// ------------------------------ Equality
// Lane-wise equality; returns a mask, one overload per supported lane type.
// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
                                       const Vec128<uint8_t, N> b) {
  const auto equal = wasm_i8x16_eq(a.raw, b.raw);
  return Mask128<uint8_t, N>{equal};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
                                        const Vec128<uint16_t, N> b) {
  const auto equal = wasm_i16x8_eq(a.raw, b.raw);
  return Mask128<uint16_t, N>{equal};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
                                        const Vec128<uint32_t, N> b) {
  const auto equal = wasm_i32x4_eq(a.raw, b.raw);
  return Mask128<uint32_t, N>{equal};
}
// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
                                      const Vec128<int8_t, N> b) {
  const auto equal = wasm_i8x16_eq(a.raw, b.raw);
  return Mask128<int8_t, N>{equal};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
                                       Vec128<int16_t, N> b) {
  const auto equal = wasm_i16x8_eq(a.raw, b.raw);
  return Mask128<int16_t, N>{equal};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
                                       const Vec128<int32_t, N> b) {
  const auto equal = wasm_i32x4_eq(a.raw, b.raw);
  return Mask128<int32_t, N>{equal};
}
// Float
template <size_t N>
HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  const auto equal = wasm_f32x4_eq(a.raw, b.raw);
  return Mask128<float, N>{equal};
}
// Returns a mask that is true in lanes where (v & bit) == bit, i.e. where
// every bit set in "bit" is also set in "v". Integer lanes only.
template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  const Vec128<T, N> selected = v & bit;
  return selected == bit;
}
// ------------------------------ Strict inequality
// Lane-wise a < b for signed integer and float lanes; returns a mask.
template <size_t N>
HWY_API Mask128<int8_t, N> operator<(const Vec128<int8_t, N> a,
                                     const Vec128<int8_t, N> b) {
  const auto less = wasm_i8x16_lt(a.raw, b.raw);
  return Mask128<int8_t, N>{less};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator<(const Vec128<int16_t, N> a,
                                      const Vec128<int16_t, N> b) {
  const auto less = wasm_i16x8_lt(a.raw, b.raw);
  return Mask128<int16_t, N>{less};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator<(const Vec128<int32_t, N> a,
                                      const Vec128<int32_t, N> b) {
  const auto less = wasm_i32x4_lt(a.raw, b.raw);
  return Mask128<int32_t, N>{less};
}
template <size_t N>
HWY_API Mask128<float, N> operator<(const Vec128<float, N> a,
                                    const Vec128<float, N> b) {
  const auto less = wasm_f32x4_lt(a.raw, b.raw);
  return Mask128<float, N>{less};
}
// Lane-wise a > b for signed integer and float lanes; returns a mask.
template <size_t N>
HWY_API Mask128<int8_t, N> operator>(const Vec128<int8_t, N> a,
                                     const Vec128<int8_t, N> b) {
  const auto greater = wasm_i8x16_gt(a.raw, b.raw);
  return Mask128<int8_t, N>{greater};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(const Vec128<int16_t, N> a,
                                      const Vec128<int16_t, N> b) {
  const auto greater = wasm_i16x8_gt(a.raw, b.raw);
  return Mask128<int16_t, N>{greater};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(const Vec128<int32_t, N> a,
                                      const Vec128<int32_t, N> b) {
  const auto greater = wasm_i32x4_gt(a.raw, b.raw);
  return Mask128<int32_t, N>{greater};
}
template <size_t N>
HWY_API Mask128<float, N> operator>(const Vec128<float, N> a,
                                    const Vec128<float, N> b) {
  const auto greater = wasm_f32x4_gt(a.raw, b.raw);
  return Mask128<float, N>{greater};
}
// ------------------------------ Weak inequality
// Float <= >=
// Lane-wise a <= b; returns a mask.
template <size_t N>
HWY_API Mask128<float, N> operator<=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  const auto less_or_equal = wasm_f32x4_le(a.raw, b.raw);
  return Mask128<float, N>{less_or_equal};
}
// Lane-wise float a >= b; returns a mask.
template <size_t N>
HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
                                     const Vec128<float, N> b) {
  const auto greater_or_equal = wasm_f32x4_ge(a.raw, b.raw);
  return Mask128<float, N>{greater_or_equal};
}
// ================================================== LOGICAL
// ------------------------------ Bitwise AND
// Returns a & b over all 128 bits, independent of lane type.
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  const auto bits = wasm_v128_and(a.raw, b.raw);
  return Vec128<T, N>{bits};
}
// ------------------------------ Bitwise AND-NOT
// Returns ~not_mask & mask.
// Note the swapped argument order in the intrinsic call: "mask" is passed
// first and "not_mask" second.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  const auto bits = wasm_v128_andnot(mask.raw, not_mask.raw);
  return Vec128<T, N>{bits};
}
// ------------------------------ Bitwise OR
// Returns a | b over all 128 bits, independent of lane type.
template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  const auto bits = wasm_v128_or(a.raw, b.raw);
  return Vec128<T, N>{bits};
}
// ------------------------------ Bitwise XOR
// Returns a ^ b over all 128 bits, independent of lane type.
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  const auto bits = wasm_v128_xor(a.raw, b.raw);
  return Vec128<T, N>{bits};
}
// ------------------------------ Operator overloads (internal-only if float)
// Operator form of And().
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  const Vec128<T, N> result = And(a, b);
  return result;
}
// Operator form of Or().
template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  const Vec128<T, N> result = Or(a, b);
  return result;
}
// Operator form of Xor().
template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  const Vec128<T, N> result = Xor(a, b);
  return result;
}
// ------------------------------ CopySign
// Returns a value with the magnitude bits of "magn" and the sign bit of
// "sign". Floating-point lanes only.
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(Simd<T, N>());
  const auto magnitude_bits = AndNot(msb, magn);  // everything but the sign
  const auto sign_bit = And(msb, sign);           // only the sign
  return Or(magnitude_bits, sign_bit);
}
// ORs the sign bit of "sign" into "abs". Unlike CopySign, the sign bit of
// "abs" is not cleared first. Floating-point lanes only.
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto sign_bit = And(SignBit(Simd<T, N>()), sign);
  return Or(abs, sign_bit);
}
// ------------------------------ Mask
// Mask and Vec are the same (true = FF..FF).
// Wraps the raw bits of a vector in a mask without modifying them.
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  const auto bits = v.raw;
  return Mask128<T, N>{bits};
}
// Wraps the raw bits of a mask in a vector without modifying them.
template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  const auto bits = v.raw;
  return Vec128<T, N>{bits};
}
// mask ? yes : no
// Implemented as a bitwise select driven by the mask bits.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  const auto selected = wasm_v128_bitselect(yes.raw, no.raw, mask.raw);
  return Vec128<T, N>{selected};
}
// mask ? yes : 0
// ANDs "yes" with the mask bits.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  const Vec128<T, N> mask_bits = VecFromMask(mask);
  return yes & mask_bits;
}
// mask ? 0 : no
// ANDs "no" with the complement of the mask bits.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  const Vec128<T, N> mask_bits = VecFromMask(mask);
  return AndNot(mask_bits, no);
}
// Keeps lanes for which v > 0 and replaces all other lanes with zero.
// Floating-point lanes only.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
  const auto zero = Zero(Simd<T, N>());
  const Mask128<T, N> is_positive{(v > zero).raw};
  return IfThenElse(is_positive, v, zero);
}
// ================================================== MEMORY
// ------------------------------ Load
// Loads a full vector from "aligned".
template <typename T>
HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
  const auto loaded = wasm_v128_load(aligned);
  return Vec128<T>{loaded};
}
// Partial load: copies only sizeof(T) * N bytes from "p"; the rest of the
// returned vector is not written.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> Load(Simd<T, N> /* tag */, const T* HWY_RESTRICT p) {
  constexpr size_t kBytes = sizeof(T) * N;
  Vec128<T, N> part;
  CopyBytes<kBytes>(p, &part);
  return part;
}
// LoadU == Load: simply forwards to Load.
template <typename T, size_t N>
HWY_API Vec128<T, N> LoadU(Simd<T, N> d, const T* HWY_RESTRICT p) {
  const Vec128<T, N> loaded = Load(d, p);
  return loaded;
}
// Vectors are at most 128 bits here, so there is no 128-bit block to
// duplicate: this is just a Load.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> LoadDup128(Simd<T, N> d, const T* HWY_RESTRICT p) {
  const Vec128<T, N> loaded = Load(d, p);
  return loaded;
}
// ------------------------------ Store
// Stores a full vector to "aligned".
template <typename T>
HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
  const auto raw = v.raw;
  wasm_v128_store(aligned, raw);
}
// Partial store: writes only the first sizeof(T) * N bytes of the vector.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API void Store(Vec128<T, N> v, Simd<T, N> /* tag */, T* HWY_RESTRICT p) {
  constexpr size_t kBytes = sizeof(T) * N;
  CopyBytes<kBytes>(&v, p);
}
// Single float: extract lane 0 and write it directly.
HWY_API void Store(const Vec128<float, 1> v, Simd<float, 1> /* tag */,
                   float* HWY_RESTRICT p) {
  const float lane0 = wasm_f32x4_extract_lane(v.raw, 0);
  *p = lane0;
}
// StoreU == Store: simply forwards to the matching Store overload.
template <typename T, size_t N>
HWY_API void StoreU(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT p) {
  return Store(v, d, p);
}
// ------------------------------ Non-temporal stores
// Same as aligned stores on non-x86.
// Forwards to Store so that partial vectors (fewer than 16 bytes) only write
// their own N lanes; an unconditional 16-byte store would write past the end
// of the destination for such vectors.
template <typename T, size_t N>
HWY_API void Stream(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT aligned) {
  Store(v, d, aligned);
}
// ------------------------------ Gather
// Single-lane gather: reads one T located "offset" bytes after base and
// broadcasts it via Set. Only N == 1 is supported on this target.
template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
                                  const T* HWY_RESTRICT base,
                                  const Vec128<Offset, N> offset) {
  static_assert(N == 1, "Wasm does not support full gather");
  static_assert(sizeof(T) == sizeof(Offset), "T must match Offset");
  const uintptr_t base_bits = reinterpret_cast<uintptr_t>(base);
  const uintptr_t address = base_bits + GetLane(offset);
  // Copy byte-wise instead of dereferencing the computed pointer directly.
  T lane;
  CopyBytes<sizeof(T)>(reinterpret_cast<const T*>(address), &lane);
  return Set(d, lane);
}
// Single-lane gather: reads base[index] and broadcasts it via Set.
// Only N == 1 is supported on this target.
template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
                                 const Vec128<Index, N> index) {
  static_assert(N == 1, "Wasm does not support full gather");
  static_assert(sizeof(T) == sizeof(Index), "T must match Index");
  const T lane = base[GetLane(index)];
  return Set(d, lane);
}
// ================================================== SWIZZLE
// ------------------------------ Extract lane
// Gets the single value stored in a vector/part.
// Returns lane 0 of a u8 vector.
template <size_t N>
HWY_API uint8_t GetLane(const Vec128<uint8_t, N> v) {
  return static_cast<uint8_t>(wasm_i8x16_extract_lane(v.raw, 0));
}
// Returns lane 0 of an i8 vector.
template <size_t N>
HWY_API int8_t GetLane(const Vec128<int8_t, N> v) {
  return static_cast<int8_t>(wasm_i8x16_extract_lane(v.raw, 0));
}
// Returns lane 0 of a u16 vector.
template <size_t N>
HWY_API uint16_t GetLane(const Vec128<uint16_t, N> v) {
  return static_cast<uint16_t>(wasm_i16x8_extract_lane(v.raw, 0));
}
// Returns lane 0 of an i16 vector.
template <size_t N>
HWY_API int16_t GetLane(const Vec128<int16_t, N> v) {
  return static_cast<int16_t>(wasm_i16x8_extract_lane(v.raw, 0));
}
// Returns lane 0 of a u32 vector.
template <size_t N>
HWY_API uint32_t GetLane(const Vec128<uint32_t, N> v) {
  return static_cast<uint32_t>(wasm_i32x4_extract_lane(v.raw, 0));
}
// Returns lane 0 of an i32 vector.
template <size_t N>
HWY_API int32_t GetLane(const Vec128<int32_t, N> v) {
  return static_cast<int32_t>(wasm_i32x4_extract_lane(v.raw, 0));
}
// Returns lane 0 of an f32 vector.
template <size_t N>
HWY_API float GetLane(const Vec128<float, N> v) {
  const float lane0 = wasm_f32x4_extract_lane(v.raw, 0);
  return lane0;
}
// ------------------------------ Extract half
// Returns upper/lower half of a vector.
// Reinterprets the same register as a vector with half as many lanes; the
// lower lanes are already in place, so no data movement is required.
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  using Half = Vec128<T, N / 2>;
  return Half{v.raw};
}
// These copy hi into lo: 32-bit lanes 2,3 of the input become lanes 0,1 of
// the result.
template <typename T>
HWY_API Vec128<T, 8 / sizeof(T)> UpperHalf(Vec128<T> v) {
  using Half = Vec128<T, 8 / sizeof(T)>;
  // TODO(eustas): use swizzle?
  const auto hi_in_lo = wasm_v32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3);
  return Half{hi_in_lo};
}
// Float specialization: same lane movement as the generic version (32-bit
// lanes 2,3 move into lanes 0,1).
template <>
HWY_INLINE Vec128<float, 2> UpperHalf(Vec128<float> v) {
  // TODO(eustas): use swizzle?
  const auto hi_in_lo = wasm_v32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3);
  return Vec128<float, 2>{hi_in_lo};
}
// ------------------------------ Shift vector by constant #bytes
// 0x01..0F, kBytes = 1 => 0x02..0F00
// Shifts the whole 128-bit vector towards higher byte positions by the
// compile-time constant kBytes (0..16), filling the vacated low bytes with
// zero. Output byte i is input byte (i - kBytes) for i >= kBytes.
// The shuffle needs literal indices, hence one case per shift amount:
// indices 0..15 select bytes of v, index 16 selects byte 0 of `zero`.
template <int kBytes, typename T>
HWY_API Vec128<T> ShiftLeftBytes(const Vec128<T> v) {
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
const __i8x16 zero = wasm_i8x16_splat(0);
switch (kBytes) {
case 0:
// No shift: return the input unchanged.
return v;
case 1:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14)};
case 2:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13)};
case 3:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
4, 5, 6, 7, 8, 9, 10, 11, 12)};
case 4:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11)};
case 5:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10)};
case 6:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
case 7:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
case 8:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
case 9:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 16, 0, 1, 2, 3, 4, 5, 6)};
case 10:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 0, 1, 2, 3, 4, 5)};
case 11:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 0, 1, 2, 3, 4)};
case 12:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 0, 1, 2, 3)};
case 13:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 0, 1, 2)};
case 14:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 0,
1)};
case 15:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
0)};
}
// kBytes == 16: every byte has been shifted out.
return Vec128<T>{zero};
}
// Shifts by whole lanes: reinterprets v as bytes, shifts left by
// kLanes * sizeof(T) bytes, then reinterprets back as lanes of T.
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
  const Simd<T, N> d;
  const Simd<uint8_t, N * sizeof(T)> d8;
  const auto as_bytes = BitCast(d8, v);
  const auto shifted = ShiftLeftBytes<kLanes * sizeof(T)>(as_bytes);
  return BitCast(d, shifted);
}
// 0x01..0F, kBytes = 1 => 0x0001..0E
// Shifts the whole 128-bit vector towards lower byte positions by the
// compile-time constant kBytes (0..16), filling the vacated high bytes with
// zero. Output byte i is input byte (i + kBytes) while that is below 16.
// The shuffle needs literal indices, hence one case per shift amount:
// indices 0..15 select bytes of v, index 16 selects byte 0 of `zero`.
template <int kBytes, typename T>
HWY_API Vec128<T> ShiftRightBytes(const Vec128<T> v) {
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
const __i8x16 zero = wasm_i8x16_splat(0);
switch (kBytes) {
case 0:
// No shift: return the input unchanged.
return v;
case 1:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16)};
case 2:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 16)};
case 3:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 16, 16)};
case 4:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 16, 16, 16)};
case 5:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 16, 16, 16, 16)};
case 6:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 16, 16, 16, 16, 16)};
case 7:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 16, 16, 16, 16, 16, 16)};
case 8:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14,
15, 16, 16, 16, 16, 16, 16, 16, 16)};
case 9:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14,
15, 16, 16, 16, 16, 16, 16, 16, 16,
16)};
case 10:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15,
16, 16, 16, 16, 16, 16, 16, 16, 16,
16)};
case 11:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
16)};
case 12:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
16)};
case 13:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
16)};
case 14:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
16)};
case 15:
return Vec128<T>{wasm_v8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
16)};
}
// kBytes == 16: every byte has been shifted out.
return Vec128<T>{zero};
}
// Shifts by whole lanes: reinterprets v as bytes, shifts right by
// kLanes * sizeof(T) bytes, then reinterprets back as lanes of T.
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightLanes(const Vec128<T, N> v) {
  const Simd<T, N> d;
  const Simd<uint8_t, N * sizeof(T)> d8;
  const auto as_bytes = BitCast(d8, v);
  const auto shifted = ShiftRightBytes<kLanes * sizeof(T)>(as_bytes);
  return BitCast(d, shifted);
}
// ------------------------------ Extract from 2x 128-bit at constant offset
// Extracts 128 bits from <hi, lo> by skipping the least-significant kBytes.
// Treats lo as bytes 0..15 and hi as bytes 16..31 of a 32-byte value and
// returns bytes kBytes..kBytes+15 of it. kBytes is a compile-time constant in
// [0, 16]; the shuffle needs literal indices, hence one case per value.
template <int kBytes, typename T>
HWY_API Vec128<T> CombineShiftRightBytes(const Vec128<T> hi,
const Vec128<T> lo) {
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
switch (kBytes) {
case 0:
// Nothing skipped: the result is exactly lo.
return lo;
case 1:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15, 16)};
case 2:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16, 17)};
case 3:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18)};
case 4:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19)};
case 5:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20)};
case 6:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20,
21)};
case 7:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21,
22)};
case 8:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22,
23)};
case 9:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23,
24)};
case 10:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23,
24, 25)};
case 11:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24,
25, 26)};
case 12:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25,
26, 27)};
case 13:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28)};
case 14:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18,
19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29)};
case 15:
return Vec128<T>{wasm_v8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30)};
}
// kBytes == 16: all of lo is skipped, leaving exactly hi.
return hi;
}
// ------------------------------ Broadcast/splat any lane
// Unsigned
// Replicates 16-bit lane kLane of v into every lane of the result.
template <int kLane, size_t N>
HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  const auto splat = wasm_v16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
                                        kLane, kLane, kLane, kLane, kLane);
  return Vec128<uint16_t, N>{splat};
}
// Replicates 32-bit lane kLane of v into every lane of the result.
template <int kLane, size_t N>
HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  const auto splat =
      wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane);
  return Vec128<uint32_t, N>{splat};
}
// Signed
// Replicates 16-bit lane kLane of v into every lane of the result.
template <int kLane, size_t N>
HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  const auto splat = wasm_v16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
                                        kLane, kLane, kLane, kLane, kLane);
  return Vec128<int16_t, N>{splat};
}
// Replicates 32-bit lane kLane of v into every lane of the result.
template <int kLane, size_t N>
HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  const auto splat =
      wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane);
  return Vec128<int32_t, N>{splat};
}
// Float
// Replicates float lane kLane of v into every lane of the result.
template <int kLane, size_t N>
HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  const auto splat =
      wasm_v32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane);
  return Vec128<float, N>{splat};
}
// ------------------------------ Shuffle bytes with variable indices
// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes:
// either valid indices in [0, 16) or >= 0x80 to zero the i-th output byte.
// Any other index (16..0x7F) is outside the contract; it also yields zero
// here instead of reading past the end of the 16-byte table.
template <typename T, typename TI>
HWY_API Vec128<T> TableLookupBytes(const Vec128<T> bytes,
                                   const Vec128<TI> from) {
  // TODO(eustas): use swizzle? what about 0x80+ indices?
  alignas(16) uint8_t control[16];
  alignas(16) uint8_t input[16];
  alignas(16) uint8_t output[16];
  wasm_v128_store(control, from.raw);
  wasm_v128_store(input, bytes.raw);
  // TODO(eustas): wasm_v8x16_shuffle does not work: params have to be
  // constants.
  for (size_t i = 0; i < 16; ++i) {
    const uint8_t idx = control[i];
    // Bounds check covers both the documented "zero" encoding (>= 0x80) and
    // stray indices that would otherwise index beyond input[15].
    output[i] = (idx < 16) ? input[idx] : uint8_t{0};
  }
  return Vec128<T>{wasm_v128_load(output)};
}
// ------------------------------ Hard-coded shuffles
// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
// Swap 32-bit halves in 64-bit halves.
// Result lanes 0..3 are taken from input lanes 1, 0, 3, 2.
HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
  // TODO(eustas): use swizzle?
  const auto swapped = wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2);
  return Vec128<uint32_t>{swapped};
}
// Result lanes 0..3 are taken from input lanes 1, 0, 3, 2.
HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
  // TODO(eustas): use swizzle?
  const auto swapped = wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2);
  return Vec128<int32_t>{swapped};
}
// Result lanes 0..3 are taken from input lanes 1, 0, 3, 2.
HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
  // TODO(eustas): use swizzle?
  const auto swapped = wasm_v32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2);
  return Vec128<float>{swapped};
}
// Swap 64-bit halves
// Exchanges the two 64-bit halves of the vector.
HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
  // TODO(eustas): use swizzle?
  const auto swapped = wasm_v64x2_shuffle(v.raw, v.raw, 1, 0);
  return Vec128<uint32_t>{swapped};
}
// Exchanges the two 64-bit halves of the vector.
HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
  // TODO(eustas): use swizzle?
  const auto swapped = wasm_v64x2_shuffle(v.raw, v.raw, 1, 0);
  return Vec128<int32_t>{swapped};
}
// Exchanges the two 64-bit halves of the vector.
HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
  // TODO(eustas): use swizzle?
  const auto swapped = wasm_v64x2_shuffle(v.raw, v.raw, 1, 0);
  return Vec128<float>{swapped};
}
// Rotate right 32 bits
// Result lanes 0..3 are taken from input lanes 1, 2, 3, 0.
HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
  // TODO(eustas): use swizzle?
  const auto rotated = wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0);
  return Vec128<uint32_t>{rotated};
}
// Result lanes 0..3 are taken from input lanes 1, 2, 3, 0.
HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
  // TODO(eustas): use swizzle?
  const auto rotated = wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0);
  return Vec128<int32_t>{rotated};
}
// Result lanes 0..3 are taken from input lanes 1, 2, 3, 0.
HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
  // TODO(eustas): use swizzle?
  const auto rotated = wasm_v32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0);
  return Vec128<float>{rotated};
}
// Rotate left 32 bits
// Result lanes 0..3 are taken from input lanes 3, 0, 1, 2.
HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
  // TODO(eustas): use swizzle?
  const auto rotated = wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2);
  return Vec128<uint32_t>{rotated};
}
// Result lanes 0..3 are taken from input lanes 3, 0, 1, 2.
HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
  // TODO(eustas): use swizzle?
  const auto rotated = wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2);
  return Vec128<int32_t>{rotated};
}
// Result lanes 0..3 are taken from input lanes 3, 0, 1, 2.
HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
  // TODO(eustas): use swizzle?
  const auto rotated = wasm_v32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2);
  return Vec128<float>{rotated};
}
// Reverse
// Result lanes 0..3 are taken from input lanes 3, 2, 1, 0.
HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
  // TODO(eustas): use swizzle?
  const auto reversed = wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0);
  return Vec128<uint32_t>{reversed};
}
// Result lanes 0..3 are taken from input lanes 3, 2, 1, 0.
HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
  // TODO(eustas): use swizzle?
  const auto reversed = wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0);
  return Vec128<int32_t>{reversed};
}
// Result lanes 0..3 are taken from input lanes 3, 2, 1, 0.
HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
  // TODO(eustas): use swizzle?
  const auto reversed = wasm_v32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0);
  return Vec128<float>{reversed};
}
// ------------------------------ Permute (runtime variable)
// Returned by SetTableIndices for use by TableLookupLanes.
// T is the lane type the indices were built for; it is not used by the
// layout and only distinguishes index sets of different lane types.
template <typename T>
struct permute_wasm {
// Per-byte shuffle control as produced by SetTableIndices.
__v128_u raw;
};
// Expands per-lane indices into a per-byte control vector: lane index k
// becomes the byte indices k * sizeof(T) .. k * sizeof(T) + sizeof(T) - 1.
// idx must point to 16 / sizeof(T) entries; in debug/ASan builds each entry
// is checked to be a valid lane number.
template <typename T>
HWY_API permute_wasm<T> SetTableIndices(Full128<T>, const int32_t* idx) {
#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
  const size_t N = 16 / sizeof(T);
  for (size_t i = 0; i < N; ++i) {
    HWY_DASSERT(0 <= idx[i] && idx[i] < static_cast<int32_t>(N));
  }
#endif
  constexpr size_t kLaneBytes = sizeof(T);
  alignas(16) uint8_t control[16];
  size_t pos = 0;
  for (size_t lane = 0; lane < 16 / kLaneBytes; ++lane) {
    // Byte offset of the source lane within the table.
    const size_t first_byte = idx[lane] * kLaneBytes;
    for (size_t offset = 0; offset < kLaneBytes; ++offset) {
      control[pos++] = static_cast<uint8_t>(first_byte + offset);
    }
  }
  return permute_wasm<T>{Load(Full128<uint8_t>(), control).raw};
}
// Permutes lanes of v using byte-level indices prepared by SetTableIndices.
HWY_API Vec128<uint32_t> TableLookupLanes(const Vec128<uint32_t> v,
                                          const permute_wasm<uint32_t> idx) {
  const Vec128<uint8_t> byte_indices{idx.raw};
  return TableLookupBytes(v, byte_indices);
}
// Permutes lanes of v using byte-level indices prepared by SetTableIndices.
HWY_API Vec128<int32_t> TableLookupLanes(const Vec128<int32_t> v,
                                         const permute_wasm<int32_t> idx) {
  const Vec128<uint8_t> byte_indices{idx.raw};
  return TableLookupBytes(v, byte_indices);
}
// Permutes lanes of v using byte-level indices prepared by SetTableIndices.
HWY_API Vec128<float> TableLookupLanes(const Vec128<float> v,
                                       const permute_wasm<float> idx) {
  const Vec128<uint8_t> byte_indices{idx.raw};
  return TableLookupBytes(v, byte_indices);
}
// ------------------------------ Zip lanes
// Same as Interleave*, except that the return lanes are double-width integers;
// this is necessary because the single-lane scalar cannot return two values.
// Interleaves the low 8 bytes of a and b (a first), yielding u16 lanes whose
// low byte comes from a and high byte from b.
template <size_t N>
HWY_API Vec128<uint16_t, (N + 1) / 2> ZipLower(const Vec128<uint8_t, N> a,
                                               const Vec128<uint8_t, N> b) {
  using Wide = Vec128<uint16_t, (N + 1) / 2>;
  const auto zipped = wasm_v8x16_shuffle(a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3,
                                         19, 4, 20, 5, 21, 6, 22, 7, 23);
  return Wide{zipped};
}
// Interleaves the low four u16 lanes of a and b (a first), yielding u32
// lanes whose low half comes from a and high half from b.
template <size_t N>
HWY_API Vec128<uint32_t, (N + 1) / 2> ZipLower(const Vec128<uint16_t, N> a,
                                               const Vec128<uint16_t, N> b) {
  using Wide = Vec128<uint32_t, (N + 1) / 2>;
  const auto zipped =
      wasm_v16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11);
  return Wide{zipped};
}
// Interleaves the low 8 bytes of a and b (a first), yielding i16 lanes whose
// low byte comes from a and high byte from b.
template <size_t N>
HWY_API Vec128<int16_t, (N + 1) / 2> ZipLower(const Vec128<int8_t, N> a,
                                              const Vec128<int8_t, N> b) {
  using Wide = Vec128<int16_t, (N + 1) / 2>;
  const auto zipped = wasm_v8x16_shuffle(a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3,
                                         19, 4, 20, 5, 21, 6, 22, 7, 23);
  return Wide{zipped};
}
// Interleaves the low four i16 lanes of a and b (a first), yielding i32
// lanes whose low half comes from a and high half from b.
template <size_t N>
HWY_API Vec128<int32_t, (N + 1) / 2> ZipLower(const Vec128<int16_t, N> a,
                                              const Vec128<int16_t, N> b) {
  using Wide = Vec128<int32_t, (N + 1) / 2>;
  const auto zipped =
      wasm_v16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11);
  return Wide{zipped};
}
// Interleaves the high 8 bytes of a and b (a first), yielding u16 lanes
// whose low byte comes from a and high byte from b.
template <size_t N>
HWY_API Vec128<uint16_t, N / 2> ZipUpper(const Vec128<uint8_t, N> a,
                                         const Vec128<uint8_t, N> b) {
  using Wide = Vec128<uint16_t, N / 2>;
  const auto zipped = wasm_v8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26,
                                         11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
  return Wide{zipped};
}
// Interleaves the high four u16 lanes of a and b (a first), yielding u32
// lanes whose low half comes from a and high half from b.
template <size_t N>
HWY_API Vec128<uint32_t, N / 2> ZipUpper(const Vec128<uint16_t, N> a,
                                         const Vec128<uint16_t, N> b) {
  using Wide = Vec128<uint32_t, N / 2>;
  const auto zipped =
      wasm_v16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15);
  return Wide{zipped};
}
// Interleaves the high 8 bytes of a and b (a first), yielding i16 lanes
// whose low byte comes from a and high byte from b.
template <size_t N>
HWY_API Vec128<int16_t, N / 2> ZipUpper(const Vec128<int8_t, N> a,
                                        const Vec128<int8_t, N> b) {
  using Wide = Vec128<int16_t, N / 2>;
  const auto zipped = wasm_v8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26,
                                         11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
  return Wide{zipped};
}
// Interleaves the high four i16 lanes of a and b (a first), yielding i32
// lanes whose low half comes from a and high half from b.
template <size_t N>
HWY_API Vec128<int32_t, N / 2> ZipUpper(const Vec128<int16_t, N> a,
                                        const Vec128<int16_t, N> b) {
  using Wide = Vec128<int32_t, N / 2>;
  const auto zipped =
      wasm_v16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15);
  return Wide{zipped};
}
// ------------------------------ Interleave lanes
// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
// the least-significant lane) and "b". To concatenate two half-width integers
// into one, use ZipLower/Upper instead (also works with scalar).
// Generic case: reuses ZipLower and reinterprets its double-width lanes as
// lanes of the original type T.
template <typename T>
HWY_API Vec128<T> InterleaveLower(const Vec128<T> a, const Vec128<T> b) {
  const auto zipped = ZipLower(a, b);
  return Vec128<T>{zipped.raw};
}
// u32: result lanes are a0, b0, a1, b1.
template <>
HWY_INLINE Vec128<uint32_t> InterleaveLower<uint32_t>(
    const Vec128<uint32_t> a, const Vec128<uint32_t> b) {
  const auto mixed = wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5);
  return Vec128<uint32_t>{mixed};
}
// i32: result lanes are a0, b0, a1, b1.
template <>
HWY_INLINE Vec128<int32_t> InterleaveLower<int32_t>(const Vec128<int32_t> a,
                                                    const Vec128<int32_t> b) {
  const auto mixed = wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5);
  return Vec128<int32_t>{mixed};
}
// f32: result lanes are a0, b0, a1, b1.
template <>
HWY_INLINE Vec128<float> InterleaveLower<float>(const Vec128<float> a,
                                                const Vec128<float> b) {
  const auto mixed = wasm_v32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5);
  return Vec128<float>{mixed};
}
// Generic case: reuses ZipUpper and reinterprets its double-width lanes as
// lanes of the original type T.
template <typename T>
HWY_API Vec128<T> InterleaveUpper(const Vec128<T> a, const Vec128<T> b) {
  const auto zipped = ZipUpper(a, b);
  return Vec128<T>{zipped.raw};
}
// u32: result lanes are a2, b2, a3, b3.
template <>
HWY_INLINE Vec128<uint32_t> InterleaveUpper<uint32_t>(
    const Vec128<uint32_t> a, const Vec128<uint32_t> b) {
  const auto mixed = wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7);
  return Vec128<uint32_t>{mixed};
}
// i32: result lanes are a2, b2, a3, b3.
template <>
HWY_INLINE Vec128<int32_t> InterleaveUpper<int32_t>(const Vec128<int32_t> a,
                                                    const Vec128<int32_t> b) {
  const auto mixed = wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7);
  return Vec128<int32_t>{mixed};
}
// f32: result lanes are a2, b2, a3, b3.
template <>
HWY_INLINE Vec128<float> InterleaveUpper<float>(const Vec128<float> a,
                                                const Vec128<float> b) {
  const auto mixed = wasm_v32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7);
  return Vec128<float>{mixed};
}
// ------------------------------ Blocks
// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
// The lower 64 bits of lo end up in the lower half of the result, the lower
// 64 bits of hi in the upper half.
template <typename T>
HWY_API Vec128<T> ConcatLowerLower(const Vec128<T> hi, const Vec128<T> lo) {
  const auto joined = wasm_v64x2_shuffle(lo.raw, hi.raw, 0, 2);
  return Vec128<T>{joined};
}
// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
// The upper 64 bits of lo end up in the lower half of the result, the upper
// 64 bits of hi in the upper half.
template <typename T>
HWY_API Vec128<T> ConcatUpperUpper(const Vec128<T> hi, const Vec128<T> lo) {
  const auto joined = wasm_v64x2_shuffle(lo.raw, hi.raw, 1, 3);
  return Vec128<T>{joined};
}
// hiH,hiL loH,loL |-> hiL,loH (= inner halves)
// Equivalent to skipping the lowest 8 bytes of the <hi, lo> pair.
template <typename T>
HWY_API Vec128<T> ConcatLowerUpper(const Vec128<T> hi, const Vec128<T> lo) {
  const Vec128<T> inner = CombineShiftRightBytes<8>(hi, lo);
  return inner;
}
// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
// The lower 64 bits of lo end up in the lower half of the result, the upper
// 64 bits of hi in the upper half.
template <typename T>
HWY_API Vec128<T> ConcatUpperLower(const Vec128<T> hi, const Vec128<T> lo) {
  const auto joined = wasm_v64x2_shuffle(lo.raw, hi.raw, 0, 3);
  return Vec128<T>{joined};
}
// ------------------------------ Odd/even lanes
namespace {

// Per-lane-size helpers for OddEven, selected by hwy::SizeTag<sizeof(T)>.
// In every variant, even-numbered lanes of the result come from b and
// odd-numbered lanes from a.

// 8-bit lanes: blends through a constant mask that is set in even bytes.
template <typename T>
HWY_API Vec128<T> odd_even_impl(hwy::SizeTag<1> /* tag */, const Vec128<T> a,
                                const Vec128<T> b) {
  alignas(16) constexpr uint8_t kEvenBytes[16] = {
      0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
  const auto even_bytes = Load(Full128<uint8_t>(), kEvenBytes);
  const auto take_b = MaskFromVec(BitCast(Full128<T>(), even_bytes));
  return IfThenElse(take_b, b, a);
}

// 16-bit lanes: shuffle indices 8..15 refer to b, 0..7 to a.
template <typename T>
HWY_API Vec128<T> odd_even_impl(hwy::SizeTag<2> /* tag */, const Vec128<T> a,
                                const Vec128<T> b) {
  const auto mixed =
      wasm_v16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7);
  return Vec128<T>{mixed};
}

// 32-bit lanes: shuffle indices 4..7 refer to b, 0..3 to a.
template <typename T>
HWY_API Vec128<T> odd_even_impl(hwy::SizeTag<4> /* tag */, const Vec128<T> a,
                                const Vec128<T> b) {
  const auto mixed = wasm_v32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3);
  return Vec128<T>{mixed};
}

// TODO(eustas): implement
// template <typename T>
// HWY_API Vec128<T> odd_even_impl(hwy::SizeTag<8> /* tag */,
//                                 const Vec128<T> a,
//                                 const Vec128<T> b)

}  // namespace
// Combines odd lanes of a with even lanes of b; dispatches on the lane size.
template <typename T>
HWY_API Vec128<T> OddEven(const Vec128<T> a, const Vec128<T> b) {
  using LaneSize = hwy::SizeTag<sizeof(T)>;
  return odd_even_impl(LaneSize(), a, b);
}
// f32: result lanes are b0, a1, b2, a3.
template <>
HWY_INLINE Vec128<float> OddEven<float>(const Vec128<float> a,
                                        const Vec128<float> b) {
  const auto mixed = wasm_v32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3);
  return Vec128<float>{mixed};
}
// ================================================== CONVERT
// ------------------------------ Promotions (part w/ narrow lanes -> full)
// Unsigned: zero-extend.
// u8 -> u16: widens the low u8 lanes.
template <size_t N>
HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N> /* tag */,
                                      const Vec128<uint8_t, N> v) {
  const auto wide16 = wasm_i16x8_widen_low_u8x16(v.raw);
  return Vec128<uint16_t, N>{wide16};
}
// u8 -> u32: widens in two steps (u8 -> u16 -> u32).
template <size_t N>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
                                      const Vec128<uint8_t, N> v) {
  const auto wide16 = wasm_i16x8_widen_low_u8x16(v.raw);
  const auto wide32 = wasm_i32x4_widen_low_u16x8(wide16);
  return Vec128<uint32_t, N>{wide32};
}
// u8 -> i16: uses the unsigned widening of the low u8 lanes.
template <size_t N>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
                                     const Vec128<uint8_t, N> v) {
  const auto wide16 = wasm_i16x8_widen_low_u8x16(v.raw);
  return Vec128<int16_t, N>{wide16};
}
// u8 -> i32: unsigned widening in two steps (u8 -> u16 -> u32).
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
                                     const Vec128<uint8_t, N> v) {
  const auto wide16 = wasm_i16x8_widen_low_u8x16(v.raw);
  const auto wide32 = wasm_i32x4_widen_low_u16x8(wide16);
  return Vec128<int32_t, N>{wide32};
}
// u16 -> u32: widens the low u16 lanes.
template <size_t N>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
                                      const Vec128<uint16_t, N> v) {
  const auto wide32 = wasm_i32x4_widen_low_u16x8(v.raw);
  return Vec128<uint32_t, N>{wide32};
}
// u16 -> i32: uses the unsigned widening of the low u16 lanes.
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
                                     const Vec128<uint16_t, N> v) {
  const auto wide32 = wasm_i32x4_widen_low_u16x8(v.raw);
  return Vec128<int32_t, N>{wide32};
}
// Signed: replicate sign bit.
// i8 -> i16: signed widening of the low i8 lanes.
template <size_t N>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
                                     const Vec128<int8_t, N> v) {
  const auto wide16 = wasm_i16x8_widen_low_i8x16(v.raw);
  return Vec128<int16_t, N>{wide16};
}
// i8 -> i32: signed widening in two steps (i8 -> i16 -> i32).
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
                                     const Vec128<int8_t, N> v) {
  const auto wide16 = wasm_i16x8_widen_low_i8x16(v.raw);
  const auto wide32 = wasm_i32x4_widen_low_i16x8(wide16);
  return Vec128<int32_t, N>{wide32};
}
// i16 -> i32: signed widening of the low i16 lanes.
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
                                     const Vec128<int16_t, N> v) {
  const auto wide32 = wasm_i32x4_widen_low_i16x8(v.raw);
  return Vec128<int32_t, N>{wide32};
}
// i32 -> f64: converts lane by lane through memory. Only lanes 0 and (when
// N >= 2) 1 are converted; the second double is 0.0 otherwise.
template <size_t N>
HWY_API Vec128<double, N> PromoteTo(Simd<double, N> df,
                                    const Vec128<int32_t, N> v) {
  // TODO(janwas): use https://github.com/WebAssembly/simd/pull/383
  alignas(16) int32_t narrow[4];
  Store(v, Simd<int32_t, N>(), narrow);
  alignas(16) double wide[2] = {static_cast<double>(narrow[0]), 0.0};
  if (N >= 2) wide[1] = static_cast<double>(narrow[1]);
  return Load(df, wide);
}
// Widens the lowest four u8 lanes to u32 in two steps (u8 -> u16 -> u32).
HWY_API Vec128<uint32_t> U32FromU8(const Vec128<uint8_t> v) {
  const auto wide16 = wasm_i16x8_widen_low_u8x16(v.raw);
  const auto wide32 = wasm_i32x4_widen_low_u16x8(wide16);
  return Vec128<uint32_t>{wide32};
}
// ------------------------------ Demotions (full -> part w/ narrow lanes)
// i32 -> u16 narrowing; the input is passed as both halves of the narrowing
// instruction, so the low lanes of the result hold the demoted values.
template <size_t N>
HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N> /* tag */,
                                     const Vec128<int32_t, N> v) {
  const auto narrow16 = wasm_u16x8_narrow_i32x4(v.raw, v.raw);
  return Vec128<uint16_t, N>{narrow16};
}
// i32 -> i16 narrowing; the input is passed as both halves of the narrowing
// instruction, so the low lanes of the result hold the demoted values.
template <size_t N>
HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N> /* tag */,
                                    const Vec128<int32_t, N> v) {
  const auto narrow16 = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  return Vec128<int16_t, N>{narrow16};
}
// i32 -> u8 in two narrowing steps: i32 -> i16 (signed), then i16 -> u8.
template <size_t N>
HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
                                    const Vec128<int32_t, N> v) {
  const auto narrow16 = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  const auto narrow8 = wasm_u8x16_narrow_i16x8(narrow16, narrow16);
  return Vec128<uint8_t, N>{narrow8};
}
// i16 -> u8 narrowing; the input is passed as both halves of the narrowing
// instruction, so the low lanes of the result hold the demoted values.
template <size_t N>
HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
                                    const Vec128<int16_t, N> v) {
  const auto narrow8 = wasm_u8x16_narrow_i16x8(v.raw, v.raw);
  return Vec128<uint8_t, N>{narrow8};
}
// i32 -> i8 in two signed narrowing steps: i32 -> i16, then i16 -> i8.
template <size_t N>
HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
                                   const Vec128<int32_t, N> v) {
  const auto narrow16 = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  const auto narrow8 = wasm_i8x16_narrow_i16x8(narrow16, narrow16);
  return Vec128<int8_t, N>{narrow8};
}
// i16 -> i8 narrowing; the input is passed as both halves of the narrowing
// instruction, so the low lanes of the result hold the demoted values.
template <size_t N>
HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
                                   const Vec128<int16_t, N> v) {
  const auto narrow8 = wasm_i8x16_narrow_i16x8(v.raw, v.raw);
  return Vec128<int8_t, N>{narrow8};
}
// f64 -> i32: converts lane by lane through memory using static_cast.
// Lanes beyond those converted are zero in the temporary buffer.
template <size_t N>
HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N> di,
                                    const Vec128<double, N> v) {
  // TODO(janwas): use https://github.com/WebAssembly/simd/pull/383
  alignas(16) double wide[2];
  Store(v, Simd<double, N>(), wide);
  alignas(16) int32_t narrow[4] = {0, 0, 0, 0};
  narrow[0] = static_cast<int32_t>(wide[0]);
  if (N >= 2) narrow[1] = static_cast<int32_t>(wide[1]);
  return Load(di, narrow);
}
// For already range-limited input [0, 255].
// Narrows u32 lanes to u8 in two steps (32 -> 16 -> 8 bits).
HWY_API Vec128<uint8_t, 4> U8FromU32(const Vec128<uint32_t> v) {
  const auto narrow16 = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
  const auto narrow8 = wasm_u8x16_narrow_i16x8(narrow16, narrow16);
  return Vec128<uint8_t, 4>{narrow8};
}
// ------------------------------ Convert i32 <=> f32
// i32 -> f32 lane-wise conversion.
template <size_t N>
HWY_API Vec128<float, N> ConvertTo(Simd<float, N> /* tag */,
                                   const Vec128<int32_t, N> v) {
  const auto as_float = wasm_f32x4_convert_i32x4(v.raw);
  return Vec128<float, N>{as_float};
}
// f32 -> i32 lane-wise conversion. Truncates (rounds toward zero) and
// saturates, as performed by the trunc_saturate instruction.
template <size_t N>
HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N> /* tag */,
                                     const Vec128<float, N> v) {
  const auto as_int = wasm_i32x4_trunc_saturate_f32x4(v.raw);
  return Vec128<int32_t, N>{as_int};
}
// Rounds to an integer by moving each lane 0.5 away from zero and then
// truncating, so exact halves round away from zero.
// NOTE(review): the +/-0.5 addition is itself rounded in float arithmetic,
// which may affect inputs just below a half - confirm this is acceptable.
template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
  const __f32x4 kZero = wasm_f32x4_splat(0.0f);
  const __f32x4 kHalf = wasm_f32x4_splat(0.5f);
  // The comparison result (-1 as i32 where v <= 0, else 0) converted to
  // float gives -1.0f or 0.0f.
  const __f32x4 minus_one_if_le0 =
      wasm_f32x4_convert_i32x4(wasm_f32x4_le(v.raw, kZero));
  // +0.5 for positive lanes, -0.5 for lanes that are <= 0.
  const __f32x4 signed_half = wasm_f32x4_add(kHalf, minus_one_if_le0);
  const __f32x4 biased = wasm_f32x4_add(v.raw, signed_half);
  const auto truncated = wasm_i32x4_trunc_saturate_f32x4(biased);
  return Vec128<int32_t, N>{truncated};
}
// ================================================== MISC
// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, size_t N, typename T2>
Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
HWY_ALIGN T lanes[16 / sizeof(T)];
for (size_t i = 0; i < 16 / sizeof(T); ++i) {
lanes[i] = static_cast<T>(first + static_cast<T2>(i));
}
return Load(d, lanes);
}
// ------------------------------ Mask
// True if no byte of the mask is nonzero.
template <typename T>
HWY_API bool AllFalse(const Mask128<T> v) {
  const bool any_set = wasm_i8x16_any_true(v.raw);
  return !any_set;
}
// Float masks: true if no 32-bit lane of the mask is nonzero.
HWY_API bool AllFalse(const Mask128<float> v) {
  const bool any_set = wasm_i32x4_any_true(v.raw);
  return !any_set;
}
// True if every byte of the mask is nonzero.
template <typename T>
HWY_API bool AllTrue(const Mask128<T> v) {
  const bool all_set = wasm_i8x16_all_true(v.raw);
  return all_set;
}
// Float masks: true if every 32-bit lane of the mask is nonzero.
HWY_API bool AllTrue(const Mask128<float> v) {
  const bool all_set = wasm_i32x4_all_true(v.raw);
  return all_set;
}
namespace impl {

// 8-bit lanes: packs one bit per byte of the mask into the low 16 bits of
// the result (bit i corresponds to byte i).
template <typename T>
HWY_API uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, const Mask128<T> mask) {
  // Within each u32 lane, byte i is reduced to (1 << i) or 0.
  const __i8x16 kBitPerByte =
      wasm_i8x16_make(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8);
  const __i8x16 bit_per_byte = wasm_v128_and(mask.raw, kBitPerByte);
  // Fold the four bytes of every u32 into its top byte: OR with a copy
  // shifted up by 16 bits, then with a copy shifted up by 8 bits.
  const __i16x8 up16 = wasm_i32x4_shl(bit_per_byte, 16);
  const __i16x8 folded16 = wasm_v128_or(bit_per_byte, up16);
  const __i16x8 up8 = wasm_i32x4_shl(folded16, 8);
  const __i16x8 folded8 = wasm_v128_or(folded16, up8);
  // Move the top byte (a 4-bit value) down to the bottom of each u32.
  const __i16x8 nibble_per_u32 = wasm_i32x4_shr(folded8, 24);
  // Concatenate the four nibbles, lane 0 lowest.
  alignas(16) uint32_t nibbles[4];
  wasm_v128_store(nibbles, nibble_per_u32);
  uint32_t bits = 0;
  for (size_t i = 4; i != 0; --i) {
    bits = (bits << 4) | nibbles[i - 1];
  }
  return bits;
}

// 16-bit lanes: narrows each lane to one byte (upper 8 bytes become zero)
// and reuses the 8-bit implementation.
template <typename T>
HWY_API uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, const Mask128<T> mask) {
  // Remove useless lower half of each u16 while preserving the sign bit.
  const __i16x8 kZero = wasm_i16x8_splat(0);
  const Mask128<T> as_bytes{wasm_i8x16_narrow_i16x8(mask.raw, kZero)};
  return BitsFromMask(hwy::SizeTag<1>(), as_bytes);
}

// 32-bit lanes: lane i is reduced to (1 << i) or 0 and the four lanes are
// ORed together.
template <typename T>
HWY_API uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, const Mask128<T> mask) {
  const __i32x4 kBitPerLane = wasm_i32x4_make(1, 2, 4, 8);
  const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
  const __i32x4 bit_per_lane = wasm_v128_and(mask_i, kBitPerLane);
  alignas(16) uint32_t lanes[4];
  wasm_v128_store(lanes, bit_per_lane);
  const uint32_t low_pair = lanes[0] | lanes[1];
  const uint32_t high_pair = lanes[2] | lanes[3];
  return low_pair | high_pair;
}

}  // namespace impl
// Returns one bit per mask lane; dispatches on the lane size.
template <typename T>
HWY_API uint64_t BitsFromMask(const Mask128<T> mask) {
  using LaneSize = hwy::SizeTag<sizeof(T)>;
  return impl::BitsFromMask(LaneSize(), mask);
}
// Counts mask lanes that are set. Keeps one bit per byte (bit 0 in the low
// 8 bytes, bit 1 in the high 8 bytes) so that ORing the two 64-bit halves
// cannot merge bits; the population count is then the number of set bytes,
// which is divided by the lane size to obtain lanes.
template <typename T>
HWY_API size_t CountTrue(const Mask128<T> v) {
  const __i32x4 kBitPerByte =
      wasm_i32x4_make(0x01010101, 0x01010101, 0x02020202, 0x02020202);
  const __i8x16 bit_per_byte = wasm_v128_and(v.raw, kBitPerByte);
  alignas(16) uint64_t halves[2];
  wasm_v128_store(halves, bit_per_byte);
  const uint64_t merged = halves[0] | halves[1];
  return PopCount(merged) / sizeof(T);
}
// Float masks: keeps a distinct bit (1, 2, 4, 8) per 32-bit lane, ORs the
// two 64-bit halves and counts the bits that remain.
HWY_API size_t CountTrue(const Mask128<float> v) {
  const __i32x4 kBitPerLane = wasm_i32x4_make(1, 2, 4, 8);
  const __i32x4 bit_per_lane = wasm_v128_and(v.raw, kBitPerLane);
  alignas(16) uint64_t halves[2];
  wasm_v128_store(halves, bit_per_lane);
  const uint64_t merged = halves[0] | halves[1];
  return PopCount(merged);
}
// ------------------------------ Reductions
// TODO(eustas): optimize
// Returns 64-bit sums of 8-byte groups.
// Step 1 adds the two bytes of every u16 lane. Step 2 multiplies each u64 by
// a constant with 1 in every 16-bit field, which accumulates the four u16
// sums in the top 16 bits; step 3 shifts that total down.
HWY_API Vec128<uint64_t> SumsOfU8x8(const Vec128<uint8_t> v) {
  const auto kLowByte = wasm_i16x8_splat(0xFF);
  const auto kOnes = wasm_i16x8_splat(1);
  const auto even_bytes = wasm_v128_and(v.raw, kLowByte);
  const auto odd_bytes = wasm_u16x8_shr(v.raw, 8);
  const auto pair_sums = wasm_i16x8_add(odd_bytes, even_bytes);
  const auto total_in_top = wasm_i64x2_mul(pair_sums, kOnes);
  const auto totals = wasm_u64x2_shr(total_in_top, 48);
  return Vec128<uint64_t>{totals};
}
namespace detail {

// For u32/i32/f32: two shuffle+combine steps. After swapping the 64-bit
// halves and combining, every lane holds the result of a pair; rotating by
// one lane and combining again leaves the full reduction in every lane.
template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec128<T, N> v3210) {
  const Vec128<T> halves_swapped = Shuffle1032(v3210);
  const Vec128<T> pair_sums = v3210 + halves_swapped;
  const Vec128<T> pair_sums_rotated = Shuffle0321(pair_sums);
  return pair_sums_rotated + pair_sums;
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec128<T, N> v3210) {
  const Vec128<T> halves_swapped = Shuffle1032(v3210);
  const Vec128<T> pair_mins = Min(v3210, halves_swapped);
  const Vec128<T> pair_mins_rotated = Shuffle0321(pair_mins);
  return Min(pair_mins_rotated, pair_mins);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec128<T, N> v3210) {
  const Vec128<T> halves_swapped = Shuffle1032(v3210);
  const Vec128<T> pair_maxs = Max(v3210, halves_swapped);
  const Vec128<T> pair_maxs_rotated = Shuffle0321(pair_maxs);
  return Max(pair_maxs_rotated, pair_maxs);
}

// For u64/i64/f64: a single step combining the vector with its lane-swapped
// copy (Shuffle01).
template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec128<T, N> v10) {
  const Vec128<T> lanes_swapped = Shuffle01(v10);
  return v10 + lanes_swapped;
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec128<T, N> v10) {
  const Vec128<T> lanes_swapped = Shuffle01(v10);
  return Min(v10, lanes_swapped);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec128<T, N> v10) {
  const Vec128<T> lanes_swapped = Shuffle01(v10);
  return Max(v10, lanes_swapped);
}

}  // namespace detail
// Supported for u/i/f 32/64. Returns the sum in each lane.
// Dispatches to the detail:: overload matching the lane size.
template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
  using LaneSize = hwy::SizeTag<sizeof(T)>;
  return detail::SumOfLanes(LaneSize(), v);
}
// Returns the minimum of all lanes in each lane; dispatches to the detail::
// overload matching the lane size.
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(const Vec128<T, N> v) {
  using LaneSize = hwy::SizeTag<sizeof(T)>;
  return detail::MinOfLanes(LaneSize(), v);
}
// Returns the maximum of all lanes in each lane; dispatches to the detail::
// overload matching the lane size.
template <typename T, size_t N>
HWY_API Vec128<T, N> MaxOfLanes(const Vec128<T, N> v) {
  using LaneSize = hwy::SizeTag<sizeof(T)>;
  return detail::MaxOfLanes(LaneSize(), v);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();