// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// 128-bit WASM vectors and operations.
// External include guard in highway.h - see comment there.
#include <stddef.h>
#include <stdint.h>
#include <wasm_simd128.h>
#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
#ifdef HWY_WASM_OLD_NAMES
#define wasm_i8x16_shuffle wasm_v8x16_shuffle
#define wasm_i16x8_shuffle wasm_v16x8_shuffle
#define wasm_i32x4_shuffle wasm_v32x4_shuffle
#define wasm_i64x2_shuffle wasm_v64x2_shuffle
#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;
template <typename T>
using Full64 = Simd<T, 8 / sizeof(T), 0>;
namespace detail {
template <typename T>
struct Raw128 {
using type = __v128_u;
};
template <>
struct Raw128<float> {
using type = __f32x4;
};
} // namespace detail
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
using Raw = typename detail::Raw128<T>::type;
public:
// Compound assignment. Only usable if there is a corresponding non-member
// binary operator overload. For example, only f32 and f64 support division.
HWY_INLINE Vec128& operator*=(const Vec128 other) {
return *this = (*this * other);
}
HWY_INLINE Vec128& operator/=(const Vec128 other) {
return *this = (*this / other);
}
HWY_INLINE Vec128& operator+=(const Vec128 other) {
return *this = (*this + other);
}
HWY_INLINE Vec128& operator-=(const Vec128 other) {
return *this = (*this - other);
}
HWY_INLINE Vec128& operator&=(const Vec128 other) {
return *this = (*this & other);
}
HWY_INLINE Vec128& operator|=(const Vec128 other) {
return *this = (*this | other);
}
HWY_INLINE Vec128& operator^=(const Vec128 other) {
return *this = (*this ^ other);
}
Raw raw;
};
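// Illustrative usage sketch (not part of the API): the compound operators
// above forward to the non-member operators defined later in this header, e.g.
//   Vec128<float> v = Set(Full128<float>(), 2.0f);
//   v += Set(Full128<float>(), 1.0f);  // each lane is now 3.0f
//   v /= Set(Full128<float>(), 2.0f);  // OK: operator/ exists for float here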
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;
// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
typename detail::Raw128<T>::type raw;
};
namespace detail {
// Deduce Simd<T, N, 0> from Vec128<T, N>
struct DeduceD {
template <typename T, size_t N>
Simd<T, N, 0> operator()(Vec128<T, N>) const {
return Simd<T, N, 0>();
}
};
} // namespace detail
template <class V>
using DFromV = decltype(detail::DeduceD()(V()));
template <class V>
using TFromV = TFromD<DFromV<V>>;
// ------------------------------ BitCast
namespace detail {
HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; }
HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) {
return static_cast<__v128_u>(v);
}
HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) {
return static_cast<__v128_u>(v);
}
template <typename T, size_t N>
HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
}
// Cannot rely on function overloading because return types differ.
template <typename T>
struct BitCastFromInteger128 {
HWY_INLINE __v128_u operator()(__v128_u v) { return v; }
};
template <>
struct BitCastFromInteger128<float> {
HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); }
};
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N, 0> /* tag */,
Vec128<uint8_t, N * sizeof(T)> v) {
return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
}
} // namespace detail
template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}
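// Usage sketch (only ops from this header): reinterpret lane bits without any
// value conversion, e.g. to inspect the IEEE-754 encoding of float lanes:
//   const Full128<float> df;
//   const Full128<uint32_t> du;
//   const auto bits = BitCast(du, Set(df, 1.0f));  // each lane: 0x3F800000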
// ------------------------------ Zero
// Returns an all-zero vector/part.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
return Vec128<T, N>{wasm_i32x4_splat(0)};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
return Vec128<float, N>{wasm_f32x4_splat(0.0f)};
}
template <class D>
using VFromD = decltype(Zero(D()));
// ------------------------------ Set
// Returns a vector/part with all lanes set to "t".
template <size_t N, HWY_IF_LE128(uint8_t, N)>
HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */, const uint8_t t) {
return Vec128<uint8_t, N>{wasm_i8x16_splat(static_cast<int8_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
const uint16_t t) {
return Vec128<uint16_t, N>{wasm_i16x8_splat(static_cast<int16_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
const uint32_t t) {
return Vec128<uint32_t, N>{wasm_i32x4_splat(static_cast<int32_t>(t))};
}
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
const uint64_t t) {
return Vec128<uint64_t, N>{wasm_i64x2_splat(static_cast<int64_t>(t))};
}
template <size_t N, HWY_IF_LE128(int8_t, N)>
HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
return Vec128<int8_t, N>{wasm_i8x16_splat(t)};
}
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */, const int16_t t) {
return Vec128<int16_t, N>{wasm_i16x8_splat(t)};
}
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */, const int32_t t) {
return Vec128<int32_t, N>{wasm_i32x4_splat(t)};
}
template <size_t N, HWY_IF_LE128(int64_t, N)>
HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */, const int64_t t) {
return Vec128<int64_t, N>{wasm_i64x2_splat(t)};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
return Vec128<float, N>{wasm_f32x4_splat(t)};
}
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
// Returns a vector with uninitialized elements.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
return Zero(d);
}
HWY_DIAGNOSTICS(pop)
// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, size_t N, typename T2>
Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
HWY_ALIGN T lanes[16 / sizeof(T)];
for (size_t i = 0; i < 16 / sizeof(T); ++i) {
lanes[i] = static_cast<T>(first + static_cast<T2>(i));
}
return Load(d, lanes);
}
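// Usage sketch (lane values shown for a full 128-bit vector of int32):
//   const Full128<int32_t> d;
//   const auto v = Iota(d, 1) + Set(d, 10);  // lanes: 11, 12, 13, 14
//   const auto z = Zero(d);                  // lanes: 0, 0, 0, 0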
// ================================================== ARITHMETIC
// ------------------------------ Addition
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
const Vec128<uint64_t, N> b) {
return Vec128<uint64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Vec128<int32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
const Vec128<int64_t, N> b) {
return Vec128<int64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
}
// Float
template <size_t N>
HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Vec128<float, N>{wasm_f32x4_add(a.raw, b.raw)};
}
// ------------------------------ Subtraction
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
const Vec128<uint64_t, N> b) {
return Vec128<uint64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Vec128<int32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
const Vec128<int64_t, N> b) {
return Vec128<int64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
}
// Float
template <size_t N>
HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Vec128<float, N>{wasm_f32x4_sub(a.raw, b.raw)};
}
// ------------------------------ SaturatedAdd
// Returns a + b clamped to the destination range.
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
}
// ------------------------------ SaturatedSub
// Returns a - b clamped to the destination range.
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
}
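// Saturation sketch for uint8 lanes (results clamp to [0, 255]):
//   const Full128<uint8_t> d;
//   SaturatedAdd(Set(d, 200), Set(d, 100));  // lanes: 255, not (300 & 0xFF) = 44
//   SaturatedSub(Set(d, 10), Set(d, 20));    // lanes: 0, not wrapped 246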
// ------------------------------ AverageRound
// Returns (a + b + 1) / 2
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_u8x16_avgr(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)};
}
// ------------------------------ Absolute value
// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
template <size_t N>
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
return Vec128<int8_t, N>{wasm_i8x16_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
return Vec128<int16_t, N>{wasm_i16x8_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
return Vec128<int64_t, N>{wasm_i64x2_abs(v.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
return Vec128<float, N>{wasm_f32x4_abs(v.raw)};
}
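// Example of the wrap-around noted above, for int8 lanes:
//   const Full128<int8_t> d;
//   Abs(Set(d, int8_t{-128}));  // lanes stay -128 (+128 is not representable)
//   Abs(Set(d, int8_t{-5}));    // lanes: 5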
// ------------------------------ Shift lanes by constant #bits
// Unsigned
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, kBits)};
}
// Signed
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
}
template <int kBits, size_t N>
HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, kBits)};
}
// 8-bit
template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
const DFromV<decltype(v)> d8;
// Use raw instead of BitCast to support N=1.
const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
return kBits == 1
? (v + v)
: (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
}
template <int kBits, size_t N>
HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
const DFromV<decltype(v)> d8;
// Use raw instead of BitCast to support N=1.
const Vec128<uint8_t, N> shifted{
ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
return shifted & Set(d8, 0xFF >> kBits);
}
template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
const DFromV<decltype(v)> di;
const RebindToUnsigned<decltype(di)> du;
const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
return (shifted ^ shifted_sign) - shifted_sign;
}
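// Sketch of the 8-bit shifts above: WASM has no 8-bit shift, so the vector is
// reinterpreted as 16-bit lanes, shifted, and then masked to clear the bits
// that crossed over from the neighboring byte. E.g. with kBits = 3, unsigned
// left shift keeps (lane << 3) & 0xF8 and unsigned right shift keeps
// (lane >> 3) & 0x1F; the signed right shift restores the sign afterwards via
// (x ^ sign) - sign with sign = 0x80 >> kBits.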
// ------------------------------ RotateRight (ShiftRight, Or)
template <int kBits, typename T, size_t N>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
constexpr size_t kSizeInBits = sizeof(T) * 8;
static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
if (kBits == 0) return v;
return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
}
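// Usage sketch:
//   const Full128<uint32_t> d;
//   RotateRight<8>(Set(d, 0x12345678u));  // each lane: 0x78123456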
// ------------------------------ Shift lanes by same variable #bits
// After https://reviews.llvm.org/D108415, the shift argument became unsigned.
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
// Unsigned
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
const int bits) {
return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
const int bits) {
return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
const int bits) {
return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
const int bits) {
return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
const int bits) {
return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
const int bits) {
return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, bits)};
}
// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
const int bits) {
return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
const int bits) {
return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
const int bits) {
return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
const int bits) {
return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
const int bits) {
return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, bits)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
const int bits) {
return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, bits)};
}
// 8-bit
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
const DFromV<decltype(v)> d8;
// Use raw instead of BitCast to support N=1.
const Vec128<T, N> shifted{
ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
}
template <size_t N>
HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
const int bits) {
const DFromV<decltype(v)> d8;
// Use raw instead of BitCast to support N=1.
const Vec128<uint8_t, N> shifted{
ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
return shifted & Set(d8, 0xFF >> bits);
}
template <size_t N>
HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
const DFromV<decltype(v)> di;
const RebindToUnsigned<decltype(di)> du;
const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
return (shifted ^ shifted_sign) - shifted_sign;
}
HWY_DIAGNOSTICS(pop)  // -Wsign-conversion
// ------------------------------ Minimum
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
// Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)};
return Vec128<uint64_t, N>{wasm_v128_load(min)};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
alignas(16) int64_t min[2];
min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
wasm_i64x2_extract_lane(b.raw, 0));
min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
wasm_i64x2_extract_lane(b.raw, 1));
return Vec128<int64_t, N>{wasm_v128_load(min)};
}
// Float
template <size_t N>
HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) {
return Vec128<float, N>{wasm_f32x4_min(a.raw, b.raw)};
}
// ------------------------------ Maximum
// Unsigned
template <size_t N>
HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
// Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)};
return Vec128<uint64_t, N>{wasm_v128_load(max)};
}
// Signed
template <size_t N>
HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{wasm_i16x8_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
return Vec128<int32_t, N>{wasm_i32x4_max(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
alignas(16) int64_t max[2];
max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
wasm_i64x2_extract_lane(b.raw, 0));
max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
wasm_i64x2_extract_lane(b.raw, 1));
return Vec128<int64_t, N>{wasm_v128_load(max)};
}
// Float
template <size_t N>
HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) {
return Vec128<float, N>{wasm_f32x4_max(a.raw, b.raw)};
}
// ------------------------------ Integer multiplication
// Unsigned
template <size_t N>
HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
}
// Signed
template <size_t N>
HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
}
// Returns the upper 16 bits of a * b in each lane.
template <size_t N>
HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
// TODO(eustas): replace, when implemented in WASM.
const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
const auto l = wasm_i32x4_mul(al, bl);
const auto h = wasm_i32x4_mul(ah, bh);
// TODO(eustas): shift-right + narrow?
return Vec128<uint16_t, N>{
wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
// TODO(eustas): replace, when implemented in WASM.
const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
const auto l = wasm_i32x4_mul(al, bl);
const auto h = wasm_i32x4_mul(ah, bh);
// TODO(eustas): shift-right + narrow?
return Vec128<int16_t, N>{
wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
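// MulHigh sketch: the products are formed in widened 32-bit lanes and the
// shuffle gathers their upper 16-bit halves (the odd 16-bit lanes). E.g.:
//   const Full128<uint16_t> d;
//   MulHigh(Set(d, uint16_t{0xFFFF}), Set(d, uint16_t{0xFFFF}));
//   // lanes: 0xFFFE, the upper half of the full product 0xFFFE0001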
// Multiplies even lanes (0, 2 ..) and returns the double-width result.
template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
// TODO(eustas): replace, when implemented in WASM.
const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
const auto ae = wasm_v128_and(a.raw, kEvenMask);
const auto be = wasm_v128_and(b.raw, kEvenMask);
return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
// TODO(eustas): replace, when implemented in WASM.
const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
const auto ae = wasm_v128_and(a.raw, kEvenMask);
const auto be = wasm_v128_and(b.raw, kEvenMask);
return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
}
// ------------------------------ Negate
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
return Xor(v, SignBit(DFromV<decltype(v)>()));
}
template <size_t N>
HWY_API Vec128<int8_t, N> Neg(const Vec128<int8_t, N> v) {
return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> Neg(const Vec128<int16_t, N> v) {
return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Neg(const Vec128<int32_t, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> Neg(const Vec128<int64_t, N> v) {
return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)};
}
// ------------------------------ Floating-point mul / div
template <size_t N>
HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
return Vec128<float, N>{wasm_f32x4_mul(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
}
// Approximate reciprocal
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
return one / v;
}
// Absolute value of difference.
template <size_t N>
HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Abs(a - b);
}
// ------------------------------ Floating-point multiply-add variants
// Returns mul * x + add
template <size_t N>
HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
const Vec128<float, N> x,
const Vec128<float, N> add) {
// TODO(eustas): replace, when implemented in WASM.
// TODO(eustas): is it wasm_f32x4_qfma?
return mul * x + add;
}
// Returns add - mul * x
template <size_t N>
HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
const Vec128<float, N> x,
const Vec128<float, N> add) {
// TODO(eustas): replace, when implemented in WASM.
return add - mul * x;
}
// Returns mul * x - sub
template <size_t N>
HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
const Vec128<float, N> x,
const Vec128<float, N> sub) {
// TODO(eustas): replace, when implemented in WASM.
// TODO(eustas): is it wasm_f32x4_qfms?
return mul * x - sub;
}
// Returns -mul * x - sub
template <size_t N>
HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
const Vec128<float, N> x,
const Vec128<float, N> sub) {
// TODO(eustas): replace, when implemented in WASM.
return Neg(mul) * x - sub;
}
// ------------------------------ Floating-point square root
// Full precision square root
template <size_t N>
HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)};
}
// Approximate reciprocal square root
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
  // TODO(eustas): find a cheaper way to calculate this.
const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
return one / Sqrt(v);
}
// ------------------------------ Floating-point rounding
// Toward nearest integer, ties to even
template <size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
return Vec128<float, N>{wasm_f32x4_nearest(v.raw)};
}
// Toward zero, aka truncate
template <size_t N>
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
return Vec128<float, N>{wasm_f32x4_trunc(v.raw)};
}
// Toward +infinity, aka ceiling
template <size_t N>
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
return Vec128<float, N>{wasm_f32x4_ceil(v.raw)};
}
// Toward -infinity, aka floor
template <size_t N>
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
return Vec128<float, N>{wasm_f32x4_floor(v.raw)};
}
// ================================================== COMPARE
// Comparisons fill a lane with 1-bits if the condition is true, else 0.
template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
Mask128<TFrom, N> m) {
static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
return Mask128<TTo, N>{m.raw};
}
template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
return (v & bit) == bit;
}
// ------------------------------ Equality
// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Mask128<uint8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Mask128<uint16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Mask128<uint32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
const Vec128<uint64_t, N> b) {
return Mask128<uint64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
}
// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Mask128<int8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
Vec128<int16_t, N> b) {
return Mask128<int16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Mask128<int32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
const Vec128<int64_t, N> b) {
return Mask128<int64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
}
// Float
template <size_t N>
HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Mask128<float, N>{wasm_f32x4_eq(a.raw, b.raw)};
}
// ------------------------------ Inequality
// Unsigned
template <size_t N>
HWY_API Mask128<uint8_t, N> operator!=(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator!=(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator!=(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator!=(const Vec128<uint64_t, N> a,
const Vec128<uint64_t, N> b) {
return Mask128<uint64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
}
// Signed
template <size_t N>
HWY_API Mask128<int8_t, N> operator!=(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator!=(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator!=(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator!=(const Vec128<int64_t, N> a,
const Vec128<int64_t, N> b) {
return Mask128<int64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
}
// Float
template <size_t N>
HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)};
}
// ------------------------------ Strict inequality
template <size_t N>
HWY_API Mask128<int8_t, N> operator>(const Vec128<int8_t, N> a,
const Vec128<int8_t, N> b) {
return Mask128<int8_t, N>{wasm_i8x16_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator>(const Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
return Mask128<int16_t, N>{wasm_i16x8_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator>(const Vec128<int32_t, N> a,
const Vec128<int32_t, N> b) {
return Mask128<int32_t, N>{wasm_i32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
const Vec128<int64_t, N> b) {
return Mask128<int64_t, N>{wasm_i64x2_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint8_t, N> operator>(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b) {
return Mask128<uint8_t, N>{wasm_u8x16_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator>(const Vec128<uint16_t, N> a,
const Vec128<uint16_t, N> b) {
return Mask128<uint16_t, N>{wasm_u16x8_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator>(const Vec128<uint32_t, N> a,
const Vec128<uint32_t, N> b) {
return Mask128<uint32_t, N>{wasm_u32x4_gt(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator>(const Vec128<uint64_t, N> a,
const Vec128<uint64_t, N> b) {
const DFromV<decltype(a)> d;
const Repartition<uint32_t, decltype(d)> d32;
const auto a32 = BitCast(d32, a);
const auto b32 = BitCast(d32, b);
// If the upper halves are not equal, this is the answer.
const auto m_gt = a32 > b32;
// Otherwise, the lower half decides.
const auto m_eq = a32 == b32;
const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
const auto lo_gt = And(m_eq, MaskFromVec(VFromD<decltype(d32)>{lo_in_hi}));
const auto gt = Or(lo_gt, m_gt);
// Copy result in upper 32 bits to lower 32 bits.
return Mask128<uint64_t, N>{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)};
}
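// Worked example for the 64-bit comparison above (lanes written as hi:lo):
//   a = 1:0, b = 0:5 -> upper halves differ, so m_gt alone decides: a > b.
//   a = 7:2, b = 7:9 -> upper halves equal, so the lower-half comparison
//                       (2 > 9 is false) decides: a > b is false.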
template <size_t N>
HWY_API Mask128<float, N> operator>(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Mask128<float, N>{wasm_f32x4_gt(a.raw, b.raw)};
}
template <typename T, size_t N>
HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
return operator>(b, a);
}
// ------------------------------ Weak inequality
// Float <= >=
template <size_t N>
HWY_API Mask128<float, N> operator<=(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Mask128<float, N>{wasm_f32x4_le(a.raw, b.raw)};
}
template <size_t N>
HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
}
// ------------------------------ FirstN (Iota, Lt)
template <typename T, size_t N>
HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper.
return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
}
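// Usage sketch (4 x int32 lanes):
//   const Full128<int32_t> d;
//   const auto m = FirstN(d, 2);  // lanes: true, true, false, false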
// ================================================== LOGICAL
// ------------------------------ Not
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
return Vec128<T, N>{wasm_v128_not(v.raw)};
}
// ------------------------------ And
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
}
// ------------------------------ AndNot
// Returns ~not_mask & mask.
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
}
// ------------------------------ Or
template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
}
// ------------------------------ Xor
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
}
// ------------------------------ OrAnd
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
return Or(o, And(a1, a2));
}
// ------------------------------ IfVecThenElse
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
Vec128<T, N> no) {
return IfThenElse(MaskFromVec(mask), yes, no);
}
// ------------------------------ Operator overloads (internal-only if float)
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
return And(a, b);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
return Or(a, b);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
return Xor(a, b);
}
// ------------------------------ CopySign
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
const Vec128<T, N> sign) {
static_assert(IsFloat<T>(), "Only makes sense for floating-point");
const auto msb = SignBit(DFromV<decltype(magn)>());
return Or(AndNot(msb, magn), And(msb, sign));
}
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
const Vec128<T, N> sign) {
static_assert(IsFloat<T>(), "Only makes sense for floating-point");
return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
}
// ------------------------------ BroadcastSignBit (compare)
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
return ShiftRight<sizeof(T) * 8 - 1>(v);
}
template <size_t N>
HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
const DFromV<decltype(v)> d;
return VecFromMask(d, v < Zero(d));
}
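// Sketch: every bit of each lane becomes a copy of that lane's sign bit.
//   const Full128<int32_t> d;
//   BroadcastSignBit(Set(d, -5));  // lanes: 0xFFFFFFFF (all bits set)
//   BroadcastSignBit(Set(d, 5));   // lanes: 0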
// ------------------------------ Mask
// Mask and Vec are the same (true = FF..FF).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
return Mask128<T, N>{v.raw};
}
template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */, Mask128<T, N> v) {
return Vec128<T, N>{v.raw};
}
// mask ? yes : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
Vec128<T, N> no) {
return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
}
// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}
// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
Vec128<T, N> no) {
static_assert(IsSigned<T>(), "Only works for signed/float");
const DFromV<decltype(v)> d;
const RebindToSigned<decltype(d)> di;
v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
return IfThenElse(MaskFromVec(v), yes, no);
}
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
const DFromV<decltype(v)> d;
const auto zero = Zero(d);
return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
}
// ------------------------------ Mask logical
template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
// The x86 multiply-by-Pow2() trick will not work because WASM saturates
// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
// scalar count operand, per-lane shifts would require an extract_lane for each
// lane and hope that the rebuilding shuffle maps to a native instruction.
// Using non-vector shifts would incur a store-load forwarding
// stall when loading the result vector. We instead test bits of the shift
// count to "predicate" a shift of the entire vector by a constant.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
const DFromV<decltype(v)> d;
Mask128<T, N> mask;
// Need a signed type for BroadcastSignBit.
auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
// Move the highest valid bit of the shift count into the sign bit.
test = ShiftLeft<12>(test);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftLeft<8>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftLeft<4>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftLeft<2>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
return IfThenElse(mask, ShiftLeft<1>(v), v);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
const DFromV<decltype(v)> d;
Mask128<T, N> mask;
// Need a signed type for BroadcastSignBit.
auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
// Move the highest valid bit of the shift count into the sign bit.
test = ShiftLeft<27>(test);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftLeft<16>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftLeft<8>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftLeft<4>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftLeft<2>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
return IfThenElse(mask, ShiftLeft<1>(v), v);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
const DFromV<decltype(v)> d;
alignas(16) T lanes[2];
alignas(16) T bits_lanes[2];
Store(v, d, lanes);
Store(bits, d, bits_lanes);
lanes[0] <<= bits_lanes[0];
lanes[1] <<= bits_lanes[1];
return Load(d, lanes);
}
// ------------------------------ Shr (BroadcastSignBit, IfThenElse)
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
const DFromV<decltype(v)> d;
Mask128<T, N> mask;
// Need a signed type for BroadcastSignBit.
auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
// Move the highest valid bit of the shift count into the sign bit.
test = ShiftLeft<12>(test);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftRight<8>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftRight<4>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftRight<2>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
return IfThenElse(mask, ShiftRight<1>(v), v);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
const DFromV<decltype(v)> d;
Mask128<T, N> mask;
// Need a signed type for BroadcastSignBit.
auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
// Move the highest valid bit of the shift count into the sign bit.
test = ShiftLeft<27>(test);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftRight<16>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftRight<8>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftRight<4>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
test = ShiftLeft<1>(test); // next bit (descending order)
v = IfThenElse(mask, ShiftRight<2>(v), v);
mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
return IfThenElse(mask, ShiftRight<1>(v), v);
}
// ================================================== MEMORY
// ------------------------------ Load
template <typename T>
HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
return Vec128<T>{wasm_v128_load(aligned)};
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
const T* HWY_RESTRICT aligned) {
return IfThenElseZero(m, Load(d, aligned));
}
// Partial load.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
Vec128<T, N> v;
CopyBytes<sizeof(T) * N>(p, &v);
return v;
}
// LoadU == Load.
template <typename T, size_t N>
HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
return Load(d, p);
}
// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
return Load(d, p);
}
// ------------------------------ Store
template <typename T>
HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
wasm_v128_store(aligned, v.raw);
}
// Partial store.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
CopyBytes<sizeof(T) * N>(&v, p);
}
HWY_API void Store(const Vec128<float, 1> v, Simd<float, 1, 0> /* tag */,
float* HWY_RESTRICT p) {
*p = wasm_f32x4_extract_lane(v.raw, 0);
}
// StoreU == Store.
template <typename T, size_t N>
HWY_API void StoreU(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
Store(v, d, p);
}
// ------------------------------ Non-temporal stores
// Same as aligned stores on non-x86.
template <typename T, size_t N>
HWY_API void Stream(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
T* HWY_RESTRICT aligned) {
wasm_v128_store(aligned, v.raw);
}
// ------------------------------ Scatter (Store)
template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
T* HWY_RESTRICT base,
const Vec128<Offset, N> offset) {
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
alignas(16) T lanes[N];
Store(v, d, lanes);
alignas(16) Offset offset_lanes[N];
Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
for (size_t i = 0; i < N; ++i) {
CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
}
}
template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
const Vec128<Index, N> index) {
static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
alignas(16) T lanes[N];
Store(v, d, lanes);
alignas(16) Index index_lanes[N];
Store(index, Rebind<Index, decltype(d)>(), index_lanes);
for (size_t i = 0; i < N; ++i) {
base[index_lanes[i]] = lanes[i];
}
}
// ------------------------------ Gather (Load/Store)
template <typename T, size_t N, typename Offset>
HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
const T* HWY_RESTRICT base,
const Vec128<Offset, N> offset) {
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
alignas(16) Offset offset_lanes[N];
Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
alignas(16) T lanes[N];
const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
for (size_t i = 0; i < N; ++i) {
CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
}
return Load(d, lanes);
}
template <typename T, size_t N, typename Index>
HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
const T* HWY_RESTRICT base,
const Vec128<Index, N> index) {
static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
alignas(16) Index index_lanes[N];
Store(index, Rebind<Index, decltype(d)>(), index_lanes);
alignas(16) T lanes[N];
for (size_t i = 0; i < N; ++i) {
lanes[i] = base[index_lanes[i]];
}
return Load(d, lanes);
}
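// Usage sketch with a hypothetical lookup table (indices are int32 here):
//   const Full128<int32_t> d;
//   alignas(16) const int32_t table[8] = {0, 10, 20, 30, 40, 50, 60, 70};
//   const auto idx = ShiftLeft<1>(Iota(d, 0));   // lanes: 0, 2, 4, 6
//   const auto v = GatherIndex(d, table, idx);   // lanes: 0, 20, 40, 60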
// ================================================== SWIZZLE
// ------------------------------ Extract lane
// Returns the value of lane 0 (the single value for a 1-lane vector/part).
template <size_t N>
HWY_API uint8_t GetLane(const Vec128<uint8_t, N> v) {
return static_cast<uint8_t>(wasm_i8x16_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API int8_t GetLane(const Vec128<int8_t, N> v) {
return static_cast<int8_t>(wasm_i8x16_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API uint16_t GetLane(const Vec128<uint16_t, N> v) {
return static_cast<uint16_t>(wasm_i16x8_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API int16_t GetLane(const Vec128<int16_t, N> v) {
return static_cast<int16_t>(wasm_i16x8_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API uint32_t GetLane(const Vec128<uint32_t, N> v) {
return static_cast<uint32_t>(wasm_i32x4_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API int32_t GetLane(const Vec128<int32_t, N> v) {
return static_cast<int32_t>(wasm_i32x4_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
return static_cast<uint64_t>(wasm_i64x2_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
return static_cast<int64_t>(wasm_i64x2_extract_lane(v.raw, 0));
}
template <size_t N>
HWY_API float GetLane(const Vec128<float, N> v) {
return wasm_f32x4_extract_lane(v.raw, 0);
}
// ------------------------------ LowerHalf
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
Vec128<T, N> v) {
return Vec128<T, N / 2>{v.raw};
}
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
return LowerHalf(Simd<T, N / 2, 0>(), v);
}
// ------------------------------ ShiftLeftBytes
// 0x01..0F, kBytes = 1 => 0x02..0F00
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
const __i8x16 zero = wasm_i8x16_splat(0);
switch (kBytes) {
case 0:
return v;
case 1:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14)};
case 2:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13)};
case 3:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};
case 4:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};
case 5:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};
case 6:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
case 7:
return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
case 8:
return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
case 9:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 16, 16, 16, 0, 1, 2, 3, 4, 5,
6)};
case 10:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 0, 1, 2, 3, 4,
5)};
case 11:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 0, 1, 2, 3,
4)};
case 12:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 0, 1,
2, 3)};
case 13:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 0,
1, 2)};
case 14:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
0, 1)};
case 15:
return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 0)};
}
return Vec128<T, N>{zero};
}
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(), v);
}
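// Usage sketch (illustrative comment, not library code; assumes the Iota op
// defined elsewhere in this header): byte shifts move data toward higher
// byte indices and shift in zeros, e.g.
//   const Full128<uint8_t> d;
//   const auto v = Iota(d, 1);               // {1, 2, ..., 16}
//   const auto r = ShiftLeftBytes<1>(d, v);  // {0, 1, 2, ..., 15}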
// ------------------------------ ShiftLeftLanes
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
const Repartition<uint8_t, decltype(d)> d8;
return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}
// ------------------------------ ShiftRightBytes
namespace detail {
// Helper function; the caller zeroes invalid (upper) lanes of partial
// vectors first, so that only zeros are shifted in.
template <int kBytes, typename T, size_t N>
HWY_API __i8x16 ShrBytes(const Vec128<T, N> v) {
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
const __i8x16 zero = wasm_i8x16_splat(0);
switch (kBytes) {
case 0:
return v.raw;
case 1:
return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16);
case 2:
return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 16);
case 3:
return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 16, 16);
case 4:
return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 16, 16, 16);
case 5:
return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 16, 16, 16, 16);
case 6:
return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 16, 16, 16, 16, 16);
case 7:
return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 16, 16, 16, 16, 16, 16);
case 8:
return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
16, 16, 16, 16, 16, 16, 16);
case 9:
return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
16, 16, 16, 16, 16, 16, 16);
case 10:
return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16);
case 11:
return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16);
case 12:
return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16);
case 13:
return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16);
case 14:
return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16);
case 15:
return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16);
case 16:
return zero;
}
}
} // namespace detail
// 0x01..0F, kBytes = 1 => 0x0001..0E
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
// For partial vectors, clear upper lanes so we shift in zeros.
if (N != 16 / sizeof(T)) {
const Vec128<T> vfull{v.raw};
v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
}
return Vec128<T, N>{detail::ShrBytes<kBytes>(v)};
}
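// Usage sketch (illustrative comment, not library code): for partial vectors
// the unused upper bytes are zeroed first, so only zeros are shifted in:
//   const Simd<uint8_t, 4, 0> d;              // 4-byte vector
//   const auto v = Iota(d, 1);                // {1, 2, 3, 4}
//   const auto r = ShiftRightBytes<1>(d, v);  // {2, 3, 4, 0}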
// ------------------------------ ShiftRightLanes
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
const Repartition<uint8_t, decltype(d)> d8;
return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
}
// ------------------------------ UpperHalf (ShiftRightBytes)
// Full input: copy hi into lo (smaller instruction encoding than shifts).
template <typename T>
HWY_API Vec64<T> UpperHalf(Full64<T> /* tag */, const Vec128<T> v) {
return Vec64<T>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
}
HWY_API Vec64<float> UpperHalf(Full64<float> /* tag */, const Vec128<float> v) {
return Vec64<float>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
}
// Partial
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
Vec128<T, N> v) {
const DFromV<decltype(v)> d;
const RebindToUnsigned<decltype(d)> du;
const auto vu = BitCast(du, v);
const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
return Vec128<T, (N + 1) / 2>{upper.raw};
}
// ------------------------------ CombineShiftRightBytes
template <int kBytes, typename T, class V = Vec128<T>>
HWY_API V CombineShiftRightBytes(Full128<T> /* tag */, V hi, V lo) {
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
switch (kBytes) {
case 0:
return lo;
case 1:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16)};
case 2:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17)};
case 3:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18)};
case 4:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19)};
case 5:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20)};
case 6:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21)};
case 7:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22)};
case 8:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23)};
case 9:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24)};
case 10:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25)};
case 11:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26)};
case 12:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
19, 20, 21, 22, 23, 24, 25, 26, 27)};
case 13:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28)};
case 14:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26, 27, 28, 29)};
case 15:
return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28, 29, 30)};
}
return hi;
}
template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
class V = Vec128<T, N>>
HWY_API V CombineShiftRightBytes(Simd<T, N, 0> d, V hi, V lo) {
constexpr size_t kSize = N * sizeof(T);
static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
const Repartition<uint8_t, decltype(d)> d8;
const Full128<uint8_t> d_full8;
using V8 = VFromD<decltype(d_full8)>;
const V8 hi8{BitCast(d8, hi).raw};
// Move into most-significant bytes
const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
return V{BitCast(Full128<T>(), r).raw};
}
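// Worked example of the partial-vector path above (explanatory comment):
// for kSize == 8 and kBytes == 2, lo is first moved into the upper 8 bytes
// of a full vector (ShiftLeftBytes<8>), then the full-vector
// CombineShiftRightBytes<10> produces {lo[2..7], hi[0..1]} in its lower
// 8 bytes, which is exactly the 64-bit result.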
// ------------------------------ Broadcast/splat any lane
template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
kLane, kLane, kLane, kLane, kLane)};
}
template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<T, N>{
wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
}
template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
static_assert(0 <= kLane && kLane < N, "Invalid lane");
return Vec128<T, N>{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)};
}
// ------------------------------ TableLookupBytes
// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
// lane indices in [0, 16).
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
const Vec128<TI, NI> from) {
// Not yet available in all engines, see
// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
// V8 implementation of this had a bug, fixed on 2021-04-03:
// https://chromium-review.googlesource.com/c/v8/v8/+/2822951
#if 0
return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
#else
alignas(16) uint8_t control[16];
alignas(16) uint8_t input[16];
alignas(16) uint8_t output[16];
wasm_v128_store(control, from.raw);
wasm_v128_store(input, bytes.raw);
for (size_t i = 0; i < 16; ++i) {
output[i] = control[i] < 16 ? input[control[i]] : 0;
}
return Vec128<TI, NI>{wasm_v128_load(output)};
#endif
}
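// Usage sketch (illustrative comment, not library code; assumes Iota/Load
// from elsewhere in this header): reversing bytes via an index vector:
//   const Full128<uint8_t> d;
//   alignas(16) static constexpr uint8_t kRev[16] = {
//       15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
//   const auto bytes = Iota(d, 0);                            // {0, ..., 15}
//   const auto rev = TableLookupBytes(bytes, Load(d, kRev));  // {15, ..., 0}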
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> bytes,
const Vec128<TI, NI> from) {
const Simd<TI, NI, 0> d;
// Mask size must match vector type, so cast everything to this type.
Repartition<int8_t, decltype(d)> di8;
Repartition<int8_t, Simd<T, N, 0>> d_bytes8;
const auto msb = BitCast(di8, from) < Zero(di8);
const auto lookup =
TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
return BitCast(d, IfThenZeroElse(msb, lookup));
}
// ------------------------------ Hard-coded shuffles
// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
// Swap 32-bit halves in 64-bit halves.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
static_assert(N == 2 || N == 4, "Does not make sense for N=1");
return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
}
// Swap 64-bit halves
template <typename T>
HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
static_assert(sizeof(T) == 8, "Only for 64-bit lanes");
return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}
template <typename T>
HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
}
// Rotate right 32 bits
template <typename T>
HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
}
// Rotate left 32 bits
template <typename T>
HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
}
// Reverse
template <typename T>
HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
}
// ------------------------------ TableLookupLanes
// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T, size_t N>
struct Indices128 {
__v128_u raw;
};
template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
const Rebind<TI, decltype(d)> di;
HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
#endif
const Repartition<uint8_t, decltype(d)> d8;
using V8 = VFromD<decltype(d8)>;
const Repartition<uint16_t, decltype(d)> d16;
// Broadcast each lane index to all bytes of T and shift to bytes
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
if (sizeof(T) == 4) {
alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
const V8 lane_indices =
TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
const V8 byte_indices =
BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
0, 1, 2, 3, 0, 1, 2, 3};
return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
} else {
alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
const V8 lane_indices =
TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
const V8 byte_indices =
BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices)));
alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
0, 1, 2, 3, 4, 5, 6, 7};
return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
}
}
template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
const Rebind<TI, decltype(d)> di;
return IndicesFromVec(d, LoadU(di, idx));
}
template <typename T, size_t N>
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
using TI = MakeSigned<T>;
const DFromV<decltype(v)> d;
const Rebind<TI, decltype(d)> di;
return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128<TI, N>{idx.raw}));
}
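// Worked example of the index encoding above (explanatory comment): for
// 32-bit lanes, IndicesFromVec turns lane index 3 into byte indices
// {12, 13, 14, 15}: the index is broadcast to all four bytes of its lane,
// multiplied by sizeof(T) via ShiftLeft<2>, and per-byte offsets 0..3 are
// added. A hypothetical permutation then looks like:
//   const Full128<uint32_t> d;
//   constexpr int32_t kIdx[4] = {3, 2, 1, 0};
//   const auto r = TableLookupLanes(Iota(d, 0), SetTableIndices(d, kIdx));
//   // r = {3, 2, 1, 0}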
// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
// Single lane: no change
template <typename T>
HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
return v;
}
// Two lanes: shuffle
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, 2> Reverse(Simd<T, 2, 0> /* tag */, const Vec128<T, 2> v) {
return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
return Shuffle01(v);
}
// Four lanes: shuffle
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
return Shuffle0123(v);
}
// 16-bit
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
}
// ------------------------------ Reverse2
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
return BitCast(d, RotateRight<16>(BitCast(du32, v)));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
return Shuffle2301(v);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
return Shuffle01(v);
}
// ------------------------------ Reverse4
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
return BitCast(d, Vec128<uint16_t, N>{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2,
1, 0, 7, 6, 5, 4)});
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
return Shuffle0123(v);
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N>) {
  HWY_ASSERT(0);  // don't have 4 u64 lanes in 128 bits
}
// ------------------------------ Reverse8
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
return Reverse(d, v);
}
template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0>, const Vec128<T, N>) {
HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
}
// ------------------------------ InterleaveLower
template <size_t N>
HWY_API Vec128<uint8_t, N> InterleaveLower(Vec128<uint8_t, N> a,
Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_i8x16_shuffle(
a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> InterleaveLower(Vec128<uint16_t, N> a,
Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{
wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> InterleaveLower(Vec128<uint32_t, N> a,
Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> InterleaveLower(Vec128<uint64_t, N> a,
Vec128<uint64_t, N> b) {
return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
}
template <size_t N>
HWY_API Vec128<int8_t, N> InterleaveLower(Vec128<int8_t, N> a,
Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_shuffle(
a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> InterleaveLower(Vec128<int16_t, N> a,
Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{
wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> InterleaveLower(Vec128<int32_t, N> a,
Vec128<int32_t, N> b) {
return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> InterleaveLower(Vec128<int64_t, N> a,
Vec128<int64_t, N> b) {
return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
}
template <size_t N>
HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
Vec128<float, N> b) {
return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
}
// Additional overload for the optional tag.
template <class V>
HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
return InterleaveLower(a, b);
}
// ------------------------------ InterleaveUpper (UpperHalf)
// All functions inside detail lack the required D parameter.
namespace detail {
template <size_t N>
HWY_API Vec128<uint8_t, N> InterleaveUpper(Vec128<uint8_t, N> a,
Vec128<uint8_t, N> b) {
return Vec128<uint8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
26, 11, 27, 12, 28, 13, 29, 14,
30, 15, 31)};
}
template <size_t N>
HWY_API Vec128<uint16_t, N> InterleaveUpper(Vec128<uint16_t, N> a,
Vec128<uint16_t, N> b) {
return Vec128<uint16_t, N>{
wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> InterleaveUpper(Vec128<uint32_t, N> a,
Vec128<uint32_t, N> b) {
return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> InterleaveUpper(Vec128<uint64_t, N> a,
Vec128<uint64_t, N> b) {
return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
}
template <size_t N>
HWY_API Vec128<int8_t, N> InterleaveUpper(Vec128<int8_t, N> a,
Vec128<int8_t, N> b) {
return Vec128<int8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
26, 11, 27, 12, 28, 13, 29, 14,
30, 15, 31)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> InterleaveUpper(Vec128<int16_t, N> a,
Vec128<int16_t, N> b) {
return Vec128<int16_t, N>{
wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> InterleaveUpper(Vec128<int32_t, N> a,
Vec128<int32_t, N> b) {
return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> InterleaveUpper(Vec128<int64_t, N> a,
Vec128<int64_t, N> b) {
return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
}
template <size_t N>
HWY_API Vec128<float, N> InterleaveUpper(Vec128<float, N> a,
Vec128<float, N> b) {
return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
}
} // namespace detail
// Full
template <typename T, class V = Vec128<T>>
HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
return detail::InterleaveUpper(a, b);
}
// Partial
template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
const Half<decltype(d)> d2;
return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
}
// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
// Same as Interleave*, except that the return lanes are double-width integers;
// this is necessary because the single-lane scalar cannot return two values.
template <class V, class DW = RepartitionToWide<DFromV<V>>>
HWY_API VFromD<DW> ZipLower(V a, V b) {
return BitCast(DW(), InterleaveLower(a, b));
}
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
return BitCast(dw, InterleaveLower(D(), a, b));
}
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
return BitCast(dw, InterleaveUpper(D(), a, b));
}
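// Usage sketch (illustrative comment, not library code): zipping two u8
// vectors yields u16 lanes whose lower byte comes from a and upper byte
// from b (lanes are little-endian):
//   const Full128<uint8_t> d8;
//   const RepartitionToWide<decltype(d8)> d16;
//   const auto z = ZipLower(d16, Set(d8, 1), Set(d8, 2));  // each u16 == 0x0201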
// ================================================== COMBINE
// ------------------------------ Combine (InterleaveLower)
// N = N/2 + N/2 (upper half undefined)
template <typename T, size_t N>
HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
Vec128<T, N / 2> lo_half) {
const Half<decltype(d)> d2;
const RebindToUnsigned<decltype(d2)> du2;
// Treat half-width input as one lane, and expand to two lanes.
using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
const VU lo{BitCast(du2, lo_half).raw};
const VU hi{BitCast(du2, hi_half).raw};
return BitCast(d, InterleaveLower(lo, hi));
}
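// Worked example of the trick above (explanatory comment): each half-width
// input is viewed as a single wide lane, so InterleaveLower(lo, hi) places
// lo_half in the lower half and hi_half in the upper half of the result:
//   const Full128<uint32_t> d;
//   const Half<decltype(d)> dh;
//   const auto v = Combine(d, Set(dh, 2u), Set(dh, 1u));  // {1, 1, 2, 2}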
// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
template <typename T, size_t N>
HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
}
// ------------------------------ ConcatLowerLower
// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
template <typename T>
HWY_API Vec128<T> ConcatLowerLower(Full128<T> /* tag */, const Vec128<T> hi,
const Vec128<T> lo) {
return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
const Vec128<T, N> lo) {
const Half<decltype(d)> d2;
return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
}
// ------------------------------ ConcatUpperUpper
template <typename T>
HWY_API Vec128<T> ConcatUpperUpper(Full128<T> /* tag */, const Vec128<T> hi,
const Vec128<T> lo) {
return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
const Vec128<T, N> lo) {
const Half<decltype(d)> d2;
return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
}
// ------------------------------ ConcatLowerUpper
template <typename T>
HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
const Vec128<T> lo) {
return CombineShiftRightBytes<8>(d, hi, lo);
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
const Vec128<T, N> lo) {
const Half<decltype(d)> d2;
return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
}
// ------------------------------ ConcatUpperLower
template <typename T, size_t N>
HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
const Vec128<T, N> lo) {
return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
}
// ------------------------------ ConcatOdd
// 32-bit full
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
}
// 32-bit partial
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> /* tag */, Vec128<T, 2> hi,
Vec128<T, 2> lo) {
return InterleaveUpper(Simd<T, 2, 0>(), lo, hi);
}
// 64-bit full - no partial because we need at least two inputs to have
// even/odd.
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
return InterleaveUpper(Full128<T>(), lo, hi);
}
// ------------------------------ ConcatEven (InterleaveLower)
// 32-bit full
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
}
// 32-bit partial
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> /* tag */, Vec128<T, 2> hi,
Vec128<T, 2> lo) {
return InterleaveLower(Simd<T, 2, 0>(), lo, hi);
}
// 64-bit full - no partial because we need at least two inputs to have
// even/odd.
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
return InterleaveLower(Full128<T>(), lo, hi);
}
// ------------------------------ DupEven (InterleaveLower)
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
return InterleaveLower(DFromV<decltype(v)>(), v, v);
}
// ------------------------------ DupOdd (InterleaveUpper)
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)};
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
return InterleaveUpper(DFromV<decltype(v)>(), v, v);
}
// ------------------------------ OddEven
namespace detail {
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
const Vec128<T, N> b) {
const DFromV<decltype(a)> d;
const Repartition<uint8_t, decltype(d)> d8;
alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
const Vec128<T, N> b) {
return Vec128<T, N>{
wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
const Vec128<T, N> b) {
return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
const Vec128<T, N> b) {
return Vec128<T, N>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
}
} // namespace detail
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
}
template <size_t N>
HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
const Vec128<float, N> b) {
return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
}
// ------------------------------ OddEvenBlocks
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
return even;
}
// ------------------------------ SwapAdjacentBlocks
template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
return v;
}
// ------------------------------ ReverseBlocks
// Single block: no change
template <typename T>
HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
return v;
}
// ================================================== CONVERT
// ------------------------------ Promotions (part w/ narrow lanes -> full)
// Unsigned: zero-extend.
template <size_t N>
HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<uint32_t, N>{
wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
}
template <size_t N>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<int32_t, N>{
wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
}
template <size_t N>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
const Vec128<uint16_t, N> v) {
return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
}
template <size_t N>
HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
const Vec128<uint32_t, N> v) {
return Vec128<uint64_t, N>{wasm_u64x2_extend_low_u32x4(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<uint16_t, N> v) {
return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
}
// Signed: replicate sign bit.
template <size_t N>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
const Vec128<int8_t, N> v) {
return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<int8_t, N> v) {
return Vec128<int32_t, N>{
wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
}
template <size_t N>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<int16_t, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
}
template <size_t N>
HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<int64_t, N>{wasm_i64x2_extend_low_i32x4(v.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<double, N>{wasm_f64x2_convert_low_i32x4(v.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
const Vec128<float16_t, N> v) {
const RebindToSigned<decltype(df32)> di32;
const RebindToUnsigned<decltype(df32)> du32;
// Expand to u32 so we can shift.
const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
const auto sign = ShiftRight<15>(bits16);
const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
const auto mantissa = bits16 & Set(du32, 0x3FF);
const auto subnormal =
BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
Set(df32, 1.0f / 16384 / 1024));
const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
return BitCast(df32, ShiftLeft<31>(sign) | bits32);
}
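// Worked bit-level example of the f16 -> f32 expansion above (explanatory
// comment): 0x3C00 (f16 1.0) has sign 0, biased_exp 15 and mantissa 0; the
// normal path rebiases the exponent (15 -> 127), widens the mantissa from 10
// to 23 bits and reassembles 0x3F800000, i.e. f32 1.0. Subnormal inputs
// (biased_exp == 0) are instead converted as mantissa * 2^-24.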
template <size_t N>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
const Vec128<bfloat16_t, N> v) {
const Rebind<uint16_t, decltype(df32)> du16;
const RebindToSigned<decltype(df32)> di32;
return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}
// ------------------------------ Demotions (full -> part w/ narrow lanes)
template <size_t N>
HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<uint16_t, N>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
}
template <size_t N>
HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<int16_t, N>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
}
template <size_t N>
HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
return Vec128<uint8_t, N>{
wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
}
template <size_t N>
HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
const Vec128<int16_t, N> v) {
return Vec128<uint8_t, N>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
}
template <size_t N>
HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
}
template <size_t N>
HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
const Vec128<int16_t, N> v) {
return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* di */,
const Vec128<double, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
}
template <size_t N>
HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
const Vec128<float, N> v) {
const RebindToUnsigned<decltype(df16)> du16;
const Rebind<uint32_t, decltype(du16)> du;
const RebindToSigned<decltype(du)> di;
const auto bits32 = BitCast(du, v);
const auto sign = ShiftRight<31>(bits32);
const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
const auto k15 = Set(di, 15);
const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
const auto is_tiny = exp < Set(di, -24);
const auto is_subnormal = exp < Set(di, -14);
const auto biased_exp16 =
BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
(mantissa32 >> (Set(du, 13) + sub_exp));
const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
ShiftRight<13>(mantissa32)); // <1024
const auto sign16 = ShiftLeft<15>(sign);
const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
return Vec128<float16_t, N>{DemoteTo(du16, bits16).raw};
}
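// Worked bit-level example of the f32 -> f16 demotion above (explanatory
// comment): f32 1.0 = 0x3F800000 has biased_exp32 127, so exp == 0 and
// biased_exp16 == 15; the 23-bit mantissa is truncated to 10 bits, giving
// 0x3C00 (f16 1.0). Inputs with exp < -24 are flushed to zero (is_tiny),
// and exponents in [-24, -15] take the subnormal path, which re-inserts the
// implicit leading 1 into the mantissa.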
template <size_t N>
HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
const Vec128<float, N> v) {
const Rebind<int32_t, decltype(dbf16)> di32;
const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
const Rebind<uint16_t, decltype(dbf16)> du16;
const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
return BitCast(dbf16, DemoteTo(du16, bits_in_32));
}
template <size_t N>
HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
const RebindToUnsigned<decltype(dbf16)> du16;
const Repartition<uint32_t, decltype(dbf16)> du32;
const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
// For already range-limited input [0, 255].
template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
return Vec128<uint8_t, N>{
wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
}
// ------------------------------ Convert i32 <=> f32 (Round)
template <size_t N>
HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<float, N>{wasm_f32x4_convert_i32x4(v.raw)};
}
// Truncates (rounds toward zero).
template <size_t N>
HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<float, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
return ConvertTo(Simd<int32_t, N, 0>(), Round(v));
}
// ================================================== MISC
// ------------------------------ SumsOf8 (ShiftRight, Add)
template <size_t N>
HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
const DFromV<decltype(v)> du8;
const RepartitionToWide<decltype(du8)> du16;
const RepartitionToWide<decltype(du16)> du32;
const RepartitionToWide<decltype(du32)> du64;
using VU16 = VFromD<decltype(du16)>;
const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF));
const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
const VU16 szz_FE_zz_BA_zz_76_zz_32 =
BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF));
}
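// Worked example of the reduction above (explanatory comment): adjacent
// bytes are first summed into u16 lanes, then pairs of u16 and pairs of u32
// are folded together via logical right shifts, so each u64 lane ends up
// holding the sum of its 8 bytes in its low 16 bits. For bytes all equal to
// 1, the result is {8, 8}.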
// ------------------------------ LoadMaskBits (TestBit)
namespace detail {
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
const RebindToUnsigned<decltype(d)> du;
  // Easier than Set(), which would require a lane type wider than 8 bits;
  // that would not compile for T=uint8_t, N=1.
const Vec128<T, N> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
// Replicate bytes 8x such that each byte contains the bit that governs it.
alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1};
const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
1, 2, 4, 8, 16, 32, 64, 128};
return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
const RebindToUnsigned<decltype(d)> du;
alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
return RebindMask(
d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit)));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
const RebindToUnsigned<decltype(d)> du;
alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
return RebindMask(
d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit)));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
const RebindToUnsigned<decltype(d)> du;
alignas(16) constexpr uint64_t kBit[8] = {1, 2};
return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
}
} // namespace detail
// `bits` points to at least 8 readable bytes, not all of which need be valid.
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
const uint8_t* HWY_RESTRICT bits) {
uint64_t mask_bits = 0;
CopyBytes<(N + 7) / 8>(bits, &mask_bits);
return detail::LoadMaskBits(d, mask_bits);
}
// ------------------------------ Mask
namespace detail {
// Full
template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
const Mask128<T> mask) {
alignas(16) uint64_t lanes[2];
wasm_v128_store(lanes, mask.raw);
constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
const uint64_t lo = ((lanes[0] * kMagic) >> 56);
const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
return (hi + lo);
}
// 64-bit
template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
const Mask128<T, 8> mask) {
constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
return (static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)) *
kMagic) >>
56;
}
// 32-bit or less: need masking
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
const Mask128<T, N> mask) {
uint64_t bytes = static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
// Clear potentially undefined bytes.
bytes &= (1ULL << (N * 8)) - 1;
constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
return (bytes * kMagic) >> 56;
}
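// Why the multiply works (explanatory comment): mask bytes are 0x00 or 0xFF,
// and 0xFF * kMagic == 0x0102040810204080, so a set mask byte i contributes
// exactly bit i to the top byte of the 64-bit product; the >> 56 (or >> 48
// plus 0xFF00 masking for the upper half) then extracts the 8 mask bits.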
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
const Mask128<T, N> mask) {
// Remove useless lower half of each u16 while preserving the sign bit.
const __i16x8 zero = wasm_i16x8_splat(0);
const Mask128<uint8_t, N> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
return BitsFromMask(hwy::SizeTag<1>(), mask8);
}
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
const Mask128<T, N> mask) {
const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
alignas(16) uint32_t lanes[4];
wasm_v128_store(lanes, sliced_mask);
return lanes[0] | lanes[1] | lanes[2] | lanes[3];
}
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
const Mask128<T, N> mask) {
const __i64x2 mask_i = static_cast<__i64x2>(mask.raw);
const __i64x2 slice = wasm_i64x2_make(1, 2);
const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
alignas(16) uint64_t lanes[2];
wasm_v128_store(lanes, sliced_mask);
return lanes[0] | lanes[1];
}
// Returns only the lowest N bits of the BitsFromMask result.
template <typename T, size_t N>
constexpr uint64_t OnlyActive(uint64_t bits) {
return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
}
// Returns 0xFF for bytes with index >= N, otherwise 0.
template <size_t N>
constexpr __i8x16 BytesAbove() {
return /**/
(N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
: (N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
: (N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
: (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
: (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
: (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
: (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
: (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
: (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
: (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1)
: (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1)
: (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1)
: (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
-1, -1, -1)
: (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
-1, -1, -1)
: (N == 11)
? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
: (N == 13)
? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
: wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
}
template <typename T, size_t N>
HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) {
return PopCount(BitsFromMask(tag, m));
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) {
return PopCount(BitsFromMask(tag, m));
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
alignas(16) uint64_t lanes[2];
wasm_v128_store(lanes, shifted_bits);
return PopCount(lanes[0] | lanes[1]);
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
alignas(16) int64_t lanes[2];
wasm_v128_store(lanes, m.raw);
return static_cast<size_t>(-(lanes[0] + lanes[1]));
}
} // namespace detail
// `bits` points to at least 8 writable bytes.
template <typename T, size_t N>
HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask, uint8_t* bits) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
const size_t kNumBytes = (N + 7) / 8;
CopyBytes<kNumBytes>(&mask_bits, bits);
return kNumBytes;
}
template <typename T, size_t N>
HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), m);
}
// Partial vector
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API size_t CountTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
// Ensure all undefined bytes are 0.
const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
return CountTrue(d, Mask128<T>{AndNot(mask, m).raw});
}
// Full vector
template <typename T>
HWY_API bool AllFalse(const Full128<T> d, const Mask128<T> m) {
#if 0
// Casting followed by wasm_i8x16_any_true results in wasm error:
// i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(d, m));
return !wasm_i8x16_any_true(v8.raw);
#else
(void)d;
return (wasm_i64x2_extract_lane(m.raw, 0) |
wasm_i64x2_extract_lane(m.raw, 1)) == 0;
#endif
}
// Full vector
namespace detail {
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
return wasm_i8x16_all_true(m.raw);
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
return wasm_i16x8_all_true(m.raw);
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
return wasm_i32x4_all_true(m.raw);
}
template <typename T>
HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
return wasm_i64x2_all_true(m.raw);
}
} // namespace detail
template <typename T, size_t N>
HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
}
// Partial vectors
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API bool AllFalse(Simd<T, N, 0> /* tag */, const Mask128<T, N> m) {
// Ensure all undefined bytes are 0.
const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
return AllFalse(Full128<T>(), Mask128<T>{AndNot(mask, m).raw});
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API bool AllTrue(const Simd<T, N, 0> /* d */, const Mask128<T, N> m) {
// Ensure all undefined bytes are FF.
const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw});
}
template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
const uint64_t bits = detail::BitsFromMask(mask);
return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1;
}
// ------------------------------ Compress
namespace detail {
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 256);
const Simd<T, N, 0> d;
const Rebind<uint8_t, decltype(d)> d8;
const Simd<uint16_t, N, 0> du;
// We need byte indices for TableLookupBytes (one vector's worth for each of
// 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
// can instead store lane indices and convert to byte indices (2*lane + 0..1),
// with the doubling baked into the table. Unpacking nibbles is likely more
// costly than the higher cache footprint from storing bytes.
alignas(16) constexpr uint8_t table[256 * 8] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2,
6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0,
0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0,
0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,
2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8,
0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0,
0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0,
0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0,
0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4,
6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6,
8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0,
4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4,
10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0,
0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0,
0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0,
0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2,
4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0,
0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0,
0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0,
2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8,
10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10,
0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0,
0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0,
0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12,
0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0,
0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0,
0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0,
6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6,
12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0,
0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0,
0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0,
0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2,
8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12,
0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0,
0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0,
2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6,
8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8,
12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0,
0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0,
0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4,
10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10,
12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0,
0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0,
4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4,
6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0,
0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0,
0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0,
0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2,
4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10,
12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12,
0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0,
2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0,
0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0,
0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0,
0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0,
0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6,
14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14,
0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0,
0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0,
8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8,
14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0,
0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0,
0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0,
0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2,
6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8,
14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14,
0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0,
2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10,
14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14,
0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0,
0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0,
0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4,
6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6,
10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0,
0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0,
4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4,
8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14,
0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0,
0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0,
0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2,
4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0,
0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0,
0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0,
2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12,
14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14,
0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0,
0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0,
0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8,
12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12,
14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0,
0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0,
6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6,
8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12,
14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14,
0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0,
0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2,
10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12,
14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14,
0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0,
2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6,
10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10,
12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0,
0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0,
0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4,
8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8,
10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14,
0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0,
4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4,
6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
return BitCast(d, pairs + Set(du, 0x0100));
}
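// Worked example of the lookup above (explanatory comment): mask_bits ==
// 0b101 selects 16-bit lanes 0 and 2, so the table row is {0, 4, 0, ...}
// (lane indices with the doubling already baked in). ZipLower duplicates
// each byte into a u16, and adding 0x0100 turns each pair {b, b} into
// {b, b + 1}, i.e. the two byte indices of the selected lane, ready for
// TableLookupBytes.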
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 16);
// There are only 4 lanes, so we can afford to load the index vector directly.
alignas(16) constexpr uint8_t packed_array[16 * 16] = {
0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, //
8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, //
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, //
12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, //
0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, //
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, //
0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const Simd<T, N, 0> d;
const Repartition<uint8_t, decltype(d)> d8;
return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Idx64x2FromBits(const uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 4);
// There are only 2 lanes, so we can afford to load the index vector directly.
alignas(16) constexpr uint8_t packed_array[4 * 16] = {
0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const Simd<T, N, 0> d;
const Repartition<uint8_t, decltype(d)> d8;
return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
}
// Helper functions called by both Compress and CompressStore - avoids a
// redundant BitsFromMask in the latter.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<2> /*tag*/, Vec128<T, N> v,
const uint64_t mask_bits) {
const auto idx = detail::Idx16x8FromBits<T, N>(mask_bits);
const DFromV<decltype(v)> d;
const RebindToSigned<decltype(d)> di;
return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<4> /*tag*/, Vec128<T, N> v,
const uint64_t mask_bits) {
const auto idx = detail::Idx32x4FromBits<T, N>(mask_bits);
const DFromV<decltype(v)> d;
const RebindToSigned<decltype(d)> di;
return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<8> /*tag*/, Vec128<T, N> v,
const uint64_t mask_bits) {
const auto idx = detail::Idx64x2FromBits<T, N>(mask_bits);
const DFromV<decltype(v)> d;
const RebindToSigned<decltype(d)> di;
return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}
} // namespace detail
template <typename T, size_t N>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
}
// ------------------------------ CompressBits
template <typename T, size_t N>
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
const uint8_t* HWY_RESTRICT bits) {
uint64_t mask_bits = 0;
constexpr size_t kNumBytes = (N + 7) / 8;
CopyBytes<kNumBytes>(bits, &mask_bits);
if (N < 8) {
mask_bits &= (1ull << N) - 1;
}
return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
}
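// Example (sketch): `bits` holds one mask bit per lane, LSB-first; for
// N = 4 u32 lanes, bits[0] = 5 (0b0101) selects lanes 0 and 2. The vector v
// and its values are hypothetical.
//   const uint8_t bits[1] = {0x5};
//   const auto packed = CompressBits(v, bits);  // lanes 0, 2, then unspecified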
// ------------------------------ CompressStore
template <typename T, size_t N>
HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
StoreU(c, d, unaligned);
return PopCount(mask_bits);
}
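// Note: like Compress, this writes all N lanes (lanes after the first
// PopCount(mask_bits) are unspecified), so `unaligned` must have room for a
// full vector. Example (sketch; v, m, d and out are hypothetical):
//   uint32_t out[4];
//   const size_t num = CompressStore(v, m, d, out);  // out[0..num-1] are valid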
// ------------------------------ CompressBlendedStore
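// Unlike CompressStore, lanes at or beyond the result count keep their
// previous contents in memory: the destination vector is loaded, blended with
// the compressed result under FirstN(count), and stored back, so all N lanes
// at `unaligned` must be readable as well as writable.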
template <typename T, size_t N>
HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
Simd<T, N, 0> d,
T* HWY_RESTRICT unaligned) {
const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16
using TU = TFromD<decltype(du)>;
const uint64_t mask_bits = detail::BitsFromMask(m);
const size_t count = PopCount(mask_bits);
const Mask128<TU, N> store_mask = FirstN(du, count);
const Vec128<TU, N> compressed =
detail::Compress(hwy::SizeTag<sizeof(T)>(), BitCast(du, v), mask_bits);
const Vec128<TU, N> prev = BitCast(du, LoadU(d, unaligned));
StoreU(BitCast(d, IfThenElse(store_mask, compressed, prev)), d, unaligned);
return count;
}
// ------------------------------ CompressBitsStore
template <typename T, size_t N>
HWY_API size_t CompressBitsStore(Vec128<T, N> v,
const uint8_t* HWY_RESTRICT bits,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
uint64_t mask_bits = 0;
constexpr size_t kNumBytes = (N + 7) / 8;
CopyBytes<kNumBytes>(bits, &mask_bits);
if (N < 8) {
mask_bits &= (1ull << N) - 1;
}
const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
StoreU(c, d, unaligned);
return PopCount(mask_bits);
}
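// Note (sketch): CompressBitsStore is equivalent to CompressBits followed by
// StoreU plus a PopCount of the mask bits; like CompressStore, it writes all
// N lanes.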
// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
// TableLookupBytes)
// 128 bits
HWY_API void StoreInterleaved3(const Vec128<uint8_t> a, const Vec128<uint8_t> b,
const Vec128<uint8_t> c, Full128<uint8_t> d,
uint8_t* HWY_RESTRICT unaligned) {
const auto k5 = Set(d, 5);
const auto k6 = Set(d, 6);
// Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
// 0x80 so lanes to be filled from other vectors are 0 for blending.
alignas(16) static constexpr uint8_t tbl_r0[16] = {
0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
3, 0x80, 0x80, 4, 0x80, 0x80, 5};
alignas(16) static constexpr uint8_t tbl_g0[16] = {
0x80, 0, 0x80, 0x80, 1, 0x80, //
0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
const auto shuf_r0 = Load(d, tbl_r0);
const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB
const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0
const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0.
const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0..
const auto int0 = r0 | g0 | b0;
StoreU(int0, d, unaligned + 0 * 16);
// Second vector: g10,r10, bgr[9:6], b5,g5
const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
const auto r1 = TableLookupBytes(a, shuf_r1);
const auto g1 = TableLookupBytes(b, shuf_g1);
const auto b1 = TableLookupBytes(c, shuf_b1);
const auto int1 = r1 | g1 | b1;
StoreU(int1, d, unaligned + 1 * 16);
// Third vector: bgr[15:11], b10
const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
const auto r2 = TableLookupBytes(a, shuf_r2);
const auto g2 = TableLookupBytes(b, shuf_g2);
const auto b2 = TableLookupBytes(c, shuf_b2);
const auto int2 = r2 | g2 | b2;
StoreU(int2, d, unaligned + 2 * 16);
}
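// Example (sketch; r, g, b are hypothetical planar u8 vectors): writes the
// 48 bytes r0 g0 b0 r1 g1 b1 .. r15 g15 b15.
//   alignas(16) uint8_t interleaved[48];
//   StoreInterleaved3(r, g, b, Full128<uint8_t>(), interleaved);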
// 64 bits
HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> a,
const Vec128<uint8_t, 8> b,
const Vec128<uint8_t, 8> c, Full64<uint8_t> d,
uint8_t* HWY_RESTRICT unaligned) {
// Use full vectors for the shuffles and first result.
const Full128<uint8_t> d_full;
const auto k5 = Set(d_full, 5);
const auto k6 = Set(d_full, 6);
const Vec128<uint8_t> full_a{a.raw};
const Vec128<uint8_t> full_b{b.raw};
const Vec128<uint8_t> full_c{c.raw};
// Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
// 0x80 so lanes to be filled from other vectors are 0 for blending.
alignas(16) static constexpr uint8_t tbl_r0[16] = {
0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
3, 0x80, 0x80, 4, 0x80, 0x80, 5};
alignas(16) static constexpr uint8_t tbl_g0[16] = {
0x80, 0, 0x80, 0x80, 1, 0x80, //
0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
const auto shuf_r0 = Load(d_full, tbl_r0);
const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB
const auto shuf_b0 = CombineShiftRightBytes<15>(d_full, shuf_g0, shuf_g0);
const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0
const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0.
const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0..
const auto int0 = r0 | g0 | b0;
StoreU(int0, d_full, unaligned + 0 * 16);
// Second (HALF) vector: bgr[7:6], b5,g5
const auto shuf_r1 = shuf_b0 + k6; // ..7..6..
const auto shuf_g1 = shuf_r0 + k5; // .7..6..5
const auto shuf_b1 = shuf_g0 + k5; // 7..6..5.
const auto r1 = TableLookupBytes(full_a, shuf_r1);
const auto g1 = TableLookupBytes(full_b, shuf_g1);
const auto b1 = TableLookupBytes(full_c, shuf_b1);
const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
StoreU(int1, d, unaligned + 1 * 16);
}
// <= 32 bits
template <size_t N, HWY_IF_LE32(uint8_t, N)>
HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> a,
const Vec128<uint8_t, N> b,
const Vec128<uint8_t, N> c,
Simd<uint8_t, N, 0> /*tag*/,
uint8_t* HWY_RESTRICT unaligned) {
// Use full vectors for the shuffles and result.
const Full128<uint8_t> d_full;
const Vec128<uint8_t> full_a{a.raw};
const Vec128<uint8_t> full_b{b.raw};
const Vec128<uint8_t> full_c{c.raw};
// Shuffle (a,b,c) vector bytes to bgr[3:0].
// 0x80 so lanes to be filled from other vectors are 0 for blending.
alignas(16) static constexpr uint8_t tbl_r0[16] = {
0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, //
0x80, 0x80, 0x80, 0x80};
const auto shuf_r0 = Load(d_full, tbl_r0);
const auto shuf_g0 = CombineShiftRightBytes<15>(d_full, shuf_r0, shuf_r0);
const auto shuf_b0 = CombineShiftRightBytes<14>(d_full, shuf_r0, shuf_r0);
const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0
const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0.
const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0..
const auto int0 = r0 | g0 | b0;
alignas(16) uint8_t buf[16];
StoreU(int0, d_full, buf);
CopyBytes<N * 3>(buf, unaligned);
}
// ------------------------------ StoreInterleaved4
// 128 bits
HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
const Vec128<uint8_t> v1,
const Vec128<uint8_t> v2,
const Vec128<uint8_t> v3, Full128<uint8_t> d8,
uint8_t* HWY_RESTRICT unaligned) {
const RepartitionToWide<decltype(d8)> d16;
const RepartitionToWide<decltype(d16)> d32;
// let a,b,c,d denote v0..3.
const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
const auto ba8 = ZipUpper(d16, v0, v1);
const auto dc8 = ZipUpper(d16, v2, v3);
const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4
const auto dcba_8 = ZipLower(d32, ba8, dc8); // d..aB d..a8
const auto dcba_C = ZipUpper(d32, ba8, dc8); // d..aF d..aC
StoreU(BitCast(d8, dcba_0), d8, unaligned + 0 * 16);
StoreU(BitCast(d8, dcba_4), d8, unaligned + 1 * 16);
StoreU(BitCast(d8, dcba_8), d8, unaligned + 2 * 16);
StoreU(BitCast(d8, dcba_C), d8, unaligned + 3 * 16);
}
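// Example (sketch; r, g, b, a are hypothetical planar u8 vectors): writes the
// 64 bytes r0 g0 b0 a0 r1 g1 b1 a1 .. a15 (e.g. RGBA for 16 pixels).
//   alignas(16) uint8_t interleaved[64];
//   StoreInterleaved4(r, g, b, a, Full128<uint8_t>(), interleaved);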
// 64 bits
HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> in0,
const Vec128<uint8_t, 8> in1,
const Vec128<uint8_t, 8> in2,
const Vec128<uint8_t, 8> in3,
Full64<uint8_t> /* tag */,
uint8_t* HWY_RESTRICT unaligned) {
// Use full vectors to reduce the number of stores.
const Full128<uint8_t> d_full8;
const RepartitionToWide<decltype(d_full8)> d16;
const RepartitionToWide<decltype(d16)> d32;
const Vec128<uint8_t> v0{in0.raw};
const Vec128<uint8_t> v1{in1.raw};
const Vec128<uint8_t> v2{in2.raw};
const Vec128<uint8_t> v3{in3.raw};
// let a,b,c,d denote v0..3.
const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4
StoreU(BitCast(d_full8, dcba_0), d_full8, unaligned + 0 * 16);
StoreU(BitCast(d_full8, dcba_4), d_full8, unaligned + 1 * 16);
}
// <= 32 bits
template <size_t N, HWY_IF_LE32(uint8_t, N)>
HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
const Vec128<uint8_t, N> in1,
const Vec128<uint8_t, N> in2,
const Vec128<uint8_t, N> in3,
Simd<uint8_t, N, 0> /*tag*/,
uint8_t* HWY_RESTRICT unaligned) {
// Use full vectors to reduce the number of stores.
const Full128<uint8_t> d_full8;
const RepartitionToWide<decltype(d_full8)> d16;
const RepartitionToWide<decltype(d16)> d32;
const Vec128<uint8_t> v0{in0.raw};
const Vec128<uint8_t> v1{in1.raw};
const Vec128<uint8_t> v2{in2.raw};
const Vec128<uint8_t> v3{in3.raw};
// let a,b,c,d denote v0..3.
const auto ba0 = ZipLower(d16, v0, v1); // b3 a3 .. b0 a0
const auto dc0 = ZipLower(d16, v2, v3); // d3 c3 .. d0 c0
const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
alignas(16) uint8_t buf[16];
StoreU(BitCast(d_full8, dcba_0), d_full8, buf);
CopyBytes<4 * N>(buf, unaligned);
}
// ------------------------------ MulEven/Odd (Load)
HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
const Vec128<uint64_t> b) {
alignas(16) uint64_t mul[2];
mul[0] =
Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
return Load(Full128<uint64_t>(), mul);
}
HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
const Vec128<uint64_t> b) {
alignas(16) uint64_t mul[2];
mul[0] =
Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
return Load(Full128<uint64_t>(), mul);
}
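// Example (sketch): MulEven returns the full 128-bit product of the lower
// (lane 0) u64 of a and b, with the low half in lane 0 and the high half in
// lane 1; MulOdd does the same for lane 1 of the inputs.
//   const auto p = MulEven(a, b);  // lane0 = low64(a0*b0), lane1 = high64(a0*b0)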
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
template <size_t N>
HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
Vec128<bfloat16_t, 2 * N> a,
Vec128<bfloat16_t, 2 * N> b,
const Vec128<float, N> sum0,
Vec128<float, N>& sum1) {
const Repartition<uint16_t, decltype(df32)> du16;
const RebindToUnsigned<decltype(df32)> du32;
const Vec128<uint16_t, 2 * N> zero = Zero(du16);
const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
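// Example (sketch; a and b are hypothetical bf16 vectors of 2*N lanes):
// accumulate a dot product. The lanes are processed in a reordered fashion,
// so add the two accumulators before reducing.
//   auto sum1 = Zero(df32);
//   const auto sum0 = ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
//   const auto total = Add(sum0, sum1);  // then e.g. SumOfLanes(df32, total)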
// ------------------------------ Reductions
namespace detail {
// N=1 for any T: no-op
template <typename T>
HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
const Vec128<T, 1> v) {
return v;
}
template <typename T>
HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
const Vec128<T, 1> v) {
return v;
}
template <typename T>
HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
const Vec128<T, 1> v) {
return v;
}
// u32/i32/f32:
// N=2
template <typename T>
HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T, 2> v10) {
return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
}
template <typename T>
HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T, 2> v10) {
return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
}
template <typename T>
HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T, 2> v10) {
return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
}
// N=4 (full)
template <typename T>
HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = v3210 + v1032;
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return v20_31_20_31 + v31_20_31_20;
}
template <typename T>
HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return Min(v20_31_20_31, v31_20_31_20);
}
template <typename T>
HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T> v3210) {
const Vec128<T> v1032 = Shuffle1032(v3210);
const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
return Max(v20_31_20_31, v31_20_31_20);
}
// u64/i64/f64:
// N=2 (full)
template <typename T>
HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<T> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return v10 + v01;
}
template <typename T>
HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<T> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return Min(v10, v01);
}
template <typename T>
HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<T> v10) {
const Vec128<T> v01 = Shuffle01(v10);
return Max(v10, v01);
}
// u16/i16
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const DFromV<decltype(v)> d;
const Repartition<int32_t, decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
// Also broadcast into odd lanes.
return BitCast(d, Or(min, ShiftLeft<16>(min)));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
const DFromV<decltype(v)> d;
const Repartition<int32_t, decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto max = MaxOfLanes(d32, Max(even, odd));
// Also broadcast into odd lanes.
return BitCast(d, Or(max, ShiftLeft<16>(max)));
}
} // namespace detail
// Supported for u/i/f 32/64; Min/MaxOfLanes also support u16/i16 (>= 2 lanes).
// Returns the same value in each lane.
template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
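// Example (sketch; v is a hypothetical f32 vector): the reduction is
// broadcast to every lane, so extract any one of them for a scalar result.
//   const Full128<float> d;
//   const float total = GetLane(SumOfLanes(d, v));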
// ------------------------------ Lt128
namespace detail {
template <size_t kLanes, typename T, size_t N>
Mask128<T, N> ShiftMaskLeft(Mask128<T, N> m) {
return MaskFromVec(ShiftLeftLanes<kLanes>(VecFromMask(Simd<T, N, 0>(), m)));
}
} // namespace detail
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
// Truth table of Eq and Lt for Hi and Lo u64.
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
// =H =L cH cL | out = cH | (=H & cL)
// 0 0 0 0 | 0
// 0 0 0 1 | 0
// 0 0 1 0 | 1
// 0 0 1 1 | 1
// 0 1 0 0 | 0
// 0 1 0 1 | 0
// 0 1 1 0 | 1
// 1 0 0 0 | 0
// 1 0 0 1 | 1
// 1 1 0 0 | 0
const Mask128<T, N> eqHL = Eq(a, b);
const Mask128<T, N> ltHL = Lt(a, b);
// We need to bring cL to the upper lane/bit corresponding to cH. Comparing
// the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
// comparison result leftwards requires only 4.
const Mask128<T, N> ltLx = detail::ShiftMaskLeft<1>(ltHL);
const Mask128<T, N> outHx = Or(ltHL, And(eqHL, ltLx));
const Vec128<T, N> vecHx = VecFromMask(d, outHx);
return MaskFromVec(InterleaveUpper(d, vecHx, vecHx));
}
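// Example (sketch; a, b are hypothetical u64 vectors): each pair of u64 lanes
// is treated as one 128-bit number with the upper lane as its high half. The
// resulting mask is identical in both lanes of a pair, so it can directly
// drive IfThenElse on the whole pair.
//   const Full128<uint64_t> d;
//   const auto m = Lt128(d, a, b);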
// ------------------------------ Min128, Max128 (Lt128)
// Without a native OddEven, it seems infeasible to go faster than Lt128.
template <class D>
HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
return IfThenElse(Lt128(d, a, b), a, b);
}
template <class D>
HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
return IfThenElse(Lt128(d, a, b), b, a);
}
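// Example (sketch): per 128-bit pair, select the smaller/larger operand.
//   const auto smaller = Min128(d, a, b);  // == IfThenElse(Lt128(d, a, b), a, b)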
// ================================================== Operator wrapper
template <class V>
HWY_API V Add(V a, V b) {
return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
return a - b;
}
template <class V>
HWY_API V Mul(V a, V b) {
return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
return a / b;
}
template <class V>
HWY_API V Shl(V a, V b) {
return a << b;
}
template <class V>
HWY_API V Shr(V a, V b) {
return a >> b;
}
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
return a < b;
}
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
return a >= b;
}
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
return a <= b;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();