hwy/ops/scalar-inl.h - external/github.com/google/highway - Git at Google

 // Copyright 2019 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Single-element vectors and operations.
 // External include guard in highway.h - see comment there.

 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>  // memcpy

 #include <cmath>

 #include "hwy/ops/shared-inl.h"

 HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {

 // Single instruction, single data: the descriptor (tag) type used throughout
 // this file for a vector with exactly one lane of type T, i.e. Simd<T, 1>.
 template <typename T>
 using Sisd = Simd<T, 1>;

 // One-lane "vector": a thin wrapper around a single value of type T.
 // (A distinct class type is required so that comparison and arithmetic
 // operators can be overloaded for it.)
 template <typename T>
 struct Vec1 {
   HWY_INLINE Vec1() = default;
   Vec1(const Vec1&) = default;
   Vec1& operator=(const Vec1&) = default;
   // Wraps the given lane value.
   HWY_INLINE explicit Vec1(const T t) : raw(t) {}

   // Compound assignments: each forwards to the matching binary operator and
   // stores the result back into this vector.
   HWY_INLINE Vec1& operator*=(const Vec1 other) {
     *this = *this * other;
     return *this;
   }
   HWY_INLINE Vec1& operator/=(const Vec1 other) {
     *this = *this / other;
     return *this;
   }
   HWY_INLINE Vec1& operator+=(const Vec1 other) {
     *this = *this + other;
     return *this;
   }
   HWY_INLINE Vec1& operator-=(const Vec1 other) {
     *this = *this - other;
     return *this;
   }
   HWY_INLINE Vec1& operator&=(const Vec1 other) {
     *this = *this & other;
     return *this;
   }
   HWY_INLINE Vec1& operator|=(const Vec1 other) {
     *this = *this | other;
     return *this;
   }
   HWY_INLINE Vec1& operator^=(const Vec1 other) {
     *this = *this ^ other;
     return *this;
   }

   // The single lane.
   T raw;
 };

 // One-lane mask stored as an unsigned integer of the same size as T:
 // all bits clear (0) for false, all bits set (FF..FF) for true.
 template <typename T>
 class Mask1 {
   using Raw = hwy::MakeUnsigned<T>;

  public:
   // Builds a mask that is all-ones when `b` is true and zero otherwise.
   static HWY_INLINE Mask1<T> FromBool(bool b) {
     Mask1<T> result;
     if (b) {
       result.bits = ~Raw(0);
     } else {
       result.bits = 0;
     }
     return result;
   }

   Raw bits;
 };

 // ------------------------------ Cast

 // Reinterprets the bytes of `v` as a lane of type T (no value conversion).
 // T must not be larger than FromT (enforced at compile time). Exactly
 // sizeof(T) bytes are copied, starting at the first byte of v.raw.
 template <typename T, typename FromT>
 HWY_INLINE Vec1<T> BitCast(Sisd<T> /* tag */, Vec1<FromT> v) {
   static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
   T to;
   // Copy only as many bytes as the destination holds: copying sizeof(FromT)
   // bytes would write past the end of `to` whenever T is narrower than FromT.
   CopyBytes<sizeof(T)>(&v.raw, &to);
   return Vec1<T>(to);
 }

 // ------------------------------ Set

 // Returns a vector whose lane is zero.
 template <typename T>
 HWY_INLINE Vec1<T> Zero(Sisd<T> /* tag */) {
   const T zero = T(0);
   return Vec1<T>(zero);
 }

 // Returns a vector whose lane is `t` converted to T via static_cast.
 template <typename T, typename T2>
 HWY_INLINE Vec1<T> Set(Sisd<T> /* tag */, const T2 t) {
   const T lane = static_cast<T>(t);
   return Vec1<T>(lane);
 }

 // Returns a vector for callers that do not care about the initial value.
 // This implementation initializes the lane to zero.
 template <typename T>
 HWY_INLINE Vec1<T> Undefined(Sisd<T> /* tag */) {
   const T placeholder = 0;
   return Vec1<T>(placeholder);
 }

 // Returns a vector whose only lane is `first` converted to T; with a single
 // lane there are no further increments to generate.
 template <typename T, typename T2>
 Vec1<T> Iota(const Sisd<T> /* tag */, const T2 first) {
   const T lane = static_cast<T>(first);
   return Vec1<T>(lane);
 }

 // ================================================== SHIFTS

 // ------------------------------ Shift lanes by constant #bits

 // Shifts the lane left by the compile-time constant kBits. The shift is
 // performed on the unsigned counterpart of T.
 template <int kBits, typename T>
 HWY_INLINE Vec1<T> ShiftLeft(const Vec1<T> v) {
   static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
   using TU = hwy::MakeUnsigned<T>;
   const TU lane = static_cast<TU>(v.raw);
   return Vec1<T>(lane << kBits);
 }

 // Shifts the lane right by the compile-time constant kBits, using the
 // built-in >> of T.
 template <int kBits, typename T>
 HWY_INLINE Vec1<T> ShiftRight(const Vec1<T> v) {
   static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
   const auto shifted = v.raw >> kBits;
   return Vec1<T>(shifted);
 }

 // ------------------------------ Shift lanes by independent variable #bits

 // With a single lane, the per-lane variable shift is the same as the constant
 // shift except that the count comes from a vector. Shifts left on the
 // unsigned counterpart of T.
 template <typename T>
 HWY_INLINE Vec1<T> operator<<(const Vec1<T> v, const Vec1<T> bits) {
   using TU = hwy::MakeUnsigned<T>;
   const TU lane = static_cast<TU>(v.raw);
   return Vec1<T>(lane << bits.raw);
 }
 // Shifts the lane right by the count held in `bits`, using the built-in >>
 // of T.
 template <typename T>
 HWY_INLINE Vec1<T> operator>>(const Vec1<T> v, const Vec1<T> bits) {
   const auto shifted = v.raw >> bits.raw;
   return Vec1<T>(shifted);
 }

 // ================================================== LOGICAL

 // Helper for bitwise operations on floating-point lanes: copies the bytes of
 // both inputs into integers of type Bits, applies `op` to them and copies the
 // resulting bytes back into a lane of type T.
 template <typename Bits>
 struct BitwiseOp {
   template <typename T, class Op>
   Vec1<T> operator()(const Vec1<T> a, const Vec1<T> b, const Op& op) const {
     static_assert(sizeof(T) == sizeof(Bits), "Float/int size mismatch");
     Bits bits_a;
     CopyBytes<sizeof(Bits)>(&a, &bits_a);
     Bits bits_b;
     CopyBytes<sizeof(Bits)>(&b, &bits_b);
     bits_a = op(bits_a, bits_b);
     T lane;
     CopyBytes<sizeof(Bits)>(&bits_a, &lane);
     return Vec1<T>(lane);
   }
 };

 // ------------------------------ Bitwise AND

 // Bitwise AND of the two lanes, using the built-in & of T.
 template <typename T>
 HWY_INLINE Vec1<T> operator&(const Vec1<T> a, const Vec1<T> b) {
   const auto both = a.raw & b.raw;
   return Vec1<T>(both);
 }

 // Bitwise AND of the representations of two float lanes.
 HWY_INLINE Vec1<float> And(const Vec1<float> a, const Vec1<float> b) {
   const auto bit_and = [](int32_t i, int32_t j) { return i & j; };
   return BitwiseOp<int32_t>()(a, b, bit_and);
 }
 // Bitwise AND of the representations of two double lanes.
 HWY_INLINE Vec1<double> And(const Vec1<double> a, const Vec1<double> b) {
   const auto bit_and = [](int64_t i, int64_t j) { return i & j; };
   return BitwiseOp<int64_t>()(a, b, bit_and);
 }

 // ------------------------------ Bitwise AND-NOT

 // Returns ~a & b (note: the FIRST argument is the one that is inverted).
 template <typename T>
 HWY_INLINE Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) {
   const auto cleared = ~a.raw & b.raw;
   return Vec1<T>(cleared);
 }
 // Same operation on the bit representation of float lanes.
 HWY_INLINE Vec1<float> AndNot(const Vec1<float> a, const Vec1<float> b) {
   const auto and_not = [](int32_t i, int32_t j) { return ~i & j; };
   return BitwiseOp<int32_t>()(a, b, and_not);
 }
 // Same operation on the bit representation of double lanes.
 HWY_INLINE Vec1<double> AndNot(const Vec1<double> a, const Vec1<double> b) {
   const auto and_not = [](int64_t i, int64_t j) { return ~i & j; };
   return BitwiseOp<int64_t>()(a, b, and_not);
 }

 // ------------------------------ Bitwise OR

 // Bitwise OR of the two lanes, using the built-in | of T.
 template <typename T>
 HWY_INLINE Vec1<T> operator|(const Vec1<T> a, const Vec1<T> b) {
   const auto either = a.raw | b.raw;
   return Vec1<T>(either);
 }

 // Bitwise OR of the representations of two float lanes.
 HWY_INLINE Vec1<float> Or(const Vec1<float> a, const Vec1<float> b) {
   const auto bit_or = [](int32_t i, int32_t j) { return i | j; };
   return BitwiseOp<int32_t>()(a, b, bit_or);
 }
 // Bitwise OR of the representations of two double lanes.
 HWY_INLINE Vec1<double> Or(const Vec1<double> a, const Vec1<double> b) {
   const auto bit_or = [](int64_t i, int64_t j) { return i | j; };
   return BitwiseOp<int64_t>()(a, b, bit_or);
 }

 // ------------------------------ Bitwise XOR

 // Bitwise XOR of the two lanes, using the built-in ^ of T.
 template <typename T>
 HWY_INLINE Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) {
   const auto differing = a.raw ^ b.raw;
   return Vec1<T>(differing);
 }

 // Bitwise XOR of the representations of two float lanes.
 HWY_INLINE Vec1<float> Xor(const Vec1<float> a, const Vec1<float> b) {
   const auto bit_xor = [](int32_t i, int32_t j) { return i ^ j; };
   return BitwiseOp<int32_t>()(a, b, bit_xor);
 }
 // Bitwise XOR of the representations of two double lanes.
 HWY_INLINE Vec1<double> Xor(const Vec1<double> a, const Vec1<double> b) {
   const auto bit_xor = [](int64_t i, int64_t j) { return i ^ j; };
   return BitwiseOp<int64_t>()(a, b, bit_xor);
 }

 // ------------------------------ CopySign

 // Returns `magn` with its sign bit replaced by the sign bit of `sign`.
 // Floating-point lanes only.
 template <typename T>
 HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) {
   static_assert(IsFloat<T>(), "Only makes sense for floating-point");
   const auto msb = SignBit(Sisd<T>());
   const auto magnitude_bits = AndNot(msb, magn);  // everything but the sign
   const auto sign_bit = And(msb, sign);           // only the sign
   return Or(magnitude_bits, sign_bit);
 }

 // ORs the sign bit of `sign` into `abs`. The sign bit of `abs` is not cleared
 // first, so the result only has the sign of `sign` if `abs` is non-negative.
 // Floating-point lanes only.
 template <typename T>
 HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
   static_assert(IsFloat<T>(), "Only makes sense for floating-point");
   const auto msb = SignBit(Sisd<T>());
   const auto sign_bit = And(msb, sign);
   return Or(abs, sign_bit);
 }

 // ------------------------------ Mask

 // Converts a vector to a mask by copying the lane's bytes into the mask bits.
 // v must be 0 or FF..FF.
 template <typename T>
 HWY_INLINE Mask1<T> MaskFromVec(const Vec1<T> v) {
   Mask1<T> result;
   memcpy(&result.bits, &v.raw, sizeof(result.bits));
   return result;
 }

 // Converts a mask to a vector by copying the mask bits into the lane's bytes.
 template <typename T>
 Vec1<T> VecFromMask(const Mask1<T> mask) {
   Vec1<T> result;
   memcpy(&result.raw, &mask.bits, sizeof(result.raw));
   return result;
 }

 // Selects `yes` when the mask has any bit set, otherwise `no`.
 template <typename T>
 HWY_INLINE Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
                               const Vec1<T> no) {
   if (mask.bits) return yes;
   return no;
 }

 // Returns `yes` when the mask has any bit set, otherwise a zero vector.
 template <typename T>
 HWY_INLINE Vec1<T> IfThenElseZero(const Mask1<T> mask, const Vec1<T> yes) {
   if (mask.bits) return yes;
   return Vec1<T>(0);
 }

 // Returns a zero vector when the mask has any bit set, otherwise `no`.
 template <typename T>
 HWY_INLINE Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) {
   if (mask.bits) return Vec1<T>(0);
   return no;
 }

 // Returns zero if the lane compares less than zero, otherwise `v` unchanged.
 template <typename T>
 HWY_INLINE Vec1<T> ZeroIfNegative(const Vec1<T> v) {
   if (v.raw < 0) return Vec1<T>(0);
   return v;
 }

 // ================================================== ARITHMETIC

 // Integer addition. The sum is formed in uint64_t (after widening through
 // int64_t), masked with ~T(0) and then converted back to T.
 template <typename T>
 HWY_INLINE Vec1<T> operator+(Vec1<T> a, Vec1<T> b) {
   const uint64_t lhs = static_cast<int64_t>(a.raw);
   const uint64_t rhs = static_cast<int64_t>(b.raw);
   const uint64_t sum = lhs + rhs;
   return Vec1<T>(sum & ~T(0));
 }
 // Floating-point addition.
 HWY_INLINE Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) {
   const float sum = a.raw + b.raw;
   return Vec1<float>(sum);
 }
 HWY_INLINE Vec1<double> operator+(const Vec1<double> a, const Vec1<double> b) {
   const double sum = a.raw + b.raw;
   return Vec1<double>(sum);
 }

 // Integer subtraction. The difference is formed in uint64_t (after widening
 // through int64_t), masked with ~T(0) and then converted back to T.
 template <typename T>
 HWY_INLINE Vec1<T> operator-(Vec1<T> a, Vec1<T> b) {
   const uint64_t lhs = static_cast<int64_t>(a.raw);
   const uint64_t rhs = static_cast<int64_t>(b.raw);
   const uint64_t difference = lhs - rhs;
   return Vec1<T>(difference & ~T(0));
 }
 // Floating-point subtraction.
 HWY_INLINE Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) {
   const float difference = a.raw - b.raw;
   return Vec1<float>(difference);
 }
 HWY_INLINE Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) {
   const double difference = a.raw - b.raw;
   return Vec1<double>(difference);
 }

 // ------------------------------ Saturating addition

 // Returns a + b clamped to the destination range.

 // Unsigned
 // The sum is formed after integer promotion of the lanes and then clamped to
 // the range of the lane type.
 HWY_INLINE Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a,
                                       const Vec1<uint8_t> b) {
   const auto sum = a.raw + b.raw;
   return Vec1<uint8_t>(HWY_MIN(HWY_MAX(0, sum), 255));
 }
 HWY_INLINE Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a,
                                        const Vec1<uint16_t> b) {
   const auto sum = a.raw + b.raw;
   return Vec1<uint16_t>(HWY_MIN(HWY_MAX(0, sum), 65535));
 }

 // Signed
 HWY_INLINE Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a,
                                      const Vec1<int8_t> b) {
   const auto sum = a.raw + b.raw;
   return Vec1<int8_t>(HWY_MIN(HWY_MAX(-128, sum), 127));
 }
 HWY_INLINE Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a,
                                       const Vec1<int16_t> b) {
   const auto sum = a.raw + b.raw;
   return Vec1<int16_t>(HWY_MIN(HWY_MAX(-32768, sum), 32767));
 }

 // ------------------------------ Saturating subtraction

 // Returns a - b clamped to the destination range.

 // Unsigned
 // The difference is formed after integer promotion of the lanes and then
 // clamped to the range of the lane type.
 HWY_INLINE Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a,
                                       const Vec1<uint8_t> b) {
   const auto difference = a.raw - b.raw;
   return Vec1<uint8_t>(HWY_MIN(HWY_MAX(0, difference), 255));
 }
 HWY_INLINE Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a,
                                        const Vec1<uint16_t> b) {
   const auto difference = a.raw - b.raw;
   return Vec1<uint16_t>(HWY_MIN(HWY_MAX(0, difference), 65535));
 }

 // Signed
 HWY_INLINE Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a,
                                      const Vec1<int8_t> b) {
   const auto difference = a.raw - b.raw;
   return Vec1<int8_t>(HWY_MIN(HWY_MAX(-128, difference), 127));
 }
 HWY_INLINE Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a,
                                       const Vec1<int16_t> b) {
   const auto difference = a.raw - b.raw;
   return Vec1<int16_t>(HWY_MIN(HWY_MAX(-32768, difference), 32767));
 }

 // ------------------------------ Average

 // Returns (a + b + 1) / 2

 // Rounded average of two unsigned lanes: (a + b + 1) / 2, evaluated after
 // integer promotion of the lanes.
 HWY_INLINE Vec1<uint8_t> AverageRound(const Vec1<uint8_t> a,
                                       const Vec1<uint8_t> b) {
   const auto sum_plus_one = a.raw + b.raw + 1;
   return Vec1<uint8_t>(sum_plus_one / 2);
 }
 HWY_INLINE Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
                                        const Vec1<uint16_t> b) {
   const auto sum_plus_one = a.raw + b.raw + 1;
   return Vec1<uint16_t>(sum_plus_one / 2);
 }

 // ------------------------------ Absolute value

 // Integer absolute value. The most negative value (LimitsMin<T>()) is
 // returned unchanged rather than negated.
 template <typename T>
 HWY_INLINE Vec1<T> Abs(const Vec1<T> a) {
   const T lane = a.raw;
   const bool keep_as_is = lane >= 0 || lane == hwy::LimitsMin<T>();
   if (keep_as_is) return a;
   return Vec1<T>(-lane);
 }
 // Floating-point absolute value via std::abs.
 HWY_INLINE Vec1<float> Abs(const Vec1<float> a) {
   const float magnitude = std::abs(a.raw);
   return Vec1<float>(magnitude);
 }
 HWY_INLINE Vec1<double> Abs(const Vec1<double> a) {
   const double magnitude = std::abs(a.raw);
   return Vec1<double>(magnitude);
 }

 // ------------------------------ min/max

 // Returns the smaller of the two lanes, as selected by HWY_MIN.
 template <typename T>
 HWY_INLINE Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
   const T smaller = HWY_MIN(a.raw, b.raw);
   return Vec1<T>(smaller);
 }

 // Returns the larger of the two lanes, as selected by HWY_MAX.
 template <typename T>
 HWY_INLINE Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
   const T larger = HWY_MAX(a.raw, b.raw);
   return Vec1<T>(larger);
 }

 // ------------------------------ Floating-point negate

 // Floating-point negation: flips the sign bit.
 template <typename T, HWY_IF_FLOAT(T)>
 HWY_INLINE Vec1<T> Neg(const Vec1<T> v) {
   const auto msb = SignBit(Sisd<T>());
   return Xor(v, msb);
 }

 // Non-float negation: subtracts the lane from zero.
 template <typename T, HWY_IF_NOT_FLOAT(T)>
 HWY_INLINE Vec1<T> Neg(const Vec1<T> v) {
   const Vec1<T> zero = Zero(Sisd<T>());
   return zero - v;
 }

 // ------------------------------ mul/div

 // Lane-wise multiplication.
 // - Floating-point lanes are multiplied in double and converted back to T.
 // - Integer lanes (signed or unsigned) are multiplied in uint64_t and the
 //   product is converted back to T, which keeps the low bits. Unsigned
 //   arithmetic is used for signed lanes too because a signed 64-bit product
 //   that overflows is undefined behavior, whereas the unsigned product wraps
 //   and has the same low bits.
 template <typename T>
 HWY_INLINE Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
   if (hwy::IsFloat<T>()) {
     return Vec1<T>(static_cast<T>(double(a.raw) * b.raw));
   }
   // Converting a negative lane to uint64_t sign-extends it modulo 2^64.
   const uint64_t a64 = static_cast<uint64_t>(a.raw);
   const uint64_t b64 = static_cast<uint64_t>(b.raw);
   return Vec1<T>(static_cast<T>(a64 * b64));
 }

 // Lane-wise division using the built-in / of T.
 template <typename T>
 HWY_INLINE Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) {
   const auto quotient = a.raw / b.raw;
   return Vec1<T>(quotient);
 }

 // Returns the upper 16 bits of the product a * b.
 HWY_INLINE Vec1<int16_t> MulHigh(const Vec1<int16_t> a, const Vec1<int16_t> b) {
   const auto product = a.raw * b.raw;
   return Vec1<int16_t>(product >> 16);
 }
 HWY_INLINE Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a,
                                   const Vec1<uint16_t> b) {
   // Widen to uint32_t before multiplying: uint16_t * uint16_t would otherwise
   // be evaluated in "int", which may overflow. In practice the result is the
   // same, but this way it is also defined.
   const uint32_t wide_a = static_cast<uint32_t>(a.raw);
   const uint32_t wide_b = static_cast<uint32_t>(b.raw);
   return Vec1<uint16_t>((wide_a * wide_b) >> 16);
 }

 // Multiplies even lanes (0, 2 ..) and returns the double-wide result. Here
 // that is the full 64-bit product of the single 32-bit lane pair.
 HWY_INLINE Vec1<int64_t> MulEven(const Vec1<int32_t> a, const Vec1<int32_t> b) {
   const int64_t wide_a = a.raw;
   const int64_t wide_b = b.raw;
   return Vec1<int64_t>(wide_a * wide_b);
 }
 HWY_INLINE Vec1<uint64_t> MulEven(const Vec1<uint32_t> a,
                                   const Vec1<uint32_t> b) {
   const uint64_t wide_a = a.raw;
   const uint64_t wide_b = b.raw;
   return Vec1<uint64_t>(wide_a * wide_b);
 }

 // Approximate reciprocal; this implementation computes 1.0f / v.raw.
 HWY_INLINE Vec1<float> ApproximateReciprocal(const Vec1<float> v) {
   const float lane = v.raw;
   // Zero inputs are allowed, but callers are responsible for replacing the
   // return value with something else (typically using IfThenElse). Guarding
   // the division avoids a ubsan error; the value returned here is arbitrary.
   if (lane == 0.0f) {
     return Vec1<float>(0.0f);
   }
   return Vec1<float>(1.0f / lane);
 }

 // Returns |a - b|.
 HWY_INLINE Vec1<float> AbsDiff(const Vec1<float> a, const Vec1<float> b) {
   const Vec1<float> difference = a - b;
   return Abs(difference);
 }

 // ------------------------------ Floating-point multiply-add variants

 // Returns mul * x + add, evaluated as a separate multiply and add.
 template <typename T>
 HWY_INLINE Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x,
                           const Vec1<T> add) {
   const Vec1<T> product = mul * x;
   return product + add;
 }

 // Returns add - mul * x.
 template <typename T>
 HWY_INLINE Vec1<T> NegMulAdd(const Vec1<T> mul, const Vec1<T> x,
                              const Vec1<T> add) {
   const Vec1<T> product = mul * x;
   return add - product;
 }

 // Returns mul * x - sub.
 template <typename T>
 HWY_INLINE Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x,
                           const Vec1<T> sub) {
   const Vec1<T> product = mul * x;
   return product - sub;
 }

 // Returns -mul * x - sub; the negation is applied to `mul` before multiplying.
 template <typename T>
 HWY_INLINE Vec1<T> NegMulSub(const Vec1<T> mul, const Vec1<T> x,
                              const Vec1<T> sub) {
   const Vec1<T> negated_mul = Neg(mul);
   const Vec1<T> product = negated_mul * x;
   return product - sub;
 }

 // ------------------------------ Floating-point square root

 // Approximate reciprocal square root: an integer bit trick produces the
 // initial estimate, which is then refined by one Newton-Raphson step.
 HWY_INLINE Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
   float estimate = v.raw;
   const float half_input = estimate * 0.5f;

   // Reinterpret the float as an integer to form the initial guess, which is
   // based on log2 of the input.
   uint32_t as_int;
   CopyBytes<4>(&estimate, &as_int);
   as_int = 0x5F3759DF - (as_int >> 1);
   CopyBytes<4>(&as_int, &estimate);

   // One Newton-Raphson iteration
   return Vec1<float>(estimate * (1.5f - (half_input * estimate * estimate)));
 }

 // Square root via std::sqrt.
 HWY_INLINE Vec1<float> Sqrt(const Vec1<float> v) {
   const float root = std::sqrt(v.raw);
   return Vec1<float>(root);
 }
 HWY_INLINE Vec1<double> Sqrt(const Vec1<double> v) {
   const double root = std::sqrt(v.raw);
   return Vec1<double>(root);
 }

 // ------------------------------ Floating-point rounding

 // Approximation of round-to-nearest for numbers representable as integers:
 // adds +/-0.5 (matching the sign of the input) and truncates via an integer
 // conversion. A result of zero keeps the sign of the input.
 HWY_INLINE Vec1<float> Round(const Vec1<float> v) {
   const bool is_negative = v.raw < 0.0f;
   const float bias = is_negative ? -0.5f : 0.5f;
   const int32_t rounded = static_cast<int32_t>(v.raw + bias);
   if (rounded != 0) {
     return Vec1<float>(static_cast<float>(rounded));
   }
   return Vec1<float>(is_negative ? -0.0f : 0.0f);
 }
 HWY_INLINE Vec1<double> Round(const Vec1<double> v) {
   const bool is_negative = v.raw < 0.0;
   const double bias = is_negative ? -0.5 : 0.5;
   const int64_t rounded = static_cast<int64_t>(v.raw + bias);
   if (rounded != 0) {
     return Vec1<double>(std::copysign(rounded, v.raw));
   }
   return Vec1<double>(is_negative ? -0.0 : 0.0);
 }

 // Rounds toward zero by converting to an integer and back. A result of zero
 // keeps the sign of the input.
 HWY_INLINE Vec1<float> Trunc(const Vec1<float> v) {
   const int32_t truncated = static_cast<int32_t>(v.raw);
   if (truncated != 0) {
     return Vec1<float>(static_cast<float>(truncated));
   }
   const bool is_negative = v.raw < 0.0f;
   return Vec1<float>(is_negative ? -0.0f : 0.0f);
 }
 HWY_INLINE Vec1<double> Trunc(const Vec1<double> v) {
   const int64_t truncated = static_cast<int64_t>(v.raw);
   if (truncated != 0) {
     return Vec1<double>(static_cast<double>(truncated));
   }
   const bool is_negative = v.raw < 0.0;
   return Vec1<double>(is_negative ? -0.0 : 0.0);
 }

 // Rounds v.raw toward +infinity by editing the bits of its floating-point
 // representation.
 //   Float:          lane type.
 //   Bits:           unsigned integer type of the same size as Float.
 //   kMantissaBits:  number of explicit mantissa bits in Float.
 //   kExponentBits:  number of exponent bits in Float.
 //   V:              single-lane vector type with a `raw` member of type Float
 //                   that can be constructed from a Float.
 // Zero inputs are returned unchanged (the sign of zero is preserved); inputs
 // in (-1, 0) yield -0.
 template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
           class V>
 V Ceiling(const V v) {
   static_assert(sizeof(Float) == sizeof(Bits), "Float/int size mismatch");
   const Bits kExponentMask = (1ull << kExponentBits) - 1;
   const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
   const Bits kBias = kExponentMask / 2;

   Float f = v.raw;
   const bool positive = f > Float(0.0);

   Bits bits;
   memcpy(&bits, &f, sizeof(bits));

   // Unbiased exponent, computed in int so that values below one give a
   // negative result without relying on unsigned wraparound.
   const int exponent =
       static_cast<int>((bits >> kMantissaBits) & kExponentMask) -
       static_cast<int>(kBias);
   // Already an integer (no fractional mantissa bits remain).
   if (exponent >= kMantissaBits) return v;
   // |v| < 1 => 1 for positive inputs, otherwise a zero.
   if (exponent < 0) {
     if (positive) return V(Float(1.0));
     // Negative fractions round up to -0; a zero input keeps its own sign.
     return (f < Float(0.0)) ? V(Float(-0.0)) : v;
   }

   const Bits mantissa_mask = kMantissaMask >> exponent;
   // Already an integer
   if ((bits & mantissa_mask) == 0) return v;

   // Clear fractional bits and round up
   if (positive) bits += (kMantissaMask + 1) >> exponent;
   bits &= ~mantissa_mask;

   memcpy(&f, &bits, sizeof(f));
   return V(f);
 }

 // Rounds v.raw toward -infinity by editing the bits of its floating-point
 // representation.
 //   Float:          lane type.
 //   Bits:           unsigned integer type of the same size as Float.
 //   kMantissaBits:  number of explicit mantissa bits in Float.
 //   kExponentBits:  number of exponent bits in Float.
 //   V:              single-lane vector type with a `raw` member of type Float
 //                   that can be constructed from a Float.
 // Zero inputs are returned unchanged (the sign of zero is preserved).
 template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
           class V>
 V Floor(const V v) {
   static_assert(sizeof(Float) == sizeof(Bits), "Float/int size mismatch");
   const Bits kExponentMask = (1ull << kExponentBits) - 1;
   const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
   const Bits kBias = kExponentMask / 2;

   Float f = v.raw;
   const bool negative = f < Float(0.0);

   Bits bits;
   memcpy(&bits, &f, sizeof(bits));

   // Unbiased exponent, computed in int so that values below one give a
   // negative result without relying on unsigned wraparound.
   const int exponent =
       static_cast<int>((bits >> kMantissaBits) & kExponentMask) -
       static_cast<int>(kBias);
   // Already an integer (no fractional mantissa bits remain).
   if (exponent >= kMantissaBits) return v;
   // |v| < 1 => -1 for negative inputs, otherwise a zero.
   if (exponent < 0) {
     if (negative) return V(Float(-1.0));
     // Positive fractions round down to +0; a zero input keeps its own sign.
     return (f > Float(0.0)) ? V(Float(0.0)) : v;
   }

   const Bits mantissa_mask = kMantissaMask >> exponent;
   // Already an integer
   if ((bits & mantissa_mask) == 0) return v;

   // Clear fractional bits and round down
   if (negative) bits += (kMantissaMask + 1) >> exponent;
   bits &= ~mantissa_mask;

   memcpy(&f, &bits, sizeof(f));
   return V(f);
 }

 // Rounds toward +infinity (ceiling). Forwards to Ceiling with the mantissa
 // and exponent widths of the lane type.
 HWY_INLINE Vec1<float> Ceil(const Vec1<float> v) {
   constexpr int kMantissaBits = 23;
   constexpr int kExponentBits = 8;
   return Ceiling<float, uint32_t, kMantissaBits, kExponentBits>(v);
 }
 HWY_INLINE Vec1<double> Ceil(const Vec1<double> v) {
   constexpr int kMantissaBits = 52;
   constexpr int kExponentBits = 11;
   return Ceiling<double, uint64_t, kMantissaBits, kExponentBits>(v);
 }

 // Rounds toward -infinity (floor). Forwards to the Floor template with the
 // mantissa and exponent widths of the lane type.
 HWY_INLINE Vec1<float> Floor(const Vec1<float> v) {
   constexpr int kMantissaBits = 23;
   constexpr int kExponentBits = 8;
   return Floor<float, uint32_t, kMantissaBits, kExponentBits>(v);
 }
 HWY_INLINE Vec1<double> Floor(const Vec1<double> v) {
   constexpr int kMantissaBits = 52;
   constexpr int kExponentBits = 11;
   return Floor<double, uint64_t, kMantissaBits, kExponentBits>(v);
 }

 // ================================================== COMPARE

 // Mask that is true when the two lanes compare equal.
 template <typename T>
 HWY_INLINE Mask1<T> operator==(const Vec1<T> a, const Vec1<T> b) {
   const bool is_equal = a.raw == b.raw;
   return Mask1<T>::FromBool(is_equal);
 }

 // Mask that is true when every bit set in `bit` is also set in `v`.
 // Integer lanes only.
 template <typename T>
 HWY_INLINE Mask1<T> TestBit(const Vec1<T> v, const Vec1<T> bit) {
   static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
   const Vec1<T> masked = v & bit;
   return masked == bit;
 }

 // Mask that is true when a < b.
 template <typename T>
 HWY_INLINE Mask1<T> operator<(const Vec1<T> a, const Vec1<T> b) {
   const bool is_less = a.raw < b.raw;
   return Mask1<T>::FromBool(is_less);
 }
 // Mask that is true when a > b.
 template <typename T>
 HWY_INLINE Mask1<T> operator>(const Vec1<T> a, const Vec1<T> b) {
   const bool is_greater = a.raw > b.raw;
   return Mask1<T>::FromBool(is_greater);
 }

 // Mask that is true when a <= b.
 template <typename T>
 HWY_INLINE Mask1<T> operator<=(const Vec1<T> a, const Vec1<T> b) {
   const bool is_less_or_equal = a.raw <= b.raw;
   return Mask1<T>::FromBool(is_less_or_equal);
 }
 // Mask that is true when a >= b.
 template <typename T>
 HWY_INLINE Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) {
   const bool is_greater_or_equal = a.raw >= b.raw;
   return Mask1<T>::FromBool(is_greater_or_equal);
 }

 // ================================================== MEMORY

 // ------------------------------ Load

 // Loads one lane from `aligned` by copying sizeof(T) bytes.
 template <typename T>
 HWY_INLINE Vec1<T> Load(Sisd<T> /* tag */, const T* HWY_RESTRICT aligned) {
   T lane;
   CopyBytes<sizeof(T)>(aligned, &lane);
   return Vec1<T>(lane);
 }

 // Unaligned load; identical to Load in this implementation.
 template <typename T>
 HWY_INLINE Vec1<T> LoadU(Sisd<T> d, const T* HWY_RESTRICT p) {
   const Vec1<T> loaded = Load(d, p);
   return loaded;
 }

 // In some use cases, "load single lane" is sufficient; otherwise avoid this.
 // Identical to Load in this implementation.
 template <typename T>
 HWY_INLINE Vec1<T> LoadDup128(Sisd<T> d, const T* HWY_RESTRICT aligned) {
   const Vec1<T> loaded = Load(d, aligned);
   return loaded;
 }

 // ------------------------------ Store

 // Stores the lane to `aligned` by copying sizeof(T) bytes.
 template <typename T>
 HWY_INLINE void Store(const Vec1<T> v, Sisd<T> /* tag */,
                       T* HWY_RESTRICT aligned) {
   const T* const source = &v.raw;
   CopyBytes<sizeof(T)>(source, aligned);
 }

 // Unaligned store; identical to Store in this implementation.
 template <typename T>
 HWY_INLINE void StoreU(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT p) {
   Store(v, d, p);
 }

 // ------------------------------ "Non-temporal" stores

 // "Non-temporal" store; this implementation performs a regular Store.
 template <typename T>
 HWY_INLINE void Stream(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT aligned) {
   Store(v, d, aligned);
 }

 // ------------------------------ Gather

 // Loads one lane from the address `base` plus a BYTE offset taken from
 // `offset`. T and Offset must have the same size.
 template <typename T, typename Offset>
 HWY_INLINE Vec1<T> GatherOffset(Sisd<T> d, const T* base,
                                 const Vec1<Offset> offset) {
   static_assert(sizeof(T) == sizeof(Offset), "SVE requires same size base/ofs");
   const uintptr_t base_addr = reinterpret_cast<uintptr_t>(base);
   const uintptr_t lane_addr = base_addr + offset.raw;
   return Load(d, reinterpret_cast<const T*>(lane_addr));
 }

 // Loads one lane from base[index], i.e. the index is in units of T.
 // T and Index must have the same size.
 template <typename T, typename Index>
 HWY_INLINE Vec1<T> GatherIndex(Sisd<T> d, const T* HWY_RESTRICT base,
                                const Vec1<Index> index) {
   static_assert(sizeof(T) == sizeof(Index), "SVE requires same size base/idx");
   const T* const lane_ptr = base + index.raw;
   return Load(d, lane_ptr);
 }

 // ================================================== CONVERT

 // ConvertTo and DemoteTo with floating-point input and integer output truncate
 // (rounding toward zero).

 // Converts the lane to the strictly wider type ToT via static_cast.
 template <typename FromT, typename ToT>
 HWY_INLINE Vec1<ToT> PromoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
   static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
   const ToT widened = static_cast<ToT>(from.raw);
   return Vec1<ToT>(widened);
 }

 // Converts the lane to the strictly narrower type ToT via static_cast.
 // For float -> integer conversions, infinities and values whose magnitude
 // exceeds LimitsMax<ToT>() are replaced by LimitsMin/LimitsMax<ToT>()
 // according to the sign of the input.
 template <typename FromT, typename ToT>
 HWY_INLINE Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
   static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
   // Prevent ubsan errors when converting float to integers
   if (IsFloat<FromT>() && !IsFloat<ToT>() &&
       (std::isinf(from.raw) || std::fabs(static_cast<double>(from.raw)) >
                                    static_cast<double>(LimitsMax<ToT>()))) {
     if (std::signbit(from.raw)) {
       return Vec1<ToT>(LimitsMin<ToT>());
     }
     return Vec1<ToT>(LimitsMax<ToT>());
   }
   const ToT narrowed = static_cast<ToT>(from.raw);
   return Vec1<ToT>(narrowed);
 }

 // Converts the lane to ToT via static_cast.
 // For float -> integer conversions, infinities and values whose magnitude
 // exceeds LimitsMax<ToT>() are replaced by LimitsMin/LimitsMax<ToT>()
 // according to the sign of the input.
 template <typename FromT, typename ToT>
 HWY_INLINE Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
   // Prevent ubsan errors when converting float to integers
   if (IsFloat<FromT>() && !IsFloat<ToT>() &&
       (std::isinf(from.raw) || std::fabs(static_cast<double>(from.raw)) >
                                    static_cast<double>(LimitsMax<ToT>()))) {
     if (std::signbit(from.raw)) {
       return Vec1<ToT>(LimitsMin<ToT>());
     }
     return Vec1<ToT>(LimitsMax<ToT>());
   }
   const ToT converted = static_cast<ToT>(from.raw);
   return Vec1<ToT>(converted);
 }

 // Widens a uint8_t lane to uint32_t via PromoteTo.
 HWY_INLINE Vec1<uint32_t> U32FromU8(const Vec1<uint8_t> v) {
   const Vec1<uint32_t> widened = PromoteTo(Sisd<uint32_t>(), v);
   return widened;
 }

 // Narrows a uint32_t lane to uint8_t via DemoteTo.
 HWY_INLINE Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
   const Vec1<uint8_t> narrowed = DemoteTo(Sisd<uint8_t>(), v);
   return narrowed;
 }

 // Approximation of round-to-nearest for numbers representable as int32_t:
 // adds +/-0.5 (matching the sign of the input) and truncates.
 HWY_INLINE Vec1<int32_t> NearestInt(const Vec1<float> v) {
   const float lane = v.raw;
   float bias = 0.5f;
   if (lane < 0.0f) {
     bias = -0.5f;
   }
   return Vec1<int32_t>(static_cast<int>(lane + bias));
 }

 // ================================================== SWIZZLE

 // Unsupported: Shift*Bytes, CombineShiftRightBytes, Interleave*,
 // Shuffle*, SumsOfU8x8, UpperHalf - these require more than one lane and/or
 // actual 128-bit vectors.

 // Returns the value of the (only) lane.
 template <typename T>
 HWY_INLINE T GetLane(const Vec1<T> v) {
   const T lane = v.raw;
   return lane;
 }

 // Returns the input unchanged: a one-lane vector is not split any further.
 template <typename T>
 HWY_INLINE Vec1<T> LowerHalf(Vec1<T> v) {
   return v;
 }

 // ------------------------------ Broadcast/splat any lane

 // Returns the input unchanged. kLane must be 0 because there is only one
 // lane to broadcast (enforced at compile time).
 template <int kLane, typename T>
 HWY_INLINE Vec1<T> Broadcast(const Vec1<T> v) {
   static_assert(kLane == 0, "Scalar only has one lane");
   return v;
 }

 // ------------------------------ Zip/unpack

 // Combines two lanes into one lane of twice the width: the bits of `a` occupy
 // the lower half and the bits of `b` the upper half of the result.
 //
 // All overloads pack via unsigned integers. For the signed overloads this
 // matters: adding a sign-extended negative `a` would borrow from (and thus
 // corrupt) the upper half, and left-shifting a negative `b` is not portable.
 HWY_INLINE Vec1<uint16_t> ZipLower(const Vec1<uint8_t> a,
                                    const Vec1<uint8_t> b) {
   const uint32_t lo = a.raw;
   const uint32_t hi = b.raw;
   return Vec1<uint16_t>(static_cast<uint16_t>((hi << 8) | lo));
 }
 HWY_INLINE Vec1<uint32_t> ZipLower(const Vec1<uint16_t> a,
                                    const Vec1<uint16_t> b) {
   const uint32_t lo = a.raw;
   const uint32_t hi = b.raw;
   return Vec1<uint32_t>((hi << 16) | lo);
 }
 HWY_INLINE Vec1<uint64_t> ZipLower(const Vec1<uint32_t> a,
                                    const Vec1<uint32_t> b) {
   const uint64_t lo = a.raw;
   const uint64_t hi = b.raw;
   return Vec1<uint64_t>((hi << 32) | lo);
 }
 HWY_INLINE Vec1<int16_t> ZipLower(const Vec1<int8_t> a, const Vec1<int8_t> b) {
   // Reinterpret each lane as unsigned first so that no sign extension occurs.
   const uint32_t lo = static_cast<uint8_t>(a.raw);
   const uint32_t hi = static_cast<uint8_t>(b.raw);
   return Vec1<int16_t>(static_cast<int16_t>((hi << 8) | lo));
 }
 HWY_INLINE Vec1<int32_t> ZipLower(const Vec1<int16_t> a,
                                   const Vec1<int16_t> b) {
   const uint32_t lo = static_cast<uint16_t>(a.raw);
   const uint32_t hi = static_cast<uint16_t>(b.raw);
   return Vec1<int32_t>(static_cast<int32_t>((hi << 16) | lo));
 }
 HWY_INLINE Vec1<int64_t> ZipLower(const Vec1<int32_t> a,
                                   const Vec1<int32_t> b) {
   const uint64_t lo = static_cast<uint32_t>(a.raw);
   const uint64_t hi = static_cast<uint32_t>(b.raw);
   return Vec1<int64_t>(static_cast<int64_t>((hi << 32) | lo));
 }

 // ------------------------------ Mask

 // True when no bit of the mask is set.
 template <typename T>
 HWY_INLINE bool AllFalse(const Mask1<T> mask) {
   const bool none_set = (mask.bits == 0);
   return none_set;
 }

 // True when the mask has any bit set.
 template <typename T>
 HWY_INLINE bool AllTrue(const Mask1<T> mask) {
   const bool any_set = (mask.bits != 0);
   return any_set;
 }

 // Returns the lowest bit of the mask (0 or 1).
 template <typename T>
 HWY_INLINE uint64_t BitsFromMask(const Mask1<T> mask) {
   const uint64_t lowest_bit = mask.bits & 1;
   return lowest_bit;
 }

 // Number of true lanes: 1 when the mask has any bit set, otherwise 0.
 template <typename T>
 HWY_INLINE size_t CountTrue(const Mask1<T> mask) {
   if (mask.bits == 0) return 0;
   return 1;
 }

 // ------------------------------ Reductions

 // Sum of all lanes, i.e. the only one: returns the input unchanged.
 template <typename T>
 HWY_INLINE Vec1<T> SumOfLanes(const Vec1<T> v0) {
   return v0;
 }
 // Minimum of all lanes; with a single lane this returns the input unchanged.
 template <typename T>
 HWY_INLINE Vec1<T> MinOfLanes(const Vec1<T> v) {
   return v;
 }
 // Maximum of all lanes; with a single lane this returns the input unchanged.
 template <typename T>
 HWY_INLINE Vec1<T> MaxOfLanes(const Vec1<T> v) {
   return v;
 }

 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace hwy
 HWY_AFTER_NAMESPACE();