Add integer Neg(ate) instruction

PiperOrigin-RevId: 348761205
diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md
index 944b154..32d9845 100644
--- a/g3doc/quick_reference.md
+++ b/g3doc/quick_reference.md
@@ -151,18 +151,19 @@
 *   <code>V **operator-**(V a, V b)</code>: returns `a[i] - b[i]` (mod 2^bits).
 
 *   `V`: `ui8/16` \
-    <code>V **SaturatedAdd**(V a, V b)</code> returns `a[i] + b[i]` saturated
-    to the minimum/maximum representable value.
+    <code>V **SaturatedAdd**(V a, V b)</code> returns `a[i] + b[i]` saturated to
+    the minimum/maximum representable value.
+
 *   `V`: `ui8/16` \
-    <code>V **SaturatedSub**(V a, V b)</code> returns `a[i] - b[i]` saturated
-    to the minimum/maximum representable value.
+    <code>V **SaturatedSub**(V a, V b)</code> returns `a[i] - b[i]` saturated to
+    the minimum/maximum representable value.
 
 *   `V`: `u8/16` \
     <code>V **AverageRound**(V a, V b)</code> returns `(a[i] + b[i] + 1) / 2`.
 
 *   `V`: `i8/16/32`, `f` \
-    <code>V **Abs**(V a)</code> returns the absolute value of `a[i]`;
-    for integers, `LimitsMin()` maps to `LimitsMax() + 1`.
+    <code>V **Abs**(V a)</code> returns the absolute value of `a[i]`; for
+    integers, `LimitsMin()` maps to `LimitsMax() + 1`.
 
 *   `V`: `ui8/16/32`, `f` \
     <code>V **Min**(V a, V b)</code>: returns `min(a[i], b[i])`.
@@ -190,8 +191,10 @@
     `1.0 / a[i]`.
 
 *   `V`: `f32` \
-    <code>V **AbsDiff**(V a, V b)</code>: returns `|a[i] - b[i]|` in each
-    lane.
+    <code>V **AbsDiff**(V a, V b)</code>: returns `|a[i] - b[i]|` in each lane.
+
+*   `V`: `if` \
+    <code>V **Neg**(V a)</code>: returns `-a[i]`.
 
 #### Multiply
 
@@ -308,9 +311,6 @@
 Special functions for floating-point types:
 
 *   `V`: `f` \
-    <code>V **Neg**(V a)</code>: returns the `-a[i]`.
-
-*   `V`: `f` \
     <code>V **CopySign**(V a, V b)</code>: returns the number with the magnitude
     of `a` and sign of `b`.
 
diff --git a/hwy/ops/arm_neon-inl.h b/hwy/ops/arm_neon-inl.h
index bbf2b50..71e3430 100644
--- a/hwy/ops/arm_neon-inl.h
+++ b/hwy/ops/arm_neon-inl.h
@@ -950,6 +950,23 @@
 // ------------------------------ Floating-point negate
 
 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1)
+HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1)
+
+HWY_INLINE Vec128<int64_t, 1> Neg(const Vec128<int64_t, 1> v) {
+#if defined(__aarch64__)
+  return Vec128<int64_t, 1>(vneg_s64(v.raw));
+#else
+  return Zero(Simd<int64_t, 1>()) - v;
+#endif
+}
+
+HWY_INLINE Vec128<int64_t> Neg(const Vec128<int64_t> v) {
+#if defined(__aarch64__)
+  return Vec128<int64_t>(vnegq_s64(v.raw));
+#else
+  return Zero(Full128<int64_t>()) - v;
+#endif
+}
 
 // ------------------------------ Floating-point mul / div
 
diff --git a/hwy/ops/scalar-inl.h b/hwy/ops/scalar-inl.h
index c0f00fd..72d687f 100644
--- a/hwy/ops/scalar-inl.h
+++ b/hwy/ops/scalar-inl.h
@@ -385,11 +385,16 @@
 
 // ------------------------------ Floating-point negate
 
-template<typename T>
+template <typename T, HWY_IF_FLOAT(T)>
 HWY_INLINE Vec1<T> Neg(const Vec1<T> v) {
   return Xor(v, SignBit(Sisd<T>()));
 }
 
+template <typename T, HWY_IF_NOT_FLOAT(T)>
+HWY_INLINE Vec1<T> Neg(const Vec1<T> v) {
+  return Zero(Sisd<T>()) - v;
+}
+
 // ------------------------------ mul/div
 
 template <typename T>
diff --git a/hwy/ops/shared-inl.h b/hwy/ops/shared-inl.h
index 71357f6..cd0a8ac 100644
--- a/hwy/ops/shared-inl.h
+++ b/hwy/ops/shared-inl.h
@@ -58,6 +58,8 @@
 #define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr
 
 #define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
+// IsSigned<float>() is true, so cannot use that to differentiate int/float.
+#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
 
 // Empty struct used as a size tag type.
 template <size_t N>
diff --git a/hwy/ops/wasm_128-inl.h b/hwy/ops/wasm_128-inl.h
index 6563dfb..01af085 100644
--- a/hwy/ops/wasm_128-inl.h
+++ b/hwy/ops/wasm_128-inl.h
@@ -618,13 +618,30 @@
   return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
 }
 
-// ------------------------------ Floating-point negate
+// ------------------------------ Negate
 
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
   return Xor(v, SignBit(Simd<T, N>()));
 }
 
+template <size_t N>
+HWY_API Vec128<int8_t, N> Neg(const Vec128<int8_t, N> v) {
+  return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int16_t, N> Neg(const Vec128<int16_t, N> v) {
+  return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int32_t, N> Neg(const Vec128<int32_t, N> v) {
+  return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)};
+}
+template <size_t N>
+HWY_API Vec128<int64_t, N> Neg(const Vec128<int64_t, N> v) {
+  return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)};
+}
+
 // ------------------------------ Floating-point mul / div
 
 template <size_t N>
diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h
index e1a495e..17c3835 100644
--- a/hwy/ops/x86_128-inl.h
+++ b/hwy/ops/x86_128-inl.h
@@ -774,13 +774,18 @@
   return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
 }
 
-// ------------------------------ Floating-point negate
+// ------------------------------ Negate
 
-template <typename T, size_t N>
+template <typename T, size_t N, HWY_IF_FLOAT(T)>
 HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
   return Xor(v, SignBit(Simd<T, N>()));
 }
 
+template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
+HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
+  return Zero(Simd<T, N>()) - v;
+}
+
 // ------------------------------ Floating-point mul / div
 
 template <size_t N>
diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h
index 10c2d59..bf422ba 100644
--- a/hwy/ops/x86_256-inl.h
+++ b/hwy/ops/x86_256-inl.h
@@ -765,13 +765,18 @@
   return Vec256<uint64_t>{_mm256_mul_epu32(a.raw, b.raw)};
 }
 
-// ------------------------------ Floating-point negate
+// ------------------------------ Negate
 
-template <typename T>
+template <typename T, HWY_IF_FLOAT(T)>
 HWY_API Vec256<T> Neg(const Vec256<T> v) {
   return Xor(v, SignBit(Full256<T>()));
 }
 
+template <typename T, HWY_IF_NOT_FLOAT(T)>
+HWY_API Vec256<T> Neg(const Vec256<T> v) {
+  return Zero(Full256<T>()) - v;
+}
+
 // ------------------------------ Floating-point mul / div
 
 HWY_API Vec256<float> operator*(const Vec256<float> a, const Vec256<float> b) {
diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h
index f5ccb23..099907d 100644
--- a/hwy/ops/x86_512-inl.h
+++ b/hwy/ops/x86_512-inl.h
@@ -851,13 +851,18 @@
   return Vec512<uint64_t>{_mm512_mul_epu32(a.raw, b.raw)};
 }
 
-// ------------------------------ Floating-point negate
+// ------------------------------ Negate
 
-template <typename T>
+template <typename T, HWY_IF_FLOAT(T)>
 HWY_API Vec512<T> Neg(const Vec512<T> v) {
   return Xor(v, SignBit(Full512<T>()));
 }
 
+template <typename T, HWY_IF_NOT_FLOAT(T)>
+HWY_API Vec512<T> Neg(const Vec512<T> v) {
+  return Zero(Full512<T>()) - v;
+}
+
 // ------------------------------ Floating-point mul / div
 
 HWY_API Vec512<float> operator*(const Vec512<float> a, const Vec512<float> b) {
diff --git a/hwy/tests/arithmetic_test.cc b/hwy/tests/arithmetic_test.cc
index 0847df9..a41e978 100644
--- a/hwy/tests/arithmetic_test.cc
+++ b/hwy/tests/arithmetic_test.cc
@@ -1050,6 +1050,23 @@
   ForPartialVectors<TestAbsDiff>()(float());
 }
 
+struct TestNeg {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vn = Set(d, T(-3));
+    const auto vp = Set(d, T(3));
+    HWY_ASSERT_VEC_EQ(d, v0, Neg(v0));
+    HWY_ASSERT_VEC_EQ(d, vp, Neg(vn));
+    HWY_ASSERT_VEC_EQ(d, vn, Neg(vp));
+  }
+};
+
+HWY_NOINLINE void TestAllNeg() {
+  ForSignedTypes(ForPartialVectors<TestNeg>());
+  ForFloatTypes(ForPartialVectors<TestNeg>());
+}
+
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace hwy
@@ -1079,6 +1096,7 @@
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMaxOfLanes);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRound);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
 
 }  // namespace hwy
 #endif