Add Reverse

PiperOrigin-RevId: 395942081
diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md
index 0e087fd..81913ef 100644
--- a/g3doc/quick_reference.md
+++ b/g3doc/quick_reference.md
@@ -891,6 +891,10 @@
     <code>VI **SetTableIndices**(D, int32_t* idx)</code> prepares for
     `TableLookupLanes` with lane indices `idx = [0, N)` (need not be unique).
 
+*   `V`: `{u,i,f}{32}` \
+    <code>V **Reverse**(D, V a)</code> returns a vector with lanes in reversed
+    order (`out[i] == a[Lanes(D()) - 1 - i]`).
+
 ### Reductions
 
 **Note**: these 'reduce' all lanes to a single result (e.g. sum), which is
diff --git a/hwy/ops/arm_neon-inl.h b/hwy/ops/arm_neon-inl.h
index 37c0693..69811d7 100644
--- a/hwy/ops/arm_neon-inl.h
+++ b/hwy/ops/arm_neon-inl.h
@@ -3356,6 +3356,23 @@
   return BitCast(Simd<float, N>(), TableLookupBytes(BitCast(di, v), idx_i));
 }
 
+// ------------------------------ Reverse (Shuffle0123, Shuffle2301)
+
+template <typename T>
+HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
+  return Shuffle0123(v);
+}
+
+template <typename T>
+HWY_API Vec128<T, 2> Reverse(Simd<T, 2> /* tag */, const Vec128<T, 2> v) {
+  return Vec128<T, 2>(Shuffle2301(v));
+}
+
+template <typename T>
+HWY_API Vec128<T, 1> Reverse(Simd<T, 1> /* tag */, const Vec128<T, 1> v) {
+  return v;
+}
+
 // ------------------------------ Other shuffles (TableLookupBytes)
 
 // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h
index cf5ae46..73a277b 100644
--- a/hwy/ops/arm_sve-inl.h
+++ b/hwy/ops/arm_sve-inl.h
@@ -191,6 +191,20 @@
 
 namespace detail {
 
+// Returns actual lanes of a hardware vector without rounding to a power of two.
+HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<1> /* tag */) {
+  return svcntb_pat(SV_ALL);
+}
+HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<2> /* tag */) {
+  return svcnth_pat(SV_ALL);
+}
+HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<4> /* tag */) {
+  return svcntw_pat(SV_ALL);
+}
+HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<8> /* tag */) {
+  return svcntd_pat(SV_ALL);
+}
+
 // Returns actual lanes of a hardware vector, rounded down to a power of two.
 HWY_INLINE size_t HardwareLanes(hwy::SizeTag<1> /* tag */) {
   return svcntb_pat(SV_POW2);
@@ -198,12 +212,12 @@
 HWY_INLINE size_t HardwareLanes(hwy::SizeTag<2> /* tag */) {
   return svcnth_pat(SV_POW2);
 }
-HWY_INLINE size_t HardwareLanes(hwy::SizeTag<8> /* tag */) {
-  return svcntd_pat(SV_POW2);
-}
 HWY_INLINE size_t HardwareLanes(hwy::SizeTag<4> /* tag */) {
   return svcntw_pat(SV_POW2);
 }
+HWY_INLINE size_t HardwareLanes(hwy::SizeTag<8> /* tag */) {
+  return svcntd_pat(SV_POW2);
+}
 
 }  // namespace detail
 
@@ -1370,6 +1384,27 @@
 HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl)
 #undef HWY_SVE_TABLE
 
+// ------------------------------ Reverse
+
+#if 0  // if we could assume VL is a power of two
+#error "Update macro"
+#endif
+#define HWY_SVE_REVERSE(BASE, CHAR, BITS, NAME, OP)                     \
+  template <size_t N>                                                   \
+  HWY_API HWY_SVE_V(BASE, BITS)                                         \
+      NAME(Simd<HWY_SVE_T(BASE, BITS), N> d, HWY_SVE_V(BASE, BITS) v) { \
+    const auto reversed = sv##OP##_##CHAR##BITS(v);                     \
+    /* Shift right to remove extra (non-pow2 and remainder) lanes. */   \
+    const size_t all_lanes =                                            \
+        detail::AllHardwareLanes(hwy::SizeTag<BITS / 8>());             \
+    /* TODO(janwas): on SVE2, use whilege. */                           \
+    const svbool_t mask = Not(FirstN(d, all_lanes - Lanes(d)));         \
+    return detail::Splice(reversed, reversed, mask);                    \
+  }
+
+HWY_SVE_FOREACH(HWY_SVE_REVERSE, Reverse, rev)
+#undef HWY_SVE_REVERSE
+
 // ------------------------------ Compress (PromoteTo)
 
 #define HWY_SVE_COMPRESS(BASE, CHAR, BITS, NAME, OP)                           \
diff --git a/hwy/ops/rvv-inl.h b/hwy/ops/rvv-inl.h
index cfd1c8c..c76719f 100644
--- a/hwy/ops/rvv-inl.h
+++ b/hwy/ops/rvv-inl.h
@@ -1510,6 +1510,16 @@
 HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather)
 #undef HWY_RVV_TABLE
 
+// ------------------------------ Reverse
+template <class D>
+HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
+  const RebindToUnsigned<D> du;
+  using TU = TFromD<decltype(du)>;
+  const size_t N = Lanes(du);
+  const auto idx = Sub(Set(du, static_cast<TU>(N - 1)), detail::Iota0(du));
+  return TableLookupLanes(v, idx);
+}
+
 // ------------------------------ Compress
 
 #define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME,  \
diff --git a/hwy/ops/scalar-inl.h b/hwy/ops/scalar-inl.h
index 0c18a2b..522583f 100644
--- a/hwy/ops/scalar-inl.h
+++ b/hwy/ops/scalar-inl.h
@@ -1094,6 +1094,13 @@
   return v;
 }
 
+// ------------------------------ Reverse
+
+template <typename T>
+HWY_API Vec1<T> Reverse(Sisd<T> /* tag */, const Vec1<T> v) {
+  return v;
+}
+
 // ================================================== BLOCKWISE
 // Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.
 
diff --git a/hwy/ops/wasm_128-inl.h b/hwy/ops/wasm_128-inl.h
index 44893d2..24e6363 100644
--- a/hwy/ops/wasm_128-inl.h
+++ b/hwy/ops/wasm_128-inl.h
@@ -2035,6 +2035,23 @@
                  TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
 }
 
+// ------------------------------ Reverse (Shuffle0123, Shuffle2301)
+
+template <typename T>
+HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
+  return Shuffle0123(v);
+}
+
+template <typename T>
+HWY_API Vec128<T, 2> Reverse(Simd<T, 2> /* tag */, const Vec128<T, 2> v) {
+  return Vec128<T, 2>(Shuffle2301(Vec128<T>(v.raw)).raw);
+}
+
+template <typename T>
+HWY_API Vec128<T, 1> Reverse(Simd<T, 1> /* tag */, const Vec128<T, 1> v) {
+  return v;
+}
+
 // ------------------------------ InterleaveLower
 
 template <size_t N>
diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h
index 1ff5bea..29efffb 100644
--- a/hwy/ops/x86_128-inl.h
+++ b/hwy/ops/x86_128-inl.h
@@ -3214,6 +3214,23 @@
                  TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
 }
 
+// ------------------------------ Reverse (Shuffle0123, Shuffle2301)
+
+template <typename T>
+HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
+  return Shuffle0123(v);
+}
+
+template <typename T>
+HWY_API Vec128<T, 2> Reverse(Simd<T, 2> /* tag */, const Vec128<T, 2> v) {
+  return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
+}
+
+template <typename T>
+HWY_API Vec128<T, 1> Reverse(Simd<T, 1> /* tag */, const Vec128<T, 1> v) {
+  return v;
+}
+
 // ------------------------------ InterleaveLower
 
 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h
index a433f9d..e2403b5 100644
--- a/hwy/ops/x86_256-inl.h
+++ b/hwy/ops/x86_256-inl.h
@@ -2560,6 +2560,14 @@
   return Vec256<float>{_mm256_permutevar8x32_ps(v.raw, idx.raw)};
 }
 
+// ------------------------------ Reverse
+
+template <typename T>
+HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
+  alignas(32) constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
+  return TableLookupLanes(v, SetTableIndices(d, kReverse));
+}
+
 // ------------------------------ InterleaveLower
 
 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h
index f1272f4..7f4e581 100644
--- a/hwy/ops/x86_512-inl.h
+++ b/hwy/ops/x86_512-inl.h
@@ -2326,6 +2326,15 @@
   return Vec512<float>{_mm512_permutexvar_ps(idx.raw, v.raw)};
 }
 
+// ------------------------------ Reverse
+
+template <typename T>
+HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
+  alignas(32) constexpr int32_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
+                                                7,  6,  5,  4,  3,  2,  1, 0};
+  return TableLookupLanes(v, SetTableIndices(d, kReverse));
+}
+
 // ------------------------------ InterleaveLower
 
 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
diff --git a/hwy/tests/swizzle_test.cc b/hwy/tests/swizzle_test.cc
index ef024c4..4939870 100644
--- a/hwy/tests/swizzle_test.cc
+++ b/hwy/tests/swizzle_test.cc
@@ -131,6 +131,26 @@
   test(float());
 }
 
+struct TestReverse {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const auto v = Iota(d, 1);
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>(N - i);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse(d, v));
+  }
+};
+
+HWY_NOINLINE void TestAllReverse() {
+  const ForPartialVectors<TestReverse> test;
+  test(uint32_t());
+  test(int32_t());
+  test(float());
+}
+
 class TestCompress {
   template <typename T, typename TI, size_t N>
   void CheckStored(Simd<T, N> d, Simd<TI, N> di, size_t expected_pos,
@@ -396,6 +416,7 @@
 HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllGetLane);
 HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
 HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse);
 HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllCompress);
 }  // namespace hwy