Add Reverse
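Reverse(d, v) returns the lanes of v in reversed order (out[i] ==
v[Lanes(d) - 1 - i]) for {u,i,f}32 vectors, with implementations for the
scalar, Arm NEON, Arm SVE, RISC-V V, WASM and x86 (128/256/512-bit) targets,
plus quick_reference documentation and a swizzle_test case.

Illustrative usage (a sketch only, not part of this change; in and out are
hypothetical aligned float arrays with at least Lanes(d) elements):

  const HWY_FULL(float) d;
  const auto v = Load(d, in);
  Store(Reverse(d, v), d, out);  // out[i] == in[Lanes(d) - 1 - i]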
PiperOrigin-RevId: 395942081
diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md
index 0e087fd..81913ef 100644
--- a/g3doc/quick_reference.md
+++ b/g3doc/quick_reference.md
@@ -891,6 +891,10 @@
<code>VI **SetTableIndices**(D, int32_t* idx)</code> prepares for
`TableLookupLanes` with lane indices `idx = [0, N)` (need not be unique).
+* `V`: `{u,i,f}{32}` \
+ <code>V **Reverse**(D, V a)</code> returns a vector with lanes in reversed
+ order (`out[i] == a[Lanes(D()) - 1 - i]`).
+
### Reductions
**Note**: these 'reduce' all lanes to a single result (e.g. sum), which is
diff --git a/hwy/ops/arm_neon-inl.h b/hwy/ops/arm_neon-inl.h
index 37c0693..69811d7 100644
--- a/hwy/ops/arm_neon-inl.h
+++ b/hwy/ops/arm_neon-inl.h
@@ -3356,6 +3356,23 @@
return BitCast(Simd<float, N>(), TableLookupBytes(BitCast(di, v), idx_i));
}
+// ------------------------------ Reverse (Shuffle0123, Shuffle2301)
+
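+// Only 32-bit lanes are supported, so Shuffle0123 reverses a full vector.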
+template <typename T>
+HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
+ return Shuffle0123(v);
+}
+
+template <typename T>
+HWY_API Vec128<T, 2> Reverse(Simd<T, 2> /* tag */, const Vec128<T, 2> v) {
+ return Vec128<T, 2>(Shuffle2301(v));
+}
+
+template <typename T>
+HWY_API Vec128<T, 1> Reverse(Simd<T, 1> /* tag */, const Vec128<T, 1> v) {
+ return v;
+}
+
// ------------------------------ Other shuffles (TableLookupBytes)
// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h
index cf5ae46..73a277b 100644
--- a/hwy/ops/arm_sve-inl.h
+++ b/hwy/ops/arm_sve-inl.h
@@ -191,6 +191,20 @@
namespace detail {
+// Returns actual lanes of a hardware vector without rounding to a power of two.
+HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<1> /* tag */) {
+ return svcntb_pat(SV_ALL);
+}
+HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<2> /* tag */) {
+ return svcnth_pat(SV_ALL);
+}
+HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<4> /* tag */) {
+ return svcntw_pat(SV_ALL);
+}
+HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<8> /* tag */) {
+ return svcntd_pat(SV_ALL);
+}
+
// Returns actual lanes of a hardware vector, rounded down to a power of two.
HWY_INLINE size_t HardwareLanes(hwy::SizeTag<1> /* tag */) {
return svcntb_pat(SV_POW2);
@@ -198,12 +212,12 @@
HWY_INLINE size_t HardwareLanes(hwy::SizeTag<2> /* tag */) {
return svcnth_pat(SV_POW2);
}
-HWY_INLINE size_t HardwareLanes(hwy::SizeTag<8> /* tag */) {
- return svcntd_pat(SV_POW2);
-}
HWY_INLINE size_t HardwareLanes(hwy::SizeTag<4> /* tag */) {
return svcntw_pat(SV_POW2);
}
+HWY_INLINE size_t HardwareLanes(hwy::SizeTag<8> /* tag */) {
+ return svcntd_pat(SV_POW2);
+}
} // namespace detail
@@ -1370,6 +1384,27 @@
HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl)
#undef HWY_SVE_TABLE
+// ------------------------------ Reverse
+
+#if 0 // reminder: update this macro if VL can be assumed to be a power of two
+#error "Update macro"
+#endif
+#define HWY_SVE_REVERSE(BASE, CHAR, BITS, NAME, OP) \
+ template <size_t N> \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ NAME(Simd<HWY_SVE_T(BASE, BITS), N> d, HWY_SVE_V(BASE, BITS) v) { \
+ const auto reversed = sv##OP##_##CHAR##BITS(v); \
+ /* Shift right to remove extra (non-pow2 and remainder) lanes. */ \
+ const size_t all_lanes = \
+ detail::AllHardwareLanes(hwy::SizeTag<BITS / 8>()); \
+ /* TODO(janwas): on SVE2, use whilege. */ \
+ const svbool_t mask = Not(FirstN(d, all_lanes - Lanes(d))); \
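+ /* Splice moves the masked (uppermost Lanes(d)) lanes down to lane 0. */ \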
+ return detail::Splice(reversed, reversed, mask); \
+ }
+
+HWY_SVE_FOREACH(HWY_SVE_REVERSE, Reverse, rev)
+#undef HWY_SVE_REVERSE
+
// ------------------------------ Compress (PromoteTo)
#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, NAME, OP) \
diff --git a/hwy/ops/rvv-inl.h b/hwy/ops/rvv-inl.h
index cfd1c8c..c76719f 100644
--- a/hwy/ops/rvv-inl.h
+++ b/hwy/ops/rvv-inl.h
@@ -1510,6 +1510,16 @@
HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather)
#undef HWY_RVV_TABLE
+// ------------------------------ Reverse
+template <class D>
+HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
+ const RebindToUnsigned<D> du;
+ using TU = TFromD<decltype(du)>;
+ const size_t N = Lanes(du);
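+ // idx[i] = N - 1 - i, so lane i of the result is v[N - 1 - i].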
+ const auto idx = Sub(Set(du, static_cast<TU>(N - 1)), detail::Iota0(du));
+ return TableLookupLanes(v, idx);
+}
+
// ------------------------------ Compress
#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, LMUL, X2, HALF, SHIFT, MLEN, NAME, \
diff --git a/hwy/ops/scalar-inl.h b/hwy/ops/scalar-inl.h
index 0c18a2b..522583f 100644
--- a/hwy/ops/scalar-inl.h
+++ b/hwy/ops/scalar-inl.h
@@ -1094,6 +1094,13 @@
return v;
}
+// ------------------------------ Reverse
+
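+// A single lane is trivially its own reverse.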
+template <typename T>
+HWY_API Vec1<T> Reverse(Sisd<T> /* tag */, const Vec1<T> v) {
+ return v;
+}
+
// ================================================== BLOCKWISE
// Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.
diff --git a/hwy/ops/wasm_128-inl.h b/hwy/ops/wasm_128-inl.h
index 44893d2..24e6363 100644
--- a/hwy/ops/wasm_128-inl.h
+++ b/hwy/ops/wasm_128-inl.h
@@ -2035,6 +2035,23 @@
TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
}
+// ------------------------------ Reverse (Shuffle0123, Shuffle2301)
+
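+// Only 32-bit lanes are supported, so Shuffle0123 reverses a full vector.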
+template <typename T>
+HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
+ return Shuffle0123(v);
+}
+
+template <typename T>
+HWY_API Vec128<T, 2> Reverse(Simd<T, 2> /* tag */, const Vec128<T, 2> v) {
+ return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
+}
+
+template <typename T>
+HWY_API Vec128<T, 1> Reverse(Simd<T, 1> /* tag */, const Vec128<T, 1> v) {
+ return v;
+}
+
// ------------------------------ InterleaveLower
template <size_t N>
diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h
index 1ff5bea..29efffb 100644
--- a/hwy/ops/x86_128-inl.h
+++ b/hwy/ops/x86_128-inl.h
@@ -3214,6 +3214,23 @@
TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
}
+// ------------------------------ Reverse (Shuffle0123, Shuffle2301)
+
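+// Only 32-bit lanes are supported, so Shuffle0123 reverses a full vector.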
+template <typename T>
+HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
+ return Shuffle0123(v);
+}
+
+template <typename T>
+HWY_API Vec128<T, 2> Reverse(Simd<T, 2> /* tag */, const Vec128<T, 2> v) {
+ return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
+}
+
+template <typename T>
+HWY_API Vec128<T, 1> Reverse(Simd<T, 1> /* tag */, const Vec128<T, 1> v) {
+ return v;
+}
+
// ------------------------------ InterleaveLower
// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h
index a433f9d..e2403b5 100644
--- a/hwy/ops/x86_256-inl.h
+++ b/hwy/ops/x86_256-inl.h
@@ -2560,6 +2560,14 @@
return Vec256<float>{_mm256_permutevar8x32_ps(v.raw, idx.raw)};
}
+// ------------------------------ Reverse
+
+template <typename T>
+HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
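+ // kReverse[i] == 7 - i, so lane i receives v[7 - i].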
+ alignas(32) constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
+ return TableLookupLanes(v, SetTableIndices(d, kReverse));
+}
+
// ------------------------------ InterleaveLower
// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h
index f1272f4..7f4e581 100644
--- a/hwy/ops/x86_512-inl.h
+++ b/hwy/ops/x86_512-inl.h
@@ -2326,6 +2326,15 @@
return Vec512<float>{_mm512_permutexvar_ps(idx.raw, v.raw)};
}
+// ------------------------------ Reverse
+
+template <typename T>
+HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
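+ // kReverse[i] == 15 - i, so lane i receives v[15 - i].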
+ alignas(64) constexpr int32_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0};
+ return TableLookupLanes(v, SetTableIndices(d, kReverse));
+}
+
// ------------------------------ InterleaveLower
// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
diff --git a/hwy/tests/swizzle_test.cc b/hwy/tests/swizzle_test.cc
index ef024c4..4939870 100644
--- a/hwy/tests/swizzle_test.cc
+++ b/hwy/tests/swizzle_test.cc
@@ -131,6 +131,26 @@
test(float());
}
+struct TestReverse {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ const auto v = Iota(d, 1);
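+ // v = [1, 2, .., N], so the reversed result should be [N, .., 2, 1].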
+ auto expected = AllocateAligned<T>(N);
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = static_cast<T>(N - i);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse(d, v));
+ }
+};
+
+HWY_NOINLINE void TestAllReverse() {
+ const ForPartialVectors<TestReverse> test;
+ test(uint32_t());
+ test(int32_t());
+ test(float());
+}
+
class TestCompress {
template <typename T, typename TI, size_t N>
void CheckStored(Simd<T, N> d, Simd<TI, N> di, size_t expected_pos,
@@ -396,6 +416,7 @@
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllGetLane);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllCompress);
} // namespace hwy