Merge pull request #1954 from DiamonDinoia:master

PiperOrigin-RevId: 612729932
diff --git a/.bazelrc b/.bazelrc
new file mode 100644
index 0000000..3ce91d2
--- /dev/null
+++ b/.bazelrc
@@ -0,0 +1 @@
+common --enable_bzlmod
diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml
index b3c728b..5c6609f 100644
--- a/.github/workflows/build_test.yml
+++ b/.github/workflows/build_test.yml
@@ -26,54 +26,108 @@
             extra_deps: clang-6.0
             c_compiler: clang-6.0
             cxx_compiler: clang++-6.0
+            cxx_standard: 11
+
+          - name: Clang-6.0 (C++14)
+            extra_deps: clang-6.0
+            c_compiler: clang-6.0
+            cxx_compiler: clang++-6.0
+            cxx_standard: 14
+
+          - name: Clang-6.0 (C++17)
+            extra_deps: clang-6.0
+            c_compiler: clang-6.0
+            cxx_compiler: clang++-6.0
+            cxx_standard: 17
 
           - name: Clang-7
             extra_deps: clang-7
             c_compiler: clang-7
             cxx_compiler: clang++-7
+            cxx_standard: 11
+
+          - name: Clang-7 (C++14)
+            extra_deps: clang-7
+            c_compiler: clang-7
+            cxx_compiler: clang++-7
+            cxx_standard: 14
+
+          - name: Clang-7 (C++17)
+            extra_deps: clang-7
+            c_compiler: clang-7
+            cxx_compiler: clang++-7
+            cxx_standard: 17
 
           - name: Clang-8
             extra_deps: clang-8
             c_compiler: clang-8
             cxx_compiler: clang++-8
+            cxx_standard: 11
 
           - name: Clang-9
             extra_deps: clang-9
             c_compiler: clang-9
             cxx_compiler: clang++-9
+            cxx_standard: 11
 
           - name: Clang-10
             extra_deps: clang-10
             c_compiler: clang-10
             cxx_compiler: clang++-10
+            cxx_standard: 11
+
+          - name: Clang-10 (C++20)
+            extra_deps: clang-10
+            c_compiler: clang-10
+            cxx_compiler: clang++-10
+            cxx_standard: 20
 
           - name: Clang-11
             extra_deps: clang-11
             c_compiler: clang-11
             cxx_compiler: clang++-11
+            cxx_standard: 11
 
           - name: Clang-12
             extra_deps: clang-12
             c_compiler: clang-12
             cxx_compiler: clang++-12
+            cxx_standard: 11
 
           - name: GCC-8
             extra_deps: g++-8
             c_compiler: gcc-8
             cxx_compiler: g++-8
             cxx_flags: -ftrapv
+            cxx_standard: 11
+
+          - name: GCC-8 (C++14)
+            extra_deps: g++-8
+            c_compiler: gcc-8
+            cxx_compiler: g++-8
+            cxx_flags: -ftrapv
+            cxx_standard: 14
+
+          - name: GCC-8 (C++17)
+            extra_deps: g++-8
+            c_compiler: gcc-8
+            cxx_compiler: g++-8
+            cxx_flags: -ftrapv
+            cxx_standard: 17
 
           - name: GCC-9
             extra_deps: g++-9
             c_compiler: gcc-9
             cxx_compiler: g++-9
             cxx_flags: -ftrapv
+            cxx_standard: 11
 
           - name: GCC-10
             extra_deps: g++-10
             c_compiler: gcc-10
             cxx_compiler: g++-10
             cxx_flags: -ftrapv
+            cxx_standard: 11
 
     steps:
       - name: Harden Runner
@@ -90,7 +144,7 @@
         run: |
           export CMAKE_BUILD_PARALLEL_LEVEL=2
           export CTEST_PARALLEL_LEVEL=2
-          CXXFLAGS=${{ matrix.cxx_flags }} CC=${{ matrix.c_compiler }} CXX=${{ matrix.cxx_compiler }} cmake -DHWY_WARNINGS_ARE_ERRORS=ON -B out .
+          CXXFLAGS=${{ matrix.cxx_flags }} CC=${{ matrix.c_compiler }} CXX=${{ matrix.cxx_compiler }} cmake -DHWY_WARNINGS_ARE_ERRORS=ON -DCMAKE_CXX_STANDARD=${{ matrix.cxx_standard }} -B out .
           cmake --build out
           ctest --test-dir out
 
@@ -104,28 +158,53 @@
             extra_deps: clang-13
             c_compiler: clang-13
             cxx_compiler: clang++-13
+            cxx_standard: 11
 
           - name: Clang-14
             extra_deps: clang-14
             c_compiler: clang-14
             cxx_compiler: clang++-14
+            cxx_standard: 11
 
           - name: Clang-15
             extra_deps: clang-15
             c_compiler: clang-15
             cxx_compiler: clang++-15
+            cxx_standard: 11
+
+          - name: Clang-15 (C++20)
+            extra_deps: clang-15
+            c_compiler: clang-15
+            cxx_compiler: clang++-15
+            cxx_standard: 20
 
           - name: GCC-11
             extra_deps: g++-11
             c_compiler: gcc-11
             cxx_compiler: g++-11
             cxx_flags: -ftrapv
+            cxx_standard: 11
+
+          - name: GCC-11 (C++20)
+            extra_deps: g++-11
+            c_compiler: gcc-11
+            cxx_compiler: g++-11
+            cxx_flags: -ftrapv
+            cxx_standard: 20
 
           - name: GCC-12
             extra_deps: g++-12
             c_compiler: gcc-12
             cxx_compiler: g++-12
             cxx_flags: -ftrapv
+            cxx_standard: 11
+
+          - name: GCC-12 (C++20)
+            extra_deps: g++-12
+            c_compiler: gcc-12
+            cxx_compiler: g++-12
+            cxx_flags: -ftrapv
+            cxx_standard: 20
 
     steps:
       - name: Harden Runner
@@ -142,7 +221,7 @@
         run: |
           export CMAKE_BUILD_PARALLEL_LEVEL=2
           export CTEST_PARALLEL_LEVEL=2
-          CXXFLAGS=${{ matrix.cxx_flags }} CC=${{ matrix.c_compiler }} CXX=${{ matrix.cxx_compiler }} cmake -DHWY_WARNINGS_ARE_ERRORS=ON -B out .
+          CXXFLAGS=${{ matrix.cxx_flags }} CC=${{ matrix.c_compiler }} CXX=${{ matrix.cxx_compiler }} cmake -DHWY_WARNINGS_ARE_ERRORS=ON -DCMAKE_CXX_STANDARD=${{ matrix.cxx_standard }} -B out .
           cmake --build out
           ctest --test-dir out
 
@@ -158,7 +237,7 @@
 
       - uses: bazelbuild/setup-bazelisk@b39c379c82683a5f25d34f0d062761f62693e0b2 # v3.0.0
 
-      - uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0
+      - uses: actions/cache@ab5e6d0c87105b4c9c2047343972218f562e4319 # v4.0.1
         with:
           path: ~/.cache/bazel
           key: bazel-${{ runner.os }}
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index e9d05a7..8e49f65 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -38,7 +38,7 @@
 
     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@e8893c57a1f3a2b659b6b55564fdfdbbd2982911 # v3.24.0
+      uses: github/codeql-action/init@8a470fddafa5cbb6266ee11b37ef4d8aae19c571 # v3.24.6
       with:
         languages: ${{ matrix.language }}
         # If you wish to specify custom queries, you can do so here or in a config file.
@@ -52,7 +52,7 @@
     # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).
     # If this step fails, then you should remove it and run the build manually (see below)
     - name: Autobuild
-      uses: github/codeql-action/autobuild@e8893c57a1f3a2b659b6b55564fdfdbbd2982911 # v3.24.0
+      uses: github/codeql-action/autobuild@8a470fddafa5cbb6266ee11b37ef4d8aae19c571 # v3.24.6
 
     # ℹī¸ Command-line programs to run using the OS shell.
     # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
@@ -65,6 +65,6 @@
     #     ./location_of_script_within_repo/buildscript.sh
 
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@e8893c57a1f3a2b659b6b55564fdfdbbd2982911 # v3.24.0
+      uses: github/codeql-action/analyze@8a470fddafa5cbb6266ee11b37ef4d8aae19c571 # v3.24.6
       with:
         category: "/language:${{matrix.language}}"
\ No newline at end of file
diff --git a/BUILD b/BUILD
index 73c7fb5..7ccb46f 100644
--- a/BUILD
+++ b/BUILD
@@ -210,6 +210,19 @@
 )
 
 cc_library(
+    name = "stats",
+    srcs = [
+        "hwy/stats.cc",
+    ],
+    hdrs = [
+        "hwy/stats.h",
+    ],
+    compatible_with = [],
+    copts = COPTS,
+    deps = [":hwy"],
+)
+
+cc_library(
     name = "nanobenchmark",
     srcs = [
         "hwy/nanobenchmark.cc",
@@ -302,6 +315,9 @@
     copts = COPTS,
     deps = [
         ":hwy",  # HWY_ASSERT
+        ":nanobenchmark",
+        ":profiler",
+        ":stats",
     ],
 )
 
@@ -461,6 +477,7 @@
     ("hwy/tests/", "mask_combine_test"),
     ("hwy/tests/", "mask_convert_test"),
     ("hwy/tests/", "mask_mem_test"),
+    ("hwy/tests/", "mask_slide_test"),
     ("hwy/tests/", "mask_test"),
     ("hwy/tests/", "masked_arithmetic_test"),
     ("hwy/tests/", "memory_test"),
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2df89be..817ccd6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,7 +33,7 @@
   cmake_policy(SET CMP0128 NEW)
 endif()
 
-project(hwy VERSION 1.0.7)  # Keep in sync with highway.h version
+project(hwy VERSION 1.1.0)  # Keep in sync with highway.h version
 # `hwy` is lowercase to handle find_package() in Config mode:
 set(namespace "${PROJECT_NAME}::")
 
@@ -263,6 +263,18 @@
     list(APPEND HWY_FLAGS -msse2 -mfpmath=sse)
   endif()
 
+  # Suppress -Wpsabi warnings about parameter-passing ABI changes (often
+  # triggered by STL types). Supported by GCC 4.4.7 and newer, which predates
+  # the C++11 we require.
+  if (${CMAKE_CXX_COMPILER_ID} MATCHES "GNU")
+    list(APPEND HWY_FLAGS -Wno-psabi)
+  endif()
+  # Clang supports this flag from 11.0.
+  if (${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 11.0)
+      list(APPEND HWY_FLAGS -Wno-psabi)
+    endif()
+  endif()
+
   if (HWY_CMAKE_ARM7)
     list(APPEND HWY_FLAGS
       -march=armv7-a
@@ -575,6 +587,7 @@
   hwy/tests/mask_combine_test.cc
   hwy/tests/mask_convert_test.cc
   hwy/tests/mask_mem_test.cc
+  hwy/tests/mask_slide_test.cc
   hwy/tests/mask_test.cc
   hwy/tests/masked_arithmetic_test.cc
   hwy/tests/memory_test.cc
diff --git a/LICENSE-BSD3 b/LICENSE-BSD3
index 7e13e8b..51d1bd4 100644
--- a/LICENSE-BSD3
+++ b/LICENSE-BSD3
@@ -1,25 +1,18 @@
-Copyright (c) the Highway Project Authors.
-All rights reserved.
+Copyright (c) The Highway Project Authors. All rights reserved.
 
-Redistribution and use in source and binary forms,
-with or without
-modification, are permitted provided that
-the following conditions are met:
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
 
-1. Redistributions of source code
-must retain the above copyright notice, this
-list of conditions and
-the following disclaimer.
+1.  Redistributions of source code must retain the above copyright notice, this
+    list of conditions and the following disclaimer.
 
-2. Redistributions in binary form
-must reproduce the above copyright notice,
-this list of conditions
-and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
+2.  Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
 
-3. Neither the name of the copyright holder nor the names of its
-   contributors may be used to endorse or promote products derived from
-   this software without specific prior written permission.
+3.  Neither the name of the copyright holder nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
diff --git a/MODULE.bazel b/MODULE.bazel
index 39a94ac..46dfa13 100644
--- a/MODULE.bazel
+++ b/MODULE.bazel
@@ -1,4 +1,4 @@
-module(name = "highway", version = "1.0.7")
+module(name = "highway", version = "1.1.0")
 
 bazel_dep(name = "bazel_skylib", version = "1.3.0")
 bazel_dep(name = "googletest", version = "1.12.1")
diff --git a/README.md b/README.md
index d38b1c7..42d36fe 100644
--- a/README.md
+++ b/README.md
@@ -101,10 +101,11 @@
 
 ### Targets
 
-Highway supports 20 targets, listed in alphabetical order of platform:
+Highway supports 22 targets, listed in alphabetical order of platform:
 
 -   Any: `EMU128`, `SCALAR`;
 -   Arm: `NEON` (Armv7+), `SVE`, `SVE2`, `SVE_256`, `SVE2_128`;
+-   IBM Z: `Z14`, `Z15`;
 -   POWER: `PPC8` (v2.07), `PPC9` (v3.0), `PPC10` (v3.1B, not yet supported
     due to compiler bugs, see #1207; also requires QEMU 7.2);
 -   RISC-V: `RVV` (1.0);
diff --git a/debian/changelog b/debian/changelog
index 104e940..05776ff 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,26 @@
+
+highway (1.1.0-1) UNRELEASED; urgency=medium
+
+  * Add BitCastScalar, DispatchedTarget, Foreach
+  * Add Div/Mod and MaskedDiv/ModOr, SaturatedAbs, SaturatedNeg
+  * Add InterleaveWholeLower/Upper, Dup128VecFromValues
+  * Add IsInteger, IsIntegerLaneType, RemoveVolatile, RemoveCvRef
+  * Add MaskedAdd/Sub/Mul/Div/Gather/Min/Max/SatAdd/SatSubOr
+  * Add MaskFalse, IfNegativeThenNegOrUndefIfZero, PromoteEven/OddTo
+  * Add ReduceMin/Max, 8-bit reductions, f16 <-> f64 conversions
+  * Add Span, AlignedArray, matrix-vector mul
+  * Add SumsOf2/4, I8 SumsOf8, SumsOfAdjQuadAbsDiff, SumsOfShuffledQuadAbsDiff
+  * Add ThreadPool, hierarchical profiler
+  * Build: use bazel_platforms
+  * Enable clang16 Arm/PPC runtime dispatch, F16 for GCC AVX3_SPR
+  * Extend Dot to f32*bf16, FMA to integer
+  * Fix: RVV 8-bit overflow, UB in vqsort, big-endian bugs, PPC HTM
+  * Improved codegen in various ops, fp16/bf16 tests and conversions
+  * New targets: HWY_Z14, HWY_Z15
+  * Test: add foreign_arch builders, CodeQL
+
+ -- Jan Wassenberg <janwas@google.com>  Sat, 17 Feb 2024 12:00:00 +0100
+
 highway (1.0.7-1) UNRELEASED; urgency=medium
 
   * Add LoadNOr, GatherIndexN, ScatterIndexN
diff --git a/g3doc/faq.md b/g3doc/faq.md
index 1584bdb..52f35e4 100644
--- a/g3doc/faq.md
+++ b/g3doc/faq.md
@@ -8,7 +8,7 @@
 
 A: Highway is available in numerous package managers, e.g. under the name
 libhwy-dev. After installing, you can add it to your CMake-based build via
-`find_package(HWY 1.0.7)` and `target_link_libraries(your_project PRIVATE hwy)`.
+`find_package(HWY 1.1.0)` and `target_link_libraries(your_project PRIVATE hwy)`.
 
 Alternatively, if using Git for version control, you can use Highway as a
 'submodule' by adding the following to .gitmodules:
diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md
index 73fe5f3..98ecd9b 100644
--- a/g3doc/quick_reference.md
+++ b/g3doc/quick_reference.md
@@ -842,9 +842,13 @@
 *   `V`: `{u,i}` \
     <code>V **ShiftRight**&lt;int&gt;(V a)</code> returns `a[i] >> int`.
 
-*   `V`: `{u}` \
-    <code>V **RotateRight**&lt;int&gt;(V a)</code> returns `(a[i] >> int) |
-    (a[i] << (sizeof(T)*8 - int))`.
+*   `V`: `{u,i}` \
+    <code>V **RotateLeft**&lt;int&gt;(V a)</code> returns `(a[i] << int) |
+    (static_cast<TU>(a[i]) >> (sizeof(T)*8 - int))`.
+
+*   `V`: `{u,i}` \
+    <code>V **RotateRight**&lt;int&gt;(V a)</code> returns
+    `(static_cast<TU>(a[i]) >> int) | (a[i] << (sizeof(T)*8 - int))`.
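
A minimal usage sketch of the rotate-by-constant ops above (illustration only,
not part of this patch; it assumes the usual Highway setup: code compiled per
target inside `HWY_NAMESPACE`, with the alias `namespace hn = hwy::HWY_NAMESPACE`):

    const hn::ScalableTag<uint32_t> d;
    const auto v = hn::Set(d, 0x12345678u);
    const auto rl = hn::RotateLeft<8>(v);   // every lane = 0x34567812
    const auto rr = hn::RotateRight<8>(v);  // every lane = 0x78123456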
 
 Shift all lanes by the same (not necessarily compile-time constant) amount:
 
@@ -854,6 +858,18 @@
 *   `V`: `{u,i}` \
     <code>V **ShiftRightSame**(V a, int bits)</code> returns `a[i] >> bits`.
 
+*   `V`: `{u,i}` \
+    <code>V **RotateLeftSame**(V a, int bits)</code> returns
+    `(a[i] << shl_bits) | (static_cast<TU>(a[i]) >>
+    (sizeof(T)*8 - shl_bits))`, where `shl_bits` is equal to
+    `bits & (sizeof(T)*8 - 1)`.
+
+*   `V`: `{u,i}` \
+    <code>V **RotateRightSame**(V a, int bits)</code> returns
+    `(static_cast<TU>(a[i]) >> shr_bits) | (a[i] <<
+    (sizeof(T)*8 - shr_bits))`, where `shr_bits` is equal to
+    `bits & (sizeof(T)*8 - 1)`.
+
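
A sketch of the `*Same` variants, which take the rotate amount as a runtime
value (same assumptions as the rotate sketch above):

    const hn::ScalableTag<uint32_t> d;
    const auto v = hn::Set(d, 1u);
    int bits = 5;  // runtime value; only (bits & 31) is used
    const auto up = hn::RotateLeftSame(v, bits);      // every lane = 32
    const auto back = hn::RotateRightSame(up, bits);  // every lane = 1 again
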
 Per-lane variable shifts (slow if SSSE3/SSE4, or 16-bit, or Shr i64 on AVX2):
 
 *   `V`: `{u,i}` \
@@ -864,6 +880,18 @@
     <code>V **operator>>**(V a, V b)</code> returns `a[i] >> b[i]`. Currently
     unavailable on SVE/RVV; use the equivalent `Shr` instead.
 
+*   `V`: `{u,i}` \
+    <code>V **Rol**(V a, V b)</code> returns
+    `(a[i] << (b[i] & shift_amt_mask)) |
+    (static_cast<TU>(a[i]) >> ((sizeof(T)*8 - b[i]) & shift_amt_mask))`,
+    where `shift_amt_mask` is equal to `sizeof(T)*8 - 1`.
+
+*   `V`: `{u,i}` \
+    <code>V **Ror**(V a, V b)</code> returns
+    `(static_cast<TU>(a[i]) >> (b[i] & shift_amt_mask)) |
+    (a[i] << ((sizeof(T)*8 - b[i]) & shift_amt_mask))`, where `shift_amt_mask` is
+    equal to `sizeof(T)*8 - 1`.
+
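
A sketch of the per-lane variable rotates (same assumptions as above):

    const hn::ScalableTag<uint32_t> d;
    const auto ones = hn::Set(d, 1u);
    const auto amounts = hn::Iota(d, 0);             // lane i rotates by i & 31
    const auto rolled = hn::Rol(ones, amounts);      // lane i = 1u << (i & 31)
    const auto restored = hn::Ror(rolled, amounts);  // every lane = 1 again
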
 #### Floating-point rounding
 
 *   `V`: `{f}` \
@@ -1114,6 +1142,34 @@
 
     CombineMasks is only available if `HWY_TARGET != HWY_SCALAR` is true.
 
+#### Slide mask across blocks
+
+*   <code>M **SlideMaskUpLanes**(D d, M m, size_t N)</code>:
+    Slides `m` up `N` lanes. `SlideMaskUpLanes(d, m, N)` is equivalent to
+    `MaskFromVec(SlideUpLanes(d, VecFromMask(d, m), N))`, but
+    `SlideMaskUpLanes(d, m, N)` is more efficient on some targets.
+
+    The result of SlideMaskUpLanes is implementation-defined if
+    `N >= Lanes(d)`.
+
+*   <code>M **SlideMaskDownLanes**(D d, M m, size_t N)</code>:
+    Slides `m` down `N` lanes. `SlideMaskDownLanes(d, m, N)` is equivalent to
+    `MaskFromVec(SlideDownLanes(d, VecFromMask(d, m), N))`, but
+    `SlideMaskDownLanes(d, m, N)` is more efficient on some targets.
+
+    The result of SlideMaskDownLanes is implementation-defined if
+    `N >= Lanes(d)`.
+
+*   <code>M **SlideMask1Up**(D d, M m)</code>:
+    Slides `m` up 1 lane. `SlideMask1Up(d, m)` is equivalent to
+    `MaskFromVec(Slide1Up(d, VecFromMask(d, m)))`, but `SlideMask1Up(d, m)` is
+    more efficient on some targets.
+
+*   <code>M **SlideMask1Down**(D d, M m)</code>:
+    Slides `m` down 1 lane. `SlideMask1Down(d, m)` is equivalent to
+    `MaskFromVec(Slide1Down(d, VecFromMask(d, m)))`, but `SlideMask1Down(d, m)` is
+    more efficient on some targets.
+
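
A sketch of the slide-mask ops (same assumptions as above; the lane comments
assume a target with at least 8 lanes of `uint8_t`):

    const hn::ScalableTag<uint8_t> d;
    const auto m = hn::FirstN(d, 4);                     // lanes 0..3 true
    const auto up1 = hn::SlideMask1Up(d, m);             // lanes 1..4 true
    const auto down2 = hn::SlideMaskDownLanes(d, m, 2);  // lanes 0..1 true
    HWY_ASSERT(hn::CountTrue(d, down2) == 2);
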
 #### Test mask
 
 *   <code>bool **AllTrue**(D, M m)</code>: returns whether all `m[i]` are true.
@@ -1610,7 +1666,7 @@
     <code>Vec&lt;D&gt; **DemoteTo**(D, V v)</code>: converts 64-bit integer to
     `float`.
 
-*   `V`,`D`: (`f32,f16`), (`f32,bf16`) \
+*   `V`,`D`: (`f32,f16`), (`f64,f16`), (`f32,bf16`) \
     <code>Vec&lt;D&gt; **DemoteTo**(D, V v)</code>: narrows float to half (for
     bf16, it is unspecified whether this truncates or rounds).
 
@@ -1620,7 +1676,7 @@
 `LowerHalf` or `UpperHalf`, or load them using a half-sized `D`.
 
 *   Unsigned `V` to wider signed/unsigned `D`; signed to wider signed, `f16` to
-    `f32`, `bf16` to `f32`, `f32` to `f64` \
+    `f32`, `f16` to `f64`, `bf16` to `f32`, `f32` to `f64` \
     <code>Vec&lt;D&gt; **PromoteTo**(D, V part)</code>: returns `part[i]` zero-
     or sign-extended to the integer type `MakeWide<T>`, or widened to the
     floating-point type `MakeFloat<MakeWide<T>>`.
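
A sketch of the new `f64` <-> `f16` conversions (same assumptions as above;
requires `HWY_HAVE_FLOAT64`):

    const hn::ScalableTag<double> df64;
    const hn::Rebind<hwy::float16_t, decltype(df64)> df16;
    const auto vd = hn::Set(df64, 0.5);       // exactly representable in f16
    const auto vh = hn::DemoteTo(df16, vd);   // f64 -> f16
    const auto rt = hn::PromoteTo(df64, vh);  // f16 -> f64; every lane = 0.5
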
@@ -1820,6 +1876,25 @@
     alternating lanes from the upper halves of `a` and `b` (`a[N/2]` in the
     least-significant lane). `D` is `DFromV<V>`.
 
+*   <code>V **InterleaveEven**([D, ] V a, V b)</code>: returns *blocks* with
+    alternating lanes from the even lanes of `a` and `b` (`a[0]` in the
+    least-significant lane, followed by `b[0]`, followed by `a[2]`, followed by
+    `b[2]`, and so on). The optional `D` (provided for consistency with
+    `InterleaveOdd`) is `DFromV<V>`.
+
+    `InterleaveEven(a, b)` and `InterleaveEven(d, a, b)` are both equivalent to
+    `OddEven(DupEven(b), a)`, but `InterleaveEven(a, b)` is usually more
+    efficient than `OddEven(DupEven(b), a)`.
+
+*   <code>V **InterleaveOdd**(D, V a, V b)</code>: returns *blocks* with
+    alternating lanes from the odd lanes of `a` and `b` (`a[1]` in the
+    least-significant lane, followed by `b[1]`, followed by `a[3]`, followed by
+    `b[3]`, and so on). `D` is `DFromV<V>`.
+
+    `InterleaveOdd(d, a, b)` is equivalent to `OddEven(b, DupOdd(a))`, but
+    `InterleaveOdd(d, a, b)` is usually more efficient than
+    `OddEven(b, DupOdd(a))`.
+
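
A sketch of `InterleaveEven`/`InterleaveOdd` (same assumptions as above,
illustrated for a non-scalar target):

    const hn::ScalableTag<uint32_t> d;
    const auto a = hn::Iota(d, 0);                  //   0   1   2   3 ...
    const auto b = hn::Iota(d, 100);                // 100 101 102 103 ...
    const auto even = hn::InterleaveEven(d, a, b);  //   0 100   2 102 ...
    const auto odd = hn::InterleaveOdd(d, a, b);    //   1 101   3 103 ...
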
 #### Zip
 
 *   `Ret`: `MakeWide<T>`; `V`: `{u,i}{8,16,32}` \
@@ -2254,7 +2329,7 @@
 *   `HWY_IS_LITTLE_ENDIAN` expands to 1 on little-endian targets and to 0 on
     big-endian targets.
 
-*   `HWY_IS_BIG_ENDIAN` expands to 0 on big-endian targets and to 1 on
+*   `HWY_IS_BIG_ENDIAN` expands to 1 on big-endian targets and to 0 on
     little-endian targets.
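
Both macros expand to 0 or 1 (exactly one of them is 1), so they work in `#if`
as well as in ordinary expressions; a minimal sketch:

    static_assert(HWY_IS_BIG_ENDIAN + HWY_IS_LITTLE_ENDIAN == 1,
                  "exactly one endianness macro is 1");
    #if HWY_IS_BIG_ENDIAN
    // e.g. byte-swap little-endian wire data before vector processing
    #endif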
 
 The following were used to signal the maximum number of lanes for certain
diff --git a/g3doc/release_testing_process.md b/g3doc/release_testing_process.md
index f25cf8e..ab6599f 100644
--- a/g3doc/release_testing_process.md
+++ b/g3doc/release_testing_process.md
@@ -18,7 +18,7 @@
 
 ### Version updates
 
-Update the current version in:
+Prepend to debian/changelog and update mentions of the current version in:
 
 *   highway.h
 *   CMakeLists.txt
diff --git a/hwy/aligned_allocator.h b/hwy/aligned_allocator.h
index dfd5e0a..c31894e 100644
--- a/hwy/aligned_allocator.h
+++ b/hwy/aligned_allocator.h
@@ -40,6 +40,11 @@
 // access pairs of lines, and POWER8 also has 128.
 #define HWY_ALIGNMENT 128
 
+template <typename T>
+HWY_API constexpr bool IsAligned(T* ptr, size_t align = HWY_ALIGNMENT) {
+  return reinterpret_cast<uintptr_t>(ptr) % align == 0;
+}
+
 // Pointers to functions equivalent to malloc/free with an opaque void* passed
 // to them.
 using AllocPtr = void* (*)(void* opaque, size_t bytes);
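
A usage sketch for the new `IsAligned` helper together with the existing
aligned allocator (illustration only, not part of this patch):

    hwy::AlignedFreeUniquePtr<float[]> buf = hwy::AllocateAligned<float>(256);
    HWY_ASSERT(buf && hwy::IsAligned(buf.get()));    // default: HWY_ALIGNMENT
    HWY_ASSERT(hwy::IsAligned(buf.get() + 16, 64));  // 16 floats = 64 bytes
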
diff --git a/hwy/base.h b/hwy/base.h
index e3e72c2..63ac8bf 100644
--- a/hwy/base.h
+++ b/hwy/base.h
@@ -258,6 +258,13 @@
 #define HWY_IS_TSAN 0
 #endif
 
+#if HWY_HAS_FEATURE(undefined_behavior_sanitizer) || \
+    defined(UNDEFINED_BEHAVIOR_SANITIZER)
+#define HWY_IS_UBSAN 1
+#else
+#define HWY_IS_UBSAN 0
+#endif
+
 // MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
 // You can disable MSAN by adding this attribute to the function that fails.
 #if HWY_IS_MSAN
@@ -271,7 +278,7 @@
 // Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
 // MSVC defines NDEBUG (if not, could instead check _DEBUG).
 #if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
-    HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
+    HWY_IS_MSAN || HWY_IS_TSAN || HWY_IS_UBSAN || defined(__clang_analyzer__)
 #define HWY_IS_DEBUG_BUILD 1
 #else
 #define HWY_IS_DEBUG_BUILD 0
@@ -385,8 +392,8 @@
 // hwy::float16_t and hwy::bfloat16_t are forward declared here to allow
 // BitCastScalar to be implemented before the implementations of the
 // hwy::float16_t and hwy::bfloat16_t types
-struct alignas(2) float16_t;
-struct alignas(2) bfloat16_t;
+struct float16_t;
+struct bfloat16_t;
 
 using float32_t = float;
 using float64_t = double;
@@ -1263,7 +1270,7 @@
 #endif  // HWY_HAVE_SCALAR_F16_OPERATORS
 
 HWY_API HWY_F16_CONSTEXPR float F32FromF16(float16_t f16) {
-#if HWY_HAVE_SCALAR_F16_OPERATORS
+#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
   return static_cast<float>(f16);
 #endif
 #if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
@@ -1333,7 +1340,7 @@
         // HWY_COMPILER_MSVC >= 1926)
 
 HWY_API HWY_F16_CONSTEXPR float16_t F16FromF32(float f32) {
-#if HWY_HAVE_SCALAR_F16_OPERATORS
+#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
   return float16_t(static_cast<float16_t::Native>(f32));
 #endif
 #if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
@@ -2422,6 +2429,11 @@
   return DivCeil(what, align) * align;
 }
 
+// Works for any `align`; if a power of two, compiler emits AND.
+constexpr inline size_t RoundDownTo(size_t what, size_t align) {
+  return what - (what % align);
+}
+
 // Undefined results for x == 0.
 HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
   HWY_DASSERT(x != 0);
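
A minimal sketch for the new `RoundDownTo`, which complements the existing
`RoundUpTo`; it is constexpr, so it is usable in constant expressions:

    static_assert(hwy::RoundDownTo(1000, 128) == 896, "previous multiple");
    static_assert(hwy::RoundDownTo(1000, 100) == 1000, "already a multiple");
    // For comparison, RoundUpTo(1000, 128) == 1024.
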
@@ -2606,6 +2618,46 @@
 #endif
 }
 
+// Precomputation for fast n / divisor and n % divisor, where n is a variable
+// and divisor is unchanging but unknown at compile-time.
+class Divisor {
+ public:
+  explicit Divisor(uint32_t divisor) : divisor_(divisor) {
+    if (divisor <= 1) return;
+
+    const uint32_t len =
+        static_cast<uint32_t>(31 - Num0BitsAboveMS1Bit_Nonzero32(divisor - 1));
+    const uint64_t u_hi = (2ULL << len) - divisor;
+    const uint32_t q = Truncate((u_hi << 32) / divisor);
+
+    mul_ = q + 1;
+    shift1_ = 1;
+    shift2_ = len;
+  }
+
+  uint32_t GetDivisor() const { return divisor_; }
+
+  // Returns n / divisor_.
+  uint32_t Divide(uint32_t n) const {
+    const uint64_t mul = mul_;
+    const uint32_t t = Truncate((mul * n) >> 32);
+    return (t + ((n - t) >> shift1_)) >> shift2_;
+  }
+
+  // Returns n % divisor_.
+  uint32_t Remainder(uint32_t n) const { return n - (Divide(n) * divisor_); }
+
+ private:
+  static uint32_t Truncate(uint64_t x) {
+    return static_cast<uint32_t>(x & 0xFFFFFFFFu);
+  }
+
+  uint32_t divisor_;
+  uint32_t mul_ = 1;
+  uint32_t shift1_ = 0;
+  uint32_t shift2_ = 0;
+};
+
 namespace detail {
 
 template <typename T>
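
`Divisor` (moved here from thread_pool.h, see below) precomputes a multiplier
and shift amounts once so that later divisions by a runtime-only divisor avoid
a hardware divide; a usage sketch (illustration only; `num_workers` is a
hypothetical runtime value):

    const uint32_t num_workers = 7;       // hypothetical; known only at runtime
    const hwy::Divisor div(num_workers);  // precompute once
    for (uint32_t task = 0; task < 100; ++task) {
      const uint32_t worker = div.Remainder(task);  // task % num_workers
      const uint32_t round = div.Divide(task);      // task / num_workers
      (void)worker;
      (void)round;
    }
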
diff --git a/hwy/base_test.cc b/hwy/base_test.cc
index e2aa0d7..87ee0d4 100644
--- a/hwy/base_test.cc
+++ b/hwy/base_test.cc
@@ -228,6 +228,45 @@
   HWY_ASSERT_EQ(size_t{64}, PopCount(0xFFFFFFFFFFFFFFFFull));
 }
 
+// Exhaustive test for small/large dividends and divisors
+HWY_NOINLINE void TestAllDivisor() {
+  // Small d, small n
+  for (uint32_t d = 1; d < 256; ++d) {
+    const Divisor divisor(d);
+    for (uint32_t n = 0; n < 256; ++n) {
+      HWY_ASSERT(divisor.Divide(n) == n / d);
+      HWY_ASSERT(divisor.Remainder(n) == n % d);
+    }
+  }
+
+  // Large d, small n
+  for (uint32_t d = 0xFFFFFF00u; d != 0; ++d) {
+    const Divisor divisor(d);
+    for (uint32_t n = 0; n < 256; ++n) {
+      HWY_ASSERT(divisor.Divide(n) == n / d);
+      HWY_ASSERT(divisor.Remainder(n) == n % d);
+    }
+  }
+
+  // Small d, large n
+  for (uint32_t d = 1; d < 256; ++d) {
+    const Divisor divisor(d);
+    for (uint32_t n = 0xFFFFFF00u; n != 0; ++n) {
+      HWY_ASSERT(divisor.Divide(n) == n / d);
+      HWY_ASSERT(divisor.Remainder(n) == n % d);
+    }
+  }
+
+  // Large d, large n
+  for (uint32_t d = 0xFFFFFF00u; d != 0; ++d) {
+    const Divisor divisor(d);
+    for (uint32_t n = 0xFFFFFF00u; n != 0; ++n) {
+      HWY_ASSERT(divisor.Divide(n) == n / d);
+      HWY_ASSERT(divisor.Remainder(n) == n % d);
+    }
+  }
+}
+
 template <class T>
 static HWY_INLINE T TestEndianGetIntegerVal(T val) {
   static_assert(!IsFloat<T>() && !IsSpecialFloat<T>(),
@@ -643,6 +682,7 @@
 HWY_EXPORT_AND_TEST_P(BaseTest, TestAllIsSame);
 HWY_EXPORT_AND_TEST_P(BaseTest, TestAllBitScan);
 HWY_EXPORT_AND_TEST_P(BaseTest, TestAllPopCount);
+HWY_EXPORT_AND_TEST_P(BaseTest, TestAllDivisor);
 HWY_EXPORT_AND_TEST_P(BaseTest, TestAllEndian);
 HWY_EXPORT_AND_TEST_P(BaseTest, TestAllSpecialFloat);
 }  // namespace hwy
diff --git a/hwy/contrib/algo/find-inl.h b/hwy/contrib/algo/find-inl.h
index 9a9f55b..dc0a8ca 100644
--- a/hwy/contrib/algo/find-inl.h
+++ b/hwy/contrib/algo/find-inl.h
@@ -14,7 +14,8 @@
 // limitations under the License.
 
 // Per-target include guard
-#if defined(HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#if defined(HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_) == \
+    defined(HWY_TARGET_TOGGLE)  // NOLINT
 #ifdef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
 #undef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
 #else
diff --git a/hwy/contrib/thread_pool/futex.h b/hwy/contrib/thread_pool/futex.h
index ff3b77f..b7ee84c 100644
--- a/hwy/contrib/thread_pool/futex.h
+++ b/hwy/contrib/thread_pool/futex.h
@@ -77,13 +77,14 @@
 
 // Waits until `current != prev` and returns the new value. May return
 // immediately if `current` already changed, or after blocking and waking.
-static inline uint32_t BlockUntilDifferent(const uint32_t prev,
-                                           std::atomic<uint32_t>& current) {
+static inline uint32_t BlockUntilDifferent(
+    const uint32_t prev, const std::atomic<uint32_t>& current) {
   const auto acq = std::memory_order_acquire;
 
 #if HWY_ARCH_WASM
   // It is always safe to cast to void.
-  volatile void* address = static_cast<volatile void*>(&current);
+  volatile void* address =
+      const_cast<volatile void*>(static_cast<const volatile void*>(&current));
   const double max_ms = INFINITY;
   for (;;) {
     const uint32_t next = current.load(acq);
@@ -95,7 +96,7 @@
 
 #elif HWY_OS_LINUX
   // Safe to cast because std::atomic is a standard layout type.
-  uint32_t* address = reinterpret_cast<uint32_t*>(&current);
+  const uint32_t* address = reinterpret_cast<const uint32_t*>(&current);
   // _PRIVATE requires this only be used in the same process, and avoids
   // virtual->physical lookups and atomic reference counting.
   const int op = FUTEX_WAIT_PRIVATE;
@@ -112,7 +113,8 @@
 
 #elif HWY_OS_WIN && !defined(HWY_DISABLE_FUTEX)
   // It is always safe to cast to void.
-  volatile void* address = static_cast<volatile void*>(&current);
+  volatile void* address =
+      const_cast<volatile void*>(static_cast<const volatile void*>(&current));
   // API is not const-correct, but only loads from the pointer.
   PVOID pprev = const_cast<void*>(static_cast<const void*>(&prev));
   const DWORD max_ms = INFINITE;
@@ -126,7 +128,7 @@
 
 #elif HWY_OS_APPLE && !defined(HWY_DISABLE_FUTEX)
   // It is always safe to cast to void.
-  void* address = static_cast<void*>(&current);
+  void* address = const_cast<void*>(static_cast<const void*>(&current));
   for (;;) {
     const uint32_t next = current.load(acq);
     if (next != prev) return next;
diff --git a/hwy/contrib/thread_pool/thread_pool.h b/hwy/contrib/thread_pool/thread_pool.h
index 3271a4f..24954bf 100644
--- a/hwy/contrib/thread_pool/thread_pool.h
+++ b/hwy/contrib/thread_pool/thread_pool.h
@@ -51,47 +51,6 @@
 
 namespace hwy {
 
-// Precomputation for fast n / divisor and n % divisor, where n is a variable
-// and divisor is unchanging but unknown at compile-time.
-class Divisor {
- public:
-  Divisor() = default;  // for PoolWorker
-  explicit Divisor(uint32_t divisor) : divisor_(divisor) {
-    if (divisor <= 1) return;
-
-    const uint32_t len =
-        static_cast<uint32_t>(31 - Num0BitsAboveMS1Bit_Nonzero32(divisor - 1));
-    const uint64_t u_hi = (2ULL << len) - divisor;
-    const uint32_t q = Truncate((u_hi << 32) / divisor);
-
-    mul_ = q + 1;
-    shift1_ = 1;
-    shift2_ = len;
-  }
-
-  uint32_t GetDivisor() const { return divisor_; }
-
-  // Returns n / divisor_.
-  uint32_t Divide(uint32_t n) const {
-    const uint64_t mul = mul_;
-    const uint32_t t = Truncate((mul * n) >> 32);
-    return (t + ((n - t) >> shift1_)) >> shift2_;
-  }
-
-  // Returns n % divisor_.
-  uint32_t Remainder(uint32_t n) const { return n - (Divide(n) * divisor_); }
-
- private:
-  static uint32_t Truncate(uint64_t x) {
-    return static_cast<uint32_t>(x & 0xFFFFFFFFu);
-  }
-
-  uint32_t divisor_;
-  uint32_t mul_ = 1;
-  uint32_t shift1_ = 0;
-  uint32_t shift2_ = 0;
-};
-
 // Generates a random permutation of [0, size). O(1) storage.
 class ShuffledIota {
  public:
diff --git a/hwy/contrib/thread_pool/thread_pool_test.cc b/hwy/contrib/thread_pool/thread_pool_test.cc
index f1ab133..4c2e36c 100644
--- a/hwy/contrib/thread_pool/thread_pool_test.cc
+++ b/hwy/contrib/thread_pool/thread_pool_test.cc
@@ -36,45 +36,6 @@
 namespace {
 using HWY_NAMESPACE::AdjustedReps;
 
-// Exhaustive test for small/large dividends and divisors
-TEST(ThreadPoolTest, TestDivisor) {
-  // Small d, small n
-  for (uint32_t d = 1; d < 256; ++d) {
-    const Divisor divisor(d);
-    for (uint32_t n = 0; n < 256; ++n) {
-      HWY_ASSERT(divisor.Divide(n) == n / d);
-      HWY_ASSERT(divisor.Remainder(n) == n % d);
-    }
-  }
-
-  // Large d, small n
-  for (uint32_t d = 0xFFFFFF00u; d != 0; ++d) {
-    const Divisor divisor(d);
-    for (uint32_t n = 0; n < 256; ++n) {
-      HWY_ASSERT(divisor.Divide(n) == n / d);
-      HWY_ASSERT(divisor.Remainder(n) == n % d);
-    }
-  }
-
-  // Small d, large n
-  for (uint32_t d = 1; d < 256; ++d) {
-    const Divisor divisor(d);
-    for (uint32_t n = 0xFFFFFF00u; n != 0; ++n) {
-      HWY_ASSERT(divisor.Divide(n) == n / d);
-      HWY_ASSERT(divisor.Remainder(n) == n % d);
-    }
-  }
-
-  // Large d, large n
-  for (uint32_t d = 0xFFFFFF00u; d != 0; ++d) {
-    const Divisor divisor(d);
-    for (uint32_t n = 0xFFFFFF00u; n != 0; ++n) {
-      HWY_ASSERT(divisor.Divide(n) == n / d);
-      HWY_ASSERT(divisor.Remainder(n) == n % d);
-    }
-  }
-}
-
 TEST(ThreadPoolTest, TestCoprime) {
   // 1 is coprime with anything
   for (uint32_t i = 1; i < 500; ++i) {
diff --git a/hwy/contrib/unroller/unroller_test.cc b/hwy/contrib/unroller/unroller_test.cc
index a2b8d4b..061b219 100644
--- a/hwy/contrib/unroller/unroller_test.cc
+++ b/hwy/contrib/unroller/unroller_test.cc
@@ -132,7 +132,7 @@
   using DI = RebindToSigned<D>;
   DI di;
 
-  FindUnit<T>(T find) : to_find(find) {}
+  FindUnit(T find) : to_find(find) {}
 
   hn::Vec<DI> Func(ptrdiff_t idx, const hn::Vec<D> x, const hn::Vec<DI> y) {
     const Mask<D> msk = hn::Eq(x, hn::Set(d, to_find));
diff --git a/hwy/highway.h b/hwy/highway.h
index 5fa224d..d87bc81 100644
--- a/hwy/highway.h
+++ b/hwy/highway.h
@@ -32,8 +32,8 @@
 
 // API version (https://semver.org/); keep in sync with CMakeLists.txt.
 #define HWY_MAJOR 1
-#define HWY_MINOR 0
-#define HWY_PATCH 7
+#define HWY_MINOR 1
+#define HWY_PATCH 0
 
 //------------------------------------------------------------------------------
 // Shorthand for tags (defined in shared-inl.h) used to select overloads.
diff --git a/hwy/ops/arm_neon-inl.h b/hwy/ops/arm_neon-inl.h
index af456be..1c8d387 100644
--- a/hwy/ops/arm_neon-inl.h
+++ b/hwy/ops/arm_neon-inl.h
@@ -2133,12 +2133,16 @@
 #pragma pop_macro("HWY_NEON_DEF_FUNCTION")
 
 // ------------------------------ RotateRight (ShiftRight, Or)
-template <int kBits, typename T, size_t N>
+template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
   constexpr size_t kSizeInBits = sizeof(T) * 8;
   static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
   if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v),
+
+  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
             ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
 }
 
@@ -6841,6 +6845,36 @@
   return IfThenElse(MaskFromVec(vec), b, a);
 }
 
+// ------------------------------ InterleaveEven
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+#if HWY_ARCH_ARM_A64
+  return detail::InterleaveEven(a, b);
+#else
+  return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[0]);
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return InterleaveLower(a, b);
+}
+
+// ------------------------------ InterleaveOdd
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+#if HWY_ARCH_ARM_A64
+  return detail::InterleaveOdd(a, b);
+#else
+  return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[1]);
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  return InterleaveUpper(d, a, b);
+}
+
 // ------------------------------ OddEvenBlocks
 template <typename T, size_t N>
 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
@@ -7568,22 +7602,35 @@
 HWY_NEON_DEF_REDUCTION_UI64(ReduceSum, vaddv)
 
 // Emulate missing UI64 and partial N=2.
-template <class D, HWY_IF_LANES_D(D, 2)>
+template <class D, HWY_IF_LANES_D(D, 2),
+          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
 HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v10) {
   return GetLane(v10) + ExtractLane(v10, 1);
 }
 
-template <class D, HWY_IF_LANES_D(D, 2)>
+template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D),
+          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))>
 HWY_API TFromD<D> ReduceMin(D /* tag */, VFromD<D> v10) {
   return HWY_MIN(GetLane(v10), ExtractLane(v10, 1));
 }
 
-template <class D, HWY_IF_LANES_D(D, 2)>
+template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D),
+          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))>
 HWY_API TFromD<D> ReduceMax(D /* tag */, VFromD<D> v10) {
   return HWY_MAX(GetLane(v10), ExtractLane(v10, 1));
 }
 
 #if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
+HWY_API float16_t ReduceMin(D d, VFromD<D> v10) {
+  return GetLane(Min(v10, Reverse2(d, v10)));
+}
+
+template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
+HWY_API float16_t ReduceMax(D d, VFromD<D> v10) {
+  return GetLane(Max(v10, Reverse2(d, v10)));
+}
+
 template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 8)>
 HWY_API float16_t ReduceSum(D /* tag */, VFromD<D> v) {
   const float16x4_t x2 = vpadd_f16(v.raw, v.raw);
diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h
index 89260bb..56def0c 100644
--- a/hwy/ops/arm_sve-inl.h
+++ b/hwy/ops/arm_sve-inl.h
@@ -1012,14 +1012,15 @@
 
 // ------------------------------ ShiftLeft[Same]
 
-#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)               \
-  template <int kBits>                                                  \
-  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {         \
-    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits);    \
-  }                                                                     \
-  HWY_API HWY_SVE_V(BASE, BITS)                                         \
-      NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) { \
-    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits);     \
+#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)                  \
+  template <int kBits>                                                     \
+  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {            \
+    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits);       \
+  }                                                                        \
+  HWY_API HWY_SVE_V(BASE, BITS)                                            \
+      NAME##Same(HWY_SVE_V(BASE, BITS) v, int bits) {                      \
+    return sv##OP##_##CHAR##BITS##_x(                                      \
+        HWY_SVE_PTRUE(BITS), v, static_cast<HWY_SVE_T(uint, BITS)>(bits)); \
   }
 
 HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_N, ShiftLeft, lsl_n)
@@ -1033,15 +1034,35 @@
 
 // ------------------------------ RotateRight
 
-// TODO(janwas): svxar on SVE2
-template <int kBits, class V>
+#if HWY_SVE_HAVE_2
+
+#define HWY_SVE_ROTATE_RIGHT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
+  template <int kBits>                                           \
+  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {  \
+    if (kBits == 0) return v;                                    \
+    return sv##OP##_##CHAR##BITS(v, Zero(DFromV<decltype(v)>()), \
+                                 HWY_MAX(kBits, 1));             \
+  }
+
+HWY_SVE_FOREACH_U(HWY_SVE_ROTATE_RIGHT_N, RotateRight, xar_n)
+HWY_SVE_FOREACH_I(HWY_SVE_ROTATE_RIGHT_N, RotateRight, xar_n)
+
+#undef HWY_SVE_ROTATE_RIGHT_N
+
+#else  // !HWY_SVE_HAVE_2
+template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
 HWY_API V RotateRight(const V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
   constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
   static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
   if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v),
+
+  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
             ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
 }
+#endif
 
 // ------------------------------ Shl/r
 
@@ -2089,6 +2110,22 @@
   return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
 }
 
+#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
+#undef HWY_NATIVE_PROMOTE_F16_TO_F64
+#else
+#define HWY_NATIVE_PROMOTE_F16_TO_F64
+#endif
+
+template <size_t N, int kPow2>
+HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
+                              const svfloat16_t v) {
+  // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
+  // first replicate each lane once.
+  const svfloat16_t vv = detail::ZipLowerSame(v, v);
+  return svcvt_f64_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()),
+                         detail::ZipLowerSame(vv, vv));
+}
+
 template <size_t N, int kPow2>
 HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
                               const svfloat32_t v) {
@@ -2546,6 +2583,20 @@
                                 in_even);  // lower half
 }
 
+#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
+#undef HWY_NATIVE_DEMOTE_F64_TO_F16
+#else
+#define HWY_NATIVE_DEMOTE_F64_TO_F16
+#endif
+
+template <size_t N, int kPow2>
+HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat64_t v) {
+  const svfloat16_t in_lo16 = svcvt_f16_f64_x(detail::PTrue(d), v);
+  const svfloat16_t in_even = detail::ConcatEvenFull(in_lo16, in_lo16);
+  return detail::ConcatEvenFull(in_even,
+                                in_even);  // lower half
+}
+
 template <size_t N, int kPow2>
 HWY_API VBF16 DemoteTo(Simd<bfloat16_t, N, kPow2> dbf16, svfloat32_t v) {
   const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v);
@@ -3155,6 +3206,18 @@
 
 #endif  // HWY_TARGET
 
+// ------------------------------ InterleaveEven
+template <class D>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return detail::InterleaveEven(a, b);
+}
+
+// ------------------------------ InterleaveOdd
+template <class D>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return detail::InterleaveOdd(a, b);
+}
+
 // ------------------------------ OddEvenBlocks
 template <class V>
 HWY_API V OddEvenBlocks(const V odd, const V even) {
diff --git a/hwy/ops/emu128-inl.h b/hwy/ops/emu128-inl.h
index 5055ea0..1398268 100644
--- a/hwy/ops/emu128-inl.h
+++ b/hwy/ops/emu128-inl.h
@@ -382,16 +382,11 @@
   return to;
 }
 
-template <typename T, size_t N>
-Vec128<T, N> VecFromMask(Mask128<T, N> mask) {
-  Vec128<T, N> v;
-  CopySameSize(&mask.bits, &v.raw);
-  return v;
-}
-
 template <class D>
 VFromD<D> VecFromMask(D /* tag */, MFromD<D> mask) {
-  return VecFromMask(mask);
+  VFromD<D> v;
+  CopySameSize(&mask.bits, &v.raw);
+  return v;
 }
 
 template <class D>
@@ -407,19 +402,20 @@
 template <typename T, size_t N>
 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                 Vec128<T, N> no) {
-  return IfVecThenElse(VecFromMask(mask), yes, no);
+  const DFromV<decltype(yes)> d;
+  return IfVecThenElse(VecFromMask(d, mask), yes, no);
 }
 
 template <typename T, size_t N>
 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
   const DFromV<decltype(yes)> d;
-  return IfVecThenElse(VecFromMask(mask), yes, Zero(d));
+  return IfVecThenElse(VecFromMask(d, mask), yes, Zero(d));
 }
 
 template <typename T, size_t N>
 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
   const DFromV<decltype(no)> d;
-  return IfVecThenElse(VecFromMask(mask), Zero(d), no);
+  return IfVecThenElse(VecFromMask(d, mask), Zero(d), no);
 }
 
 template <typename T, size_t N>
@@ -445,7 +441,8 @@
 
 template <typename T, size_t N>
 HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
-  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
+  const Simd<T, N, 0> d;
+  return MaskFromVec(Not(VecFromMask(d, m)));
 }
 
 template <typename T, size_t N>
@@ -526,12 +523,16 @@
 }
 
 // ------------------------------ RotateRight (ShiftRight)
-template <int kBits, typename T, size_t N>
+template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
   constexpr size_t kSizeInBits = sizeof(T) * 8;
   static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
   if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v),
+
+  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
             ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
 }
 
@@ -1610,7 +1611,7 @@
 
 template <class ToT, class ToTypeTag, class FromT>
 HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
-  return static_cast<ToT>(val);
+  return ConvertScalarTo<ToT>(val);
 }
 
 template <class ToT>
@@ -2157,6 +2158,24 @@
   return odd;
 }
 
+template <class D>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  constexpr size_t N = HWY_MAX_LANES_D(D);
+  for (size_t i = 1; i < N; i += 2) {
+    a.raw[i] = b.raw[i - 1];
+  }
+  return a;
+}
+
+template <class D>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  constexpr size_t N = HWY_MAX_LANES_D(D);
+  for (size_t i = 1; i < N; i += 2) {
+    b.raw[i - 1] = a.raw[i];
+  }
+  return b;
+}
+
 template <typename T, size_t N>
 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
   return even;
diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h
index 22c5c7c..fe940c1 100644
--- a/hwy/ops/generic_ops-inl.h
+++ b/hwy/ops/generic_ops-inl.h
@@ -367,6 +367,299 @@
 
 #endif  // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
 
+// ------------------------------ RotateLeft
+template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
+HWY_API V RotateLeft(V v) {
+  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
+  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
+
+  constexpr int kRotateRightAmt =
+      (kBits == 0) ? 0 : static_cast<int>(kSizeInBits) - kBits;
+  return RotateRight<kRotateRightAmt>(v);
+}
+
+// ------------------------------ Rol/Ror
+#if (defined(HWY_NATIVE_ROL_ROR_8) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_ROL_ROR_8
+#undef HWY_NATIVE_ROL_ROR_8
+#else
+#define HWY_NATIVE_ROL_ROR_8
+#endif
+
+template <class V, HWY_IF_UI8(TFromV<V>)>
+HWY_API V Rol(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const auto shift_amt_mask = Set(du, uint8_t{7});
+  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
+  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
+
+  const auto vu = BitCast(du, a);
+  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
+}
+
+template <class V, HWY_IF_UI8(TFromV<V>)>
+HWY_API V Ror(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const auto shift_amt_mask = Set(du, uint8_t{7});
+  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
+  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
+
+  const auto vu = BitCast(du, a);
+  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
+}
+
+#endif  // HWY_NATIVE_ROL_ROR_8
+
+#if (defined(HWY_NATIVE_ROL_ROR_16) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_ROL_ROR_16
+#undef HWY_NATIVE_ROL_ROR_16
+#else
+#define HWY_NATIVE_ROL_ROR_16
+#endif
+
+template <class V, HWY_IF_UI16(TFromV<V>)>
+HWY_API V Rol(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const auto shift_amt_mask = Set(du, uint16_t{15});
+  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
+  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
+
+  const auto vu = BitCast(du, a);
+  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
+}
+
+template <class V, HWY_IF_UI16(TFromV<V>)>
+HWY_API V Ror(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const auto shift_amt_mask = Set(du, uint16_t{15});
+  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
+  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
+
+  const auto vu = BitCast(du, a);
+  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
+}
+
+#endif  // HWY_NATIVE_ROL_ROR_16
+
+#if (defined(HWY_NATIVE_ROL_ROR_32_64) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_ROL_ROR_32_64
+#undef HWY_NATIVE_ROL_ROR_32_64
+#else
+#define HWY_NATIVE_ROL_ROR_32_64
+#endif
+
+template <class V, HWY_IF_UI32(TFromV<V>)>
+HWY_API V Rol(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const auto shift_amt_mask = Set(du, uint32_t{31});
+  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
+  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
+
+  const auto vu = BitCast(du, a);
+  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
+}
+
+template <class V, HWY_IF_UI32(TFromV<V>)>
+HWY_API V Ror(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const auto shift_amt_mask = Set(du, uint32_t{31});
+  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
+  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
+
+  const auto vu = BitCast(du, a);
+  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
+}
+
+#if HWY_HAVE_INTEGER64
+template <class V, HWY_IF_UI64(TFromV<V>)>
+HWY_API V Rol(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const auto shift_amt_mask = Set(du, uint64_t{63});
+  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
+  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
+
+  const auto vu = BitCast(du, a);
+  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
+}
+
+template <class V, HWY_IF_UI64(TFromV<V>)>
+HWY_API V Ror(V a, V b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const auto shift_amt_mask = Set(du, uint64_t{63});
+  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
+  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
+
+  const auto vu = BitCast(du, a);
+  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
+}
+#endif  // HWY_HAVE_INTEGER64
+
+#endif  // HWY_NATIVE_ROL_ROR_32_64
+
+// ------------------------------ RotateLeftSame/RotateRightSame
+
+#if (defined(HWY_NATIVE_ROL_ROR_SAME_8) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_ROL_ROR_SAME_8
+#undef HWY_NATIVE_ROL_ROR_SAME_8
+#else
+#define HWY_NATIVE_ROL_ROR_SAME_8
+#endif
+
+template <class V, HWY_IF_UI8(TFromV<V>)>
+HWY_API V RotateLeftSame(V v, int bits) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const int shl_amt = bits & 7;
+  const int shr_amt = static_cast<int>((0u - static_cast<unsigned>(bits)) & 7u);
+
+  const auto vu = BitCast(du, v);
+  return BitCast(d,
+                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
+}
+
+template <class V, HWY_IF_UI8(TFromV<V>)>
+HWY_API V RotateRightSame(V v, int bits) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const int shr_amt = bits & 7;
+  const int shl_amt = static_cast<int>((0u - static_cast<unsigned>(bits)) & 7u);
+
+  const auto vu = BitCast(du, v);
+  return BitCast(d,
+                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
+}
+
+#endif  // HWY_NATIVE_ROL_ROR_SAME_8
+
+#if (defined(HWY_NATIVE_ROL_ROR_SAME_16) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_ROL_ROR_SAME_16
+#undef HWY_NATIVE_ROL_ROR_SAME_16
+#else
+#define HWY_NATIVE_ROL_ROR_SAME_16
+#endif
+
+template <class V, HWY_IF_UI16(TFromV<V>)>
+HWY_API V RotateLeftSame(V v, int bits) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const int shl_amt = bits & 15;
+  const int shr_amt =
+      static_cast<int>((0u - static_cast<unsigned>(bits)) & 15u);
+
+  const auto vu = BitCast(du, v);
+  return BitCast(d,
+                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
+}
+
+template <class V, HWY_IF_UI16(TFromV<V>)>
+HWY_API V RotateRightSame(V v, int bits) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const int shr_amt = bits & 15;
+  const int shl_amt =
+      static_cast<int>((0u - static_cast<unsigned>(bits)) & 15u);
+
+  const auto vu = BitCast(du, v);
+  return BitCast(d,
+                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
+}
+#endif  // HWY_NATIVE_ROL_ROR_SAME_16
+
+#if (defined(HWY_NATIVE_ROL_ROR_SAME_32_64) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
+#undef HWY_NATIVE_ROL_ROR_SAME_32_64
+#else
+#define HWY_NATIVE_ROL_ROR_SAME_32_64
+#endif
+
+template <class V, HWY_IF_UI32(TFromV<V>)>
+HWY_API V RotateLeftSame(V v, int bits) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const int shl_amt = bits & 31;
+  const int shr_amt =
+      static_cast<int>((0u - static_cast<unsigned>(bits)) & 31u);
+
+  const auto vu = BitCast(du, v);
+  return BitCast(d,
+                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
+}
+
+template <class V, HWY_IF_UI32(TFromV<V>)>
+HWY_API V RotateRightSame(V v, int bits) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const int shr_amt = bits & 31;
+  const int shl_amt =
+      static_cast<int>((0u - static_cast<unsigned>(bits)) & 31u);
+
+  const auto vu = BitCast(du, v);
+  return BitCast(d,
+                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
+}
+
+#if HWY_HAVE_INTEGER64
+template <class V, HWY_IF_UI64(TFromV<V>)>
+HWY_API V RotateLeftSame(V v, int bits) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const int shl_amt = bits & 63;
+  const int shr_amt =
+      static_cast<int>((0u - static_cast<unsigned>(bits)) & 63u);
+
+  const auto vu = BitCast(du, v);
+  return BitCast(d,
+                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
+}
+
+template <class V, HWY_IF_UI64(TFromV<V>)>
+HWY_API V RotateRightSame(V v, int bits) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+  const int shr_amt = bits & 63;
+  const int shl_amt =
+      static_cast<int>((0u - static_cast<unsigned>(bits)) & 63u);
+
+  const auto vu = BitCast(du, v);
+  return BitCast(d,
+                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
+}
+#endif  // HWY_HAVE_INTEGER64
+
+#endif  // HWY_NATIVE_ROL_ROR_SAME_32_64
+
 // ------------------------------ InterleaveWholeLower/InterleaveWholeUpper
 #if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE))
 #ifdef HWY_NATIVE_INTERLEAVE_WHOLE
@@ -410,6 +703,17 @@
 }
 #endif  // HWY_TARGET != HWY_SCALAR
 
+// ------------------------------ InterleaveEven
+
+#if HWY_TARGET != HWY_SCALAR
+// InterleaveEven without the optional D parameter is generic for all vector
+// lengths
+template <class V>
+HWY_API V InterleaveEven(V a, V b) {
+  return InterleaveEven(DFromV<V>(), a, b);
+}
+#endif
+
 // ------------------------------ AddSub
 
 template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
@@ -3045,6 +3349,55 @@
 
 #endif  // HWY_NATIVE_F16C
 
+// ------------------------------ F64->F16 DemoteTo
+#if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
+#undef HWY_NATIVE_DEMOTE_F64_TO_F16
+#else
+#define HWY_NATIVE_DEMOTE_F64_TO_F16
+#endif
+
+#if HWY_HAVE_FLOAT64
+template <class D, HWY_IF_F16_D(D)>
+HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
+  const Rebind<double, D> df64;
+  const Rebind<uint64_t, D> du64;
+  const Rebind<float, D> df32;
+
+  // The mantissa bits of v[i] are first rounded using round-to-odd rounding to
+  // the nearest F64 value that has the lower 29 bits zeroed out to ensure that
+  // the result is correctly rounded to an F16.
+
+  const auto vf64_rounded = OrAnd(
+      And(v,
+          BitCast(df64, Set(du64, static_cast<uint64_t>(0xFFFFFFFFE0000000u)))),
+      BitCast(df64, Add(BitCast(du64, v),
+                        Set(du64, static_cast<uint64_t>(0x000000001FFFFFFFu)))),
+      BitCast(df64, Set(du64, static_cast<uint64_t>(0x0000000020000000ULL))));
+
+  return DemoteTo(df16, DemoteTo(df32, vf64_rounded));
+}
+#endif  // HWY_HAVE_FLOAT64
+
+#endif  // HWY_NATIVE_DEMOTE_F64_TO_F16
+
+// ------------------------------ F16->F64 PromoteTo
+#if (defined(HWY_NATIVE_PROMOTE_F16_TO_F64) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
+#undef HWY_NATIVE_PROMOTE_F16_TO_F64
+#else
+#define HWY_NATIVE_PROMOTE_F16_TO_F64
+#endif
+
+#if HWY_HAVE_FLOAT64
+template <class D, HWY_IF_F64_D(D)>
+HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<float16_t, D>> v) {
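+  // F16->F32 is exact (every F16 value is representable as an F32), so
+  // promoting via F32 gives the same result as a direct F16->F64 conversion.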
+  return PromoteTo(df64, PromoteTo(Rebind<float, D>(), v));
+}
+#endif  // HWY_HAVE_FLOAT64
+
+#endif  // HWY_NATIVE_PROMOTE_F16_TO_F64
+
 // ------------------------------ SumsOf2
 
 #if HWY_TARGET != HWY_SCALAR
@@ -6294,6 +6647,37 @@
 }
 #endif
 
+// ------------------------------ Slide mask up/down
+#if (defined(HWY_NATIVE_SLIDE_MASK) == defined(HWY_TARGET_TOGGLE))
+
+#ifdef HWY_NATIVE_SLIDE_MASK
+#undef HWY_NATIVE_SLIDE_MASK
+#else
+#define HWY_NATIVE_SLIDE_MASK
+#endif
+
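+// These generic fallbacks slide a mask by converting it to a vector, sliding
+// the vector, and converting the result back to a mask.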
+template <class D>
+HWY_API Mask<D> SlideMask1Up(D d, Mask<D> m) {
+  return MaskFromVec(Slide1Up(d, VecFromMask(d, m)));
+}
+
+template <class D>
+HWY_API Mask<D> SlideMask1Down(D d, Mask<D> m) {
+  return MaskFromVec(Slide1Down(d, VecFromMask(d, m)));
+}
+
+template <class D>
+HWY_API Mask<D> SlideMaskUpLanes(D d, Mask<D> m, size_t amt) {
+  return MaskFromVec(SlideUpLanes(d, VecFromMask(d, m), amt));
+}
+
+template <class D>
+HWY_API Mask<D> SlideMaskDownLanes(D d, Mask<D> m, size_t amt) {
+  return MaskFromVec(SlideDownLanes(d, VecFromMask(d, m), amt));
+}
+
+#endif  // HWY_NATIVE_SLIDE_MASK
+
 // ------------------------------ SumsOfAdjQuadAbsDiff
 
 #if (defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
diff --git a/hwy/ops/ppc_vsx-inl.h b/hwy/ops/ppc_vsx-inl.h
index 787e24f..66cbcfc 100644
--- a/hwy/ops/ppc_vsx-inl.h
+++ b/hwy/ops/ppc_vsx-inl.h
@@ -1618,19 +1618,83 @@
   return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mulo(a.raw, b.raw)};
 }
 
+// ------------------------------ Rol/Ror
+
+#ifdef HWY_NATIVE_ROL_ROR_8
+#undef HWY_NATIVE_ROL_ROR_8
+#else
+#define HWY_NATIVE_ROL_ROR_8
+#endif
+
+#ifdef HWY_NATIVE_ROL_ROR_16
+#undef HWY_NATIVE_ROL_ROR_16
+#else
+#define HWY_NATIVE_ROL_ROR_16
+#endif
+
+#ifdef HWY_NATIVE_ROL_ROR_32_64
+#undef HWY_NATIVE_ROL_ROR_32_64
+#else
+#define HWY_NATIVE_ROL_ROR_32_64
+#endif
+
+template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(
+      d, VFromD<decltype(du)>{vec_rl(BitCast(du, a).raw, BitCast(du, b).raw)});
+}
+
+template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
+  const DFromV<decltype(a)> d;
+  const RebindToSigned<decltype(d)> di;
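+  // vec_rl (via Rol) uses the rotate count modulo the lane width, so rotating
+  // left by the negated counts is the same as rotating right by b.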
+  return Rol(a, BitCast(d, Neg(BitCast(di, b))));
+}
+
 // ------------------------------ RotateRight
-template <int kBits, typename T, size_t N>
+template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
   const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
   constexpr size_t kSizeInBits = sizeof(T) * 8;
   static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
 
-  if (kBits == 0) return v;
+  return (kBits == 0)
+             ? v
+             : Rol(v, Set(d, static_cast<T>(static_cast<int>(kSizeInBits) -
+                                            kBits)));
+}
 
-  // Do an unsigned vec_rl operation to avoid undefined behavior
-  return BitCast(d, VFromD<decltype(du)>{vec_rl(
-                        BitCast(du, v).raw, Set(du, kSizeInBits - kBits).raw)});
+// ------------------------------ RotateLeftSame/RotateRightSame
+#ifdef HWY_NATIVE_ROL_ROR_SAME_8
+#undef HWY_NATIVE_ROL_ROR_SAME_8
+#else
+#define HWY_NATIVE_ROL_ROR_SAME_8
+#endif
+
+#ifdef HWY_NATIVE_ROL_ROR_SAME_16
+#undef HWY_NATIVE_ROL_ROR_SAME_16
+#else
+#define HWY_NATIVE_ROL_ROR_SAME_16
+#endif
+
+#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
+#undef HWY_NATIVE_ROL_ROR_SAME_32_64
+#else
+#define HWY_NATIVE_ROL_ROR_SAME_32_64
+#endif
+
+template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec128<T, N> RotateLeftSame(Vec128<T, N> v, int bits) {
+  const DFromV<decltype(v)> d;
+  return Rol(v, Set(d, static_cast<T>(static_cast<unsigned>(bits))));
+}
+
+template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
+HWY_API Vec128<T, N> RotateRightSame(Vec128<T, N> v, int bits) {
+  const DFromV<decltype(v)> d;
+  return Rol(v, Set(d, static_cast<T>(0u - static_cast<unsigned>(bits))));
 }
 
 // ------------------------------ ZeroIfNegative (BroadcastSignBit)
@@ -1658,8 +1722,7 @@
              BitCast(du, no).raw, BitCast(du, yes).raw, BitCast(du, v).raw)});
 #else
   const RebindToSigned<decltype(d)> di;
-  return IfThenElse(MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))),
-                    yes, no);
+  return IfVecThenElse(BitCast(d, BroadcastSignBit(BitCast(di, v))), yes, no);
 #endif
 }
 
@@ -3034,6 +3097,96 @@
   return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 8>{mask}), b, a);
 }
 
+// ------------------------------ InterleaveEven
+
+template <class D, HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+  const Full128<TFromD<D>> d_full;
+  const Indices128<TFromD<D>> idx{
+      Dup128VecFromValues(Full128<uint8_t>(), 0, 16, 2, 18, 4, 20, 6, 22, 8, 24,
+                          10, 26, 12, 28, 14, 30)
+          .raw};
+  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
+                                               ResizeBitCast(d_full, b), idx));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+  const Full128<TFromD<D>> d_full;
+  const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
+                                                      16, 17, 4, 5, 20, 21, 8,
+                                                      9, 24, 25, 12, 13, 28, 29)
+                                      .raw};
+  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
+                                               ResizeBitCast(d_full, b), idx));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+#if HWY_S390X_HAVE_Z14
+  const Full128<TFromD<D>> d_full;
+  const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
+                                                      2, 3, 16, 17, 18, 19, 8,
+                                                      9, 10, 11, 24, 25, 26, 27)
+                                      .raw};
+  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
+                                               ResizeBitCast(d_full, b), idx));
+#else
+  (void)d;
+  return VFromD<D>{vec_mergee(a.raw, b.raw)};
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return InterleaveLower(a, b);
+}
+
+// ------------------------------ InterleaveOdd
+
+template <class D, HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  const Full128<TFromD<D>> d_full;
+  const Indices128<TFromD<D>> idx{
+      Dup128VecFromValues(Full128<uint8_t>(), 1, 17, 3, 19, 5, 21, 7, 23, 9, 25,
+                          11, 27, 13, 29, 15, 31)
+          .raw};
+  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
+                                               ResizeBitCast(d_full, b), idx));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  const Full128<TFromD<D>> d_full;
+  const Indices128<TFromD<D>> idx{
+      Dup128VecFromValues(Full128<uint8_t>(), 2, 3, 18, 19, 6, 7, 22, 23, 10,
+                          11, 26, 27, 14, 15, 30, 31)
+          .raw};
+  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
+                                               ResizeBitCast(d_full, b), idx));
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+#if HWY_S390X_HAVE_Z14
+  const Full128<TFromD<D>> d_full;
+  const Indices128<TFromD<D>> idx{
+      Dup128VecFromValues(Full128<uint8_t>(), 4, 5, 6, 7, 20, 21, 22, 23, 12,
+                          13, 14, 15, 28, 29, 30, 31)
+          .raw};
+  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
+                                               ResizeBitCast(d_full, b), idx));
+#else
+  (void)d;
+  return VFromD<D>{vec_mergeo(a.raw, b.raw)};
+#endif
+}
+
+template <class D, HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  return InterleaveUpper(d, a, b);
+}
+
 // ------------------------------ OddEvenBlocks
 template <typename T, size_t N>
 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
@@ -3903,6 +4056,101 @@
 
 #endif  // HWY_PPC_HAVE_9
 
+#if HWY_PPC_HAVE_9
+
+#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
+#undef HWY_NATIVE_DEMOTE_F64_TO_F16
+#else
+#define HWY_NATIVE_DEMOTE_F64_TO_F16
+#endif
+
+namespace detail {
+
+// On big-endian PPC9, VsxXscvdphp converts vf64[0] to an F16, returned as a
+// U64 vector with the resulting F16 bits in the lower 16 bits of U64 lane 0.
+
+// On little-endian PPC9, VsxXscvdphp converts vf64[1] to an F16, returned as
+// a U64 vector with the resulting F16 bits in the lower 16 bits of U64 lane 1.
+static HWY_INLINE Vec128<uint64_t> VsxXscvdphp(Vec128<double> vf64) {
+  // Inline assembly is needed because there is currently no intrinsic
+  // available for the PPC9 xscvdphp instruction.
+  __vector unsigned long long raw_result;
+  __asm__("xscvdphp %x0, %x1" : "=wa"(raw_result) : "wa"(vf64.raw));
+  return Vec128<uint64_t>{raw_result};
+}
+
+}  // namespace detail
+
+template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 1)>
+HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
+  const RebindToUnsigned<decltype(df16)> du16;
+  const Rebind<uint64_t, decltype(df16)> du64;
+
+  const Full128<double> df64_full;
+#if HWY_IS_LITTLE_ENDIAN
+  const auto bits16_as_u64 =
+      UpperHalf(du64, detail::VsxXscvdphp(Combine(df64_full, v, v)));
+#else
+  const auto bits16_as_u64 =
+      LowerHalf(du64, detail::VsxXscvdphp(ResizeBitCast(df64_full, v)));
+#endif
+
+  return BitCast(df16, TruncateTo(du16, bits16_as_u64));
+}
+
+template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 2)>
+HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
+  const RebindToUnsigned<decltype(df16)> du16;
+  const Rebind<uint64_t, decltype(df16)> du64;
+  const Rebind<double, decltype(df16)> df64;
+
+#if HWY_IS_LITTLE_ENDIAN
+  const auto bits64_as_u64_0 = detail::VsxXscvdphp(InterleaveLower(df64, v, v));
+  const auto bits64_as_u64_1 = detail::VsxXscvdphp(v);
+  const auto bits64_as_u64 =
+      InterleaveUpper(du64, bits64_as_u64_0, bits64_as_u64_1);
+#else
+  const auto bits64_as_u64_0 = detail::VsxXscvdphp(v);
+  const auto bits64_as_u64_1 = detail::VsxXscvdphp(InterleaveUpper(df64, v, v));
+  const auto bits64_as_u64 =
+      InterleaveLower(du64, bits64_as_u64_0, bits64_as_u64_1);
+#endif
+
+  return BitCast(df16, TruncateTo(du16, bits64_as_u64));
+}
+
+#elif HWY_S390X_HAVE_Z14
+
+#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
+#undef HWY_NATIVE_DEMOTE_F64_TO_F16
+#else
+#define HWY_NATIVE_DEMOTE_F64_TO_F16
+#endif
+
+namespace detail {
+
+template <class DF32, HWY_IF_F32_D(DF32)>
+static HWY_INLINE VFromD<DF32> DemoteToF32WithRoundToOdd(
+    DF32 df32, VFromD<Rebind<double, DF32>> v) {
+  const Twice<DF32> dt_f32;
+
+  __vector float raw_f32_in_even;
+  __asm__("vledb %0,%1,0,3" : "=v"(raw_f32_in_even) : "v"(v.raw));
+
+  const VFromD<decltype(dt_f32)> f32_in_even{raw_f32_in_even};
+  return LowerHalf(df32, ConcatEven(dt_f32, f32_in_even, f32_in_even));
+}
+
+}  // namespace detail
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
+HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
+  const Rebind<float, decltype(df16)> df32;
+  return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v));
+}
+
+#endif  // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
+
 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
 HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
   const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
diff --git a/hwy/ops/rvv-inl.h b/hwy/ops/rvv-inl.h
index a431347..c1214c7 100644
--- a/hwy/ops/rvv-inl.h
+++ b/hwy/ops/rvv-inl.h
@@ -528,9 +528,9 @@
   }
 
 // mask = f(mask)
-#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
-  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) {   \
-    return __riscv_vm##OP##_m_b##MLEN(m, ~0ull);      \
+#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP)              \
+  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) {                \
+    return __riscv_vm##OP##_m_b##MLEN(m, HWY_RVV_AVL(SEW, SHIFT)); \
   }
 
 // ================================================== INIT
@@ -552,17 +552,17 @@
 // Treat bfloat16_t as int16_t (using the previously defined Set overloads);
 // required for Zero and VFromD.
 template <size_t N, int kPow2>
-decltype(Set(Simd<int16_t, N, kPow2>(), 0)) Set(Simd<bfloat16_t, N, kPow2> d,
-                                                bfloat16_t arg) {
+decltype(Set(Simd<int16_t, N, kPow2>(), 0)) Set(
+    Simd<hwy::bfloat16_t, N, kPow2> d, hwy::bfloat16_t arg) {
   return Set(RebindToSigned<decltype(d)>(), BitCastScalar<int16_t>(arg));
 }
 #if !HWY_HAVE_FLOAT16  // Otherwise already defined above.
 // WARNING: returns a different type than emulated bfloat16_t so that we can
 // implement PromoteTo overloads for both bfloat16_t and float16_t, and also
-// provide a Neg(float16_t) overload that coexists with Neg(int16_t).
+// provide a Neg(hwy::float16_t) overload that coexists with Neg(int16_t).
 template <size_t N, int kPow2>
-decltype(Set(Simd<uint16_t, N, kPow2>(), 0)) Set(Simd<float16_t, N, kPow2> d,
-                                                 float16_t arg) {
+decltype(Set(Simd<uint16_t, N, kPow2>(), 0)) Set(
+    Simd<hwy::float16_t, N, kPow2> d, hwy::float16_t arg) {
   return Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg));
 }
 #endif
@@ -760,7 +760,7 @@
 #else
 template <size_t N, int kPow2>
 HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
-    Simd<float16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
+    Simd<hwy::float16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
   return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
 }
 #endif
@@ -774,7 +774,8 @@
 
 template <size_t N, int kPow2>
 HWY_INLINE VFromD<Simd<int16_t, N, kPow2>> BitCastFromByte(
-    Simd<bfloat16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
+    Simd<hwy::bfloat16_t, N, kPow2> /* d */,
+    VFromD<Simd<uint8_t, N, kPow2>> v) {
   return BitCastFromByte(Simd<int16_t, N, kPow2>(), v);
 }
 
@@ -1088,12 +1089,16 @@
 }
 
 // ------------------------------ RotateRight
-template <int kBits, class V>
+template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
 HWY_API V RotateRight(const V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
   constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
   static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
   if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v),
+
+  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
             ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
 }
 
@@ -1312,32 +1317,6 @@
 // vboolXX_t is a power of two divisor for vector bits. SEW=8 / LMUL=1 = 1/8th
 // of all bits; SEW=8 / LMUL=4 = half of all bits.
 
-// SFINAE for mapping Simd<> to MLEN (up to 64).
-#define HWY_RVV_IF_MLEN_D(D, MLEN) \
-  hwy::EnableIf<MLenFromD(D()) == MLEN>* = nullptr
-
-// Specialized for RVV instead of the generic generic_ops-inl.h implementation
-// because more efficient, and helps implement MFromD.
-
-#ifdef HWY_NATIVE_MASK_FALSE
-#undef HWY_NATIVE_MASK_FALSE
-#else
-#define HWY_NATIVE_MASK_FALSE
-#endif
-
-#define HWY_RVV_MASK_FALSE(SEW, SHIFT, MLEN, NAME, OP) \
-  template <class D, HWY_RVV_IF_MLEN_D(D, MLEN)>       \
-  HWY_API HWY_RVV_M(MLEN) NAME(D d) {                  \
-    return __riscv_vm##OP##_m_b##MLEN(Lanes(d));       \
-  }
-
-HWY_RVV_FOREACH_B(HWY_RVV_MASK_FALSE, MaskFalse, clr)
-#undef HWY_RVV_MASK_FALSE
-#undef HWY_RVV_IF_MLEN_D
-
-template <class D>
-using MFromD = decltype(MaskFalse(D()));
-
 // mask = f(vector, vector)
 #define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
                            SHIFT, MLEN, NAME, OP)                           \
@@ -1476,11 +1455,32 @@
 #undef HWY_RVV_IF_THEN_ZERO_ELSE
 
 // ------------------------------ MaskFromVec
+
+template <class D>
+using MFromD = decltype(Eq(Zero(D()), Zero(D())));
+
 template <class V>
 HWY_API MFromD<DFromV<V>> MaskFromVec(const V v) {
   return detail::NeS(v, 0);
 }
 
+// ------------------------------ MaskFalse
+
+// For mask ops including vmclr, elements past VL are tail-agnostic and cannot
+// be relied upon, so define a variant of the generic_ops-inl implementation of
+// MaskFalse that ensures all bits are zero as required by mask_test.
+#ifdef HWY_NATIVE_MASK_FALSE
+#undef HWY_NATIVE_MASK_FALSE
+#else
+#define HWY_NATIVE_MASK_FALSE
+#endif
+
+template <class D>
+HWY_API MFromD<D> MaskFalse(D d) {
+  const DFromV<VFromD<decltype(d)>> d_full;
+  return MaskFromVec(Zero(d_full));
+}
+
 // ------------------------------ RebindMask
 template <class D, typename MFrom>
 HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
@@ -1498,10 +1498,12 @@
   template <size_t N>                                                          \
   HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
       NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) m) {              \
-    const RebindToSigned<decltype(d)> di;                                      \
+    /* MaskFalse requires we set all lanes for capped d and virtual LMUL. */   \
+    const DFromV<VFromD<decltype(d)>> d_full;                                  \
+    const RebindToSigned<decltype(d_full)> di;                                 \
     using TI = TFromD<decltype(di)>;                                           \
-    return BitCast(                                                            \
-        d, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m, Lanes(d)));       \
+    return BitCast(d_full, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m,   \
+                                                        Lanes(d_full)));       \
   }
 
 HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, VecFromMask, merge_vxm, _ALL_VIRT)
@@ -2105,7 +2107,7 @@
 
 template <size_t N, int kPow2>
 HWY_API auto PromoteTo(Simd<float32_t, N, kPow2> d,
-                       VFromD<Rebind<bfloat16_t, decltype(d)>> v)
+                       VFromD<Rebind<hwy::bfloat16_t, decltype(d)>> v)
     -> VFromD<decltype(d)> {
   const RebindToSigned<decltype(d)> di32;
   const Rebind<uint16_t, decltype(d)> du16;
@@ -2742,8 +2744,8 @@
 #undef HWY_RVV_DEMOTE_TO_SHR_16
 
 template <size_t N, int kPow2>
-HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> DemoteTo(
-    Simd<bfloat16_t, N, kPow2> d, VFromD<Simd<float, N, kPow2 + 1>> v) {
+HWY_API VFromD<Simd<hwy::bfloat16_t, N, kPow2>> DemoteTo(
+    Simd<hwy::bfloat16_t, N, kPow2> d, VFromD<Simd<float, N, kPow2 + 1>> v) {
   const RebindToUnsigned<decltype(d)> du16;
   const Rebind<uint32_t, decltype(d)> du32;
   return BitCast(d, detail::DemoteToShr16(du16, BitCast(du32, v)));
@@ -2813,7 +2815,7 @@
 
 template <class D, class V>
 HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
-  using T = MakeUnsigned<TFromD<D>>;
+  using T = MakeUnsigned<TFromV<V>>;
   return AndS(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
 }
 
@@ -3027,7 +3029,7 @@
 
 // ------------------------------ InsertLane
 
-// T template arg because TFromV<V> might not match the float16_t argument.
+// T template arg because TFromV<V> might not match the hwy::float16_t argument.
 template <class V, typename T, HWY_IF_NOT_T_SIZE_V(V, 1)>
 HWY_API V InsertLane(const V v, size_t i, T t) {
   const Rebind<T, DFromV<V>> d;
@@ -3101,6 +3103,18 @@
   return OddEven(v, down);
 }
 
+// ------------------------------ InterleaveEven (OddEven)
+template <class D>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
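+  // Slide1Up(b) moves b[2k] into odd lane 2k + 1; OddEven then combines these
+  // odd lanes with a's even lanes, yielding {a[0], b[0], a[2], b[2], ...}.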
+  return OddEven(detail::Slide1Up(b), a);
+}
+
+// ------------------------------ InterleaveOdd (OddEven)
+template <class D>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return OddEven(b, detail::Slide1Down(a));
+}
+
 // ------------------------------ OddEvenBlocks
 template <class V>
 HWY_API V OddEvenBlocks(const V a, const V b) {
@@ -3231,6 +3245,67 @@
   return TableLookupLanes(v, idx);
 }
 
+// ------------------------------ ResizeBitCast
+
+// Extends or truncates a vector to match the given d.
+namespace detail {
+
+template <class D>
+HWY_INLINE VFromD<D> ChangeLMUL(D /* d */, VFromD<D> v) {
+  return v;
+}
+
+// Sanity check: when ChangeLMUL is called, the caller (ResizeBitCast) has
+// already BitCast to the same lane type. Note that V may use the native lane
+// type for f16, so convert D to that before checking.
+#define HWY_RVV_IF_SAME_T_DV(D, V) \
+  hwy::EnableIf<IsSame<NativeLaneType<TFromD<D>>, TFromV<V>>()>* = nullptr
+
+// LMUL of VFromD<D> < LMUL of V: need to truncate v
+template <class D, class V,  // HWY_RVV_IF_SAME_T_DV(D, V),
+          HWY_IF_POW2_LE_D(DFromV<VFromD<D>>, DFromV<V>().Pow2() - 1)>
+HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
+  const DFromV<V> d_from;
+  const Half<decltype(d_from)> dh_from;
+  static_assert(
+      DFromV<VFromD<decltype(dh_from)>>().Pow2() < DFromV<V>().Pow2(),
+      "The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V");
+  static_assert(
+      DFromV<VFromD<D>>().Pow2() <= DFromV<VFromD<decltype(dh_from)>>().Pow2(),
+      "The LMUL of VFromD<D> must be less than or equal to the LMUL of "
+      "VFromD<decltype(dh_from)>");
+  return ChangeLMUL(d, Trunc(v));
+}
+
+// LMUL of VFromD<D> > LMUL of V: need to extend v
+template <class D, class V,  // HWY_RVV_IF_SAME_T_DV(D, V),
+          HWY_IF_POW2_GT_D(DFromV<VFromD<D>>, DFromV<V>().Pow2())>
+HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
+  const DFromV<V> d_from;
+  const Twice<decltype(d_from)> dt_from;
+  static_assert(DFromV<VFromD<decltype(dt_from)>>().Pow2() > DFromV<V>().Pow2(),
+                "The LMUL of VFromD<decltype(dt_from)> must be greater than "
+                "the LMUL of V");
+  static_assert(
+      DFromV<VFromD<D>>().Pow2() >= DFromV<VFromD<decltype(dt_from)>>().Pow2(),
+      "The LMUL of VFromD<D> must be greater than or equal to the LMUL of "
+      "VFromD<decltype(dt_from)>");
+  return ChangeLMUL(d, Ext(dt_from, v));
+}
+
+#undef HWY_RVV_IF_SAME_T_DV
+
+}  // namespace detail
+
+template <class DTo, class VFrom>
+HWY_API VFromD<DTo> ResizeBitCast(DTo /*dto*/, VFrom v) {
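+  // BitCast both sides to u8 so detail::ChangeLMUL only has to handle a
+  // single lane type when extending or truncating the register group.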
+  const DFromV<decltype(v)> d_from;
+  const Repartition<uint8_t, decltype(d_from)> du8_from;
+  const DFromV<VFromD<DTo>> d_to;
+  const Repartition<uint8_t, decltype(d_to)> du8_to;
+  return BitCast(d_to, detail::ChangeLMUL(du8_to, BitCast(du8_from, v)));
+}
+
 // ------------------------------ Reverse2 (RotateRight, OddEven)
 
 // Per-target flags to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
@@ -3592,50 +3667,6 @@
 
 // ------------------------------ TableLookupBytes
 
-// Extends or truncates a vector to match the given d.
-namespace detail {
-
-template <class D>
-HWY_INLINE VFromD<D> ChangeLMUL(D /* d */, VFromD<D> v) {
-  return v;
-}
-
-// LMUL of VFromD<D> < LMUL of V: need to truncate v
-template <class D, class V,
-          hwy::EnableIf<IsSame<TFromD<D>, TFromV<V>>()>* = nullptr,
-          HWY_IF_POW2_LE_D(DFromV<VFromD<D>>, DFromV<V>().Pow2() - 1)>
-HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
-  const DFromV<decltype(v)> d_from;
-  const Half<decltype(d_from)> dh_from;
-  static_assert(
-      DFromV<VFromD<decltype(dh_from)>>().Pow2() < DFromV<V>().Pow2(),
-      "The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V");
-  static_assert(
-      DFromV<VFromD<D>>().Pow2() <= DFromV<VFromD<decltype(dh_from)>>().Pow2(),
-      "The LMUL of VFromD<D> must be less than or equal to the LMUL of "
-      "VFromD<decltype(dh_from)>");
-  return ChangeLMUL(d, Trunc(v));
-}
-
-// LMUL of VFromD<D> > LMUL of V: need to extend v
-template <class D, class V,
-          hwy::EnableIf<IsSame<TFromD<D>, TFromV<V>>()>* = nullptr,
-          HWY_IF_POW2_GT_D(DFromV<VFromD<D>>, DFromV<V>().Pow2())>
-HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
-  const DFromV<decltype(v)> d_from;
-  const Twice<decltype(d_from)> dt_from;
-  static_assert(DFromV<VFromD<decltype(dt_from)>>().Pow2() > DFromV<V>().Pow2(),
-                "The LMUL of VFromD<decltype(dt_from)> must be greater than "
-                "the LMUL of V");
-  static_assert(
-      DFromV<VFromD<D>>().Pow2() >= DFromV<VFromD<decltype(dt_from)>>().Pow2(),
-      "The LMUL of VFromD<D> must be greater than or equal to the LMUL of "
-      "VFromD<decltype(dt_from)>");
-  return ChangeLMUL(d, Ext(dt_from, v));
-}
-
-}  // namespace detail
-
 template <class VT, class VI>
 HWY_API VI TableLookupBytes(const VT vt, const VI vi) {
   const DFromV<VT> dt;  // T=table, I=index.
@@ -3712,8 +3743,9 @@
 template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
           HWY_IF_POW2_LE_D(D, 2)>
 HWY_API V Broadcast(const V v) {
-  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
   const D d;
+  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
+
   const Rebind<uint16_t, decltype(d)> du16;
   VFromD<decltype(du16)> idx =
       detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
@@ -3727,8 +3759,9 @@
 template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
           HWY_IF_POW2_GT_D(D, 2)>
 HWY_API V Broadcast(const V v) {
-  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
   const D d;
+  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
+
   const Half<decltype(d)> dh;
   using VH = VFromD<decltype(dh)>;
   const Rebind<uint16_t, decltype(dh)> du16;
@@ -3739,14 +3772,15 @@
   }
   const VH lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx);
   const VH hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx);
-  return Combine(d, lo, hi);
+  return Combine(d, hi, lo);
 }
 
 template <int kLane, class V, class D = DFromV<V>,
           HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
 HWY_API V Broadcast(const V v) {
-  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
   const D d;
+  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
+
   const RebindToUnsigned<decltype(d)> du;
   auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(du));
   if (kLane != 0) {
@@ -3931,17 +3965,51 @@
 #define HWY_NATIVE_INTERLEAVE_WHOLE
 #endif
 
+namespace detail {
+// Returns double-length vector with interleaved lanes.
+template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
+          HWY_IF_POW2_GT_D(D, -3)>
+HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  using TW = MakeWide<TFromD<decltype(du)>>;
+  const Rebind<TW, Half<decltype(du)>> dw;
+  const Half<decltype(du)> duh;  // cast inputs to unsigned so we zero-extend
+
+  const VFromD<decltype(dw)> aw = PromoteTo(dw, BitCast(duh, a));
+  const VFromD<decltype(dw)> bw = PromoteTo(dw, BitCast(duh, b));
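+  // aw and bw hold a[i] and b[i] zero-extended into the low half of each wide
+  // lane; sliding bw up one narrow lane moves b[i] into the high (odd) half.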
+  return BitCast(d, Or(aw, BitCast(dw, detail::Slide1Up(BitCast(du, bw)))));
+}
+// 64-bit: cannot PromoteTo, but can Ext.
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
+HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
+  const RebindToUnsigned<decltype(d)> du;
+  const auto idx = ShiftRight<1>(detail::Iota0(du));
+  return OddEven(TableLookupLanes(detail::Ext(d, b), idx),
+                 TableLookupLanes(detail::Ext(d, a), idx));
+}
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_GT_D(D, 2)>
+HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
+  const Half<D> dh;
+  const Half<decltype(dh)> dq;
+  const VFromD<decltype(dh)> i0 =
+      InterleaveWhole(dh, LowerHalf(dq, a), LowerHalf(dq, b));
+  const VFromD<decltype(dh)> i1 =
+      InterleaveWhole(dh, UpperHalf(dq, a), UpperHalf(dq, b));
+  return Combine(d, i1, i0);
+}
+
+}  // namespace detail
+
 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
 HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
   const RebindToUnsigned<decltype(d)> du;
   const detail::AdjustSimdTagToMinVecPow2<RepartitionToWide<decltype(du)>> dw;
   const RepartitionToNarrow<decltype(dw)> du_src;
 
-  const auto aw =
+  const VFromD<D> aw =
       ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, a)));
-  const auto bw =
+  const VFromD<D> bw =
       ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, b)));
-
   return Or(aw, detail::Slide1Up(bw));
 }
 
@@ -3980,32 +4048,106 @@
 
 // ------------------------------ InterleaveLower (InterleaveWholeLower)
 
-// Using InterleaveWholeLower and 64-bit Compress avoids 8-bit overflow.
+namespace detail {
 
-// More than one block: match x86 semantics (independent blocks).
-template <class D, class V, HWY_IF_V_SIZE_GT_D(D, 16)>
-HWY_API V InterleaveLower(D d, const V a, const V b) {
+// Definitely at least 128 bit: match x86 semantics (independent blocks). Using
+// InterleaveWhole and 64-bit Compress avoids 8-bit overflow.
+template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
+HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
   static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
-  // Pad vectors to at least one u64.
-  const ScalableTag<uint64_t, HWY_MAX(d.Pow2(), -1)> du64;
-  const Repartition<TFromD<D>, decltype(du64)> d_resized;
-  const MFromD<decltype(du64)> is_even = detail::IsEven(du64);
-  // Concat the lower halves of 128-bit blocks into the lower half, which is all
-  // that InterleaveWholeLower uses. This is faster than ConcatEven.
-  const VFromD<decltype(du64)> a_lo = Compress(ResizeBitCast(du64, a), is_even);
-  const VFromD<decltype(du64)> b_lo = Compress(ResizeBitCast(du64, b), is_even);
-  return ResizeBitCast(
-      d, InterleaveWholeLower(d_resized, ResizeBitCast(d_resized, a_lo),
-                              ResizeBitCast(d_resized, b_lo)));
+  const Twice<D> dt;
+  const RebindToUnsigned<decltype(dt)> dt_u;
+  const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
+  // Keep only even 128-bit blocks. This is faster than u64 ConcatEven
+  // because we only have a single vector.
+  constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
+  const VFromD<decltype(dt_u)> idx_block =
+      ShiftRight<kShift>(detail::Iota0(dt_u));
+  const MFromD<decltype(dt_u)> is_even =
+      detail::EqS(detail::AndS(idx_block, 1), 0);
+  return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_even)));
+}
+template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
+HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
+  const Half<D> dh;
+  const VFromD<decltype(dh)> i0 =
+      InterleaveLowerBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
+  const VFromD<decltype(dh)> i1 =
+      InterleaveLowerBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
+  return Combine(d, i1, i0);
+}
+
+// As above, for the upper half of blocks.
+template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
+HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
+  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
+  const Twice<D> dt;
+  const RebindToUnsigned<decltype(dt)> dt_u;
+  const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
+  // Keep only odd 128-bit blocks. This is faster than u64 ConcatOdd
+  // because we only have a single vector.
+  constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
+  const VFromD<decltype(dt_u)> idx_block =
+      ShiftRight<kShift>(detail::Iota0(dt_u));
+  const MFromD<decltype(dt_u)> is_odd =
+      detail::EqS(detail::AndS(idx_block, 1), 1);
+  return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_odd)));
+}
+template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
+HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
+  const Half<D> dh;
+  const VFromD<decltype(dh)> i0 =
+      InterleaveUpperBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
+  const VFromD<decltype(dh)> i1 =
+      InterleaveUpperBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
+  return Combine(d, i1, i0);
+}
+
+// RVV vectors are at least 128 bit when there is no fractional LMUL nor cap.
+// Used by functions with per-block behavior such as InterleaveLower.
+template <typename T, size_t N, int kPow2>
+constexpr bool IsGE128(Simd<T, N, kPow2> /* d */) {
+  return N * sizeof(T) >= 16 && kPow2 >= 0;
+}
+
+// Definitely less than 128 bits only if there is a small cap; a fractional
+// LMUL alone is not sufficient because the hardware vectors may be large.
+template <typename T, size_t N, int kPow2>
+constexpr bool IsLT128(Simd<T, N, kPow2> /* d */) {
+  return N * sizeof(T) < 16;
+}
+
+}  // namespace detail
+
+#define HWY_RVV_IF_GE128_D(D) hwy::EnableIf<detail::IsGE128(D())>* = nullptr
+#define HWY_RVV_IF_LT128_D(D) hwy::EnableIf<detail::IsLT128(D())>* = nullptr
+#define HWY_RVV_IF_CAN128_D(D) \
+  hwy::EnableIf<!detail::IsLT128(D()) && !detail::IsGE128(D())>* = nullptr
+
+template <class D, class V, HWY_RVV_IF_GE128_D(D)>
+HWY_API V InterleaveLower(D d, const V a, const V b) {
+  return detail::InterleaveLowerBlocks(d, a, b);
 }
 
 // Single block: interleave without extra Compress.
-template <class D, class V, HWY_IF_V_SIZE_LE_D(D, 16)>
+template <class D, class V, HWY_RVV_IF_LT128_D(D)>
 HWY_API V InterleaveLower(D d, const V a, const V b) {
   static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
   return InterleaveWholeLower(d, a, b);
 }
 
+// Could be either; branch at runtime.
+template <class D, class V, HWY_RVV_IF_CAN128_D(D)>
+HWY_API V InterleaveLower(D d, const V a, const V b) {
+  if (Lanes(d) * sizeof(TFromD<D>) <= 16) {
+    return InterleaveWholeLower(d, a, b);
+  }
+  // Fractional LMUL: use LMUL=1 to ensure we can cast to u64.
+  const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
+  return ResizeBitCast(d, detail::InterleaveLowerBlocks(
+                              d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
+}
+
 template <class V>
 HWY_API V InterleaveLower(const V a, const V b) {
   return InterleaveLower(DFromV<V>(), a, b);
@@ -4013,31 +4155,30 @@
 
 // ------------------------------ InterleaveUpper (Compress)
 
-// More than one block: match x86 semantics (independent blocks).
-template <class D, class V, HWY_IF_V_SIZE_GT_D(D, 16)>
-HWY_API V InterleaveUpper(const D d, const V a, const V b) {
-  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
-  // Pad vectors to at least one u64.
-  const ScalableTag<uint64_t, HWY_MAX(d.Pow2(), -1)> du64;
-  const Repartition<TFromD<D>, decltype(du64)> d_resized;
-  const MFromD<decltype(du64)> is_odd = detail::IsOdd(du64);
-  // Concat the upper halves of 128-bit blocks into the lower half, which is all
-  // that InterleaveWholeLower uses. This is faster than ConcatOdd.
-  const VFromD<decltype(du64)> a_lo = Compress(ResizeBitCast(du64, a), is_odd);
-  const VFromD<decltype(du64)> b_lo = Compress(ResizeBitCast(du64, b), is_odd);
-  // Still using InterleaveWholeLower because Compress filled the lower half.
-  return ResizeBitCast(
-      d, InterleaveWholeLower(d_resized, ResizeBitCast(d_resized, a_lo),
-                              ResizeBitCast(d_resized, b_lo)));
+template <class D, class V, HWY_RVV_IF_GE128_D(D)>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
+  return detail::InterleaveUpperBlocks(d, a, b);
 }
 
 // Single block: interleave without extra Compress.
-template <class D, class V, HWY_IF_V_SIZE_LE_D(D, 16)>
+template <class D, class V, HWY_RVV_IF_LT128_D(D)>
 HWY_API V InterleaveUpper(D d, const V a, const V b) {
   static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
   return InterleaveWholeUpper(d, a, b);
 }
 
+// Could be either; branch at runtime.
+template <class D, class V, HWY_RVV_IF_CAN128_D(D)>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
+  if (Lanes(d) * sizeof(TFromD<D>) <= 16) {
+    return InterleaveWholeUpper(d, a, b);
+  }
+  // Fractional LMUL: use LMUL=1 to ensure we can cast to u64.
+  const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
+  return ResizeBitCast(d, detail::InterleaveUpperBlocks(
+                              d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
+}
+
 // ------------------------------ ZipLower
 
 template <class V, class DW = RepartitionToWide<DFromV<V>>>
@@ -4482,25 +4623,14 @@
 
 #endif  // HWY_HAVE_TUPLE
 
-// ------------------------------ ResizeBitCast
-
-template <class D, class FromV>
-HWY_API VFromD<D> ResizeBitCast(D /*d*/, FromV v) {
-  const DFromV<decltype(v)> d_from;
-  const Repartition<uint8_t, decltype(d_from)> du8_from;
-  const DFromV<VFromD<D>> d_to;
-  const Repartition<uint8_t, decltype(d_to)> du8_to;
-  return BitCast(d_to, detail::ChangeLMUL(du8_to, BitCast(du8_from, v)));
-}
-
 // ------------------------------ Dup128VecFromValues (ResizeBitCast)
 
-template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
   return Set(d, t0);
 }
 
-template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
+template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
 HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
   const auto even_lanes = Set(d, t0);
 #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
@@ -4858,10 +4988,23 @@
 
 // ------------------------------ Dup128MaskFromMaskBits
 
+namespace detail {
+// Even though this is only used after checking if (kN < X), this helper
+// function prevents "shift count exceeded" compile errors when kN >= 32.
+template <size_t kN, HWY_IF_LANES_LE(kN, 31)>
+constexpr unsigned MaxMaskBits() {
+  return (1u << kN) - 1;
+}
+template <size_t kN, HWY_IF_LANES_GT(kN, 31)>
+constexpr unsigned MaxMaskBits() {
+  return ~0u;
+}
+}  // namespace detail
+
 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_LE_D(D, 8)>
 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
   constexpr size_t kN = MaxLanes(d);
-  if (kN < 8) mask_bits &= (1u << kN) - 1;
+  if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();
 
 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
   return detail::U8MaskBitsVecToMask(
@@ -4882,11 +5025,13 @@
 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 8)>
 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  const ScalableTag<uint16_t> du16;
+  // There are exactly 16 mask bits for 128 vector bits of 8-bit lanes.
   return detail::U8MaskBitsVecToMask(
-      d,
-      BitCast(ScalableTag<uint8_t>(),
-              Set(ScalableTag<uint16_t>(), static_cast<uint16_t>(mask_bits))));
+      d, BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits))));
 #else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
   const RebindToUnsigned<decltype(d)> du8;
   const Repartition<uint16_t, decltype(du8)> du16;
   const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>>
@@ -4908,56 +5053,55 @@
 template <class D, HWY_IF_T_SIZE_D(D, 2)>
 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
   constexpr size_t kN = MaxLanes(d);
-  if (kN < 8) mask_bits &= (1u << kN) - 1;
+  if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();
 
 #if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
-  return detail::U8MaskBitsVecToMask(
-      d, Set(ScalableTag<uint8_t>(), static_cast<uint8_t>(mask_bits)));
+  const ScalableTag<uint8_t> du8;
+  // There are exactly 8 mask bits for 128 vector bits of 16-bit lanes.
+  return detail::U8MaskBitsVecToMask(d,
+                                     Set(du8, static_cast<uint8_t>(mask_bits)));
 #else
-  const Rebind<uint8_t, detail::AdjustSimdTagToMinVecPow2<decltype(d)>> du8;
-  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>>
-      du64;
-
-  const auto bytes = ResizeBitCast(
-      du8, detail::AndS(
-               ResizeBitCast(du64, Set(du8, static_cast<uint8_t>(mask_bits))),
-               uint64_t{0x8040201008040201u}));
-  return detail::NeS(bytes, uint8_t{0});
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<D> du;
+  const VFromD<decltype(du)> bits =
+      Shl(Set(du, uint16_t{1}), Iota(du, uint16_t{0}));
+  return TestBit(Set(du, static_cast<uint16_t>(mask_bits)), bits);
 #endif
 }
 
 template <class D, HWY_IF_T_SIZE_D(D, 4)>
 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
   constexpr size_t kN = MaxLanes(d);
-  if (kN < 4) {
-    mask_bits &= (1u << kN) - 1;
-  }
+  if (kN < 4) mask_bits &= detail::MaxMaskBits<kN>();
 
-  const Rebind<uint8_t, detail::AdjustSimdTagToMinVecPow2<decltype(d)>> du8;
-  const detail::AdjustSimdTagToMinVecPow2<Repartition<uint32_t, decltype(du8)>>
-      du32;
-
-  const auto bytes = ResizeBitCast(
-      du8, detail::AndS(
-               ResizeBitCast(du32, Set(du8, static_cast<uint8_t>(mask_bits))),
-               uint32_t{0x08040201u}));
-  return detail::NeS(bytes, uint8_t{0});
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  return detail::U8MaskBitsVecToMask(
+      d, Set(du8, static_cast<uint8_t>(mask_bits * 0x11)));
+#else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<D> du;
+  const VFromD<decltype(du)> bits =
+      Shl(Set(du, uint32_t{1}), Iota(du, uint32_t{0}));
+  return TestBit(Set(du, static_cast<uint32_t>(mask_bits)), bits);
+#endif
 }
 
 template <class D, HWY_IF_T_SIZE_D(D, 8)>
 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
-  if (MaxLanes(d) < 2) {
-    mask_bits &= 1u;
-  }
+  constexpr size_t kN = MaxLanes(d);
+  if (kN < 2) mask_bits &= detail::MaxMaskBits<kN>();
 
-  const Rebind<uint8_t, detail::AdjustSimdTagToMinVecPow2<decltype(d)>> du8;
-  const Repartition<uint16_t, decltype(du8)> du16;
-
-  const auto bytes = BitCast(
-      du8,
-      detail::AndS(BitCast(du16, Set(du8, static_cast<uint8_t>(mask_bits))),
-                   uint16_t{0x0201u}));
-  return detail::NeS(bytes, uint8_t{0});
+#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
+  const ScalableTag<uint8_t> du8;
+  return detail::U8MaskBitsVecToMask(
+      d, Set(du8, static_cast<uint8_t>(mask_bits * 0x55)));
+#else
+  // Slow fallback for completeness; the above bits to mask cast is preferred.
+  const RebindToUnsigned<D> du;
+  const VFromD<decltype(du)> bits = Dup128VecFromValues(du, 0, 1);
+  return TestBit(Set(du, static_cast<uint64_t>(mask_bits)), bits);
+#endif
 }
 
 // ------------------------------ Neg (Sub)
@@ -4979,7 +5123,7 @@
 
 #if !HWY_HAVE_FLOAT16
 
-template <class V, HWY_IF_U16_D(DFromV<V>)>  // float16_t
+template <class V, HWY_IF_U16_D(DFromV<V>)>  // hwy::float16_t
 HWY_API V Neg(V v) {
   const DFromV<decltype(v)> d;
   const RebindToUnsigned<decltype(d)> du;
@@ -5165,8 +5309,8 @@
 // ------------------------------ ReorderDemote2To (OddEven, Combine)
 
 template <size_t N, int kPow2>
-HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> ReorderDemote2To(
-    Simd<bfloat16_t, N, kPow2> dbf16,
+HWY_API VFromD<Simd<hwy::bfloat16_t, N, kPow2>> ReorderDemote2To(
+    Simd<hwy::bfloat16_t, N, kPow2> dbf16,
     VFromD<RepartitionToWide<decltype(dbf16)>> a,
     VFromD<RepartitionToWide<decltype(dbf16)>> b) {
   const RebindToUnsigned<decltype(dbf16)> du16;
@@ -5258,7 +5402,7 @@
 // ------------------------------ WidenMulPairwiseAdd
 
 template <class D32, HWY_IF_F32_D(D32),
-          class V16 = VFromD<Repartition<bfloat16_t, D32>>>
+          class V16 = VFromD<Repartition<hwy::bfloat16_t, D32>>>
 HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
   const RebindToUnsigned<decltype(df32)> du32;
   using VU32 = VFromD<decltype(du32)>;
@@ -5302,7 +5446,7 @@
 // Non-overloaded wrapper function so we can define DF32 in template args.
 template <size_t N, int kPow2, class DF32 = Simd<float, N, kPow2>,
           class VF32 = VFromD<DF32>,
-          class DBF16 = Repartition<bfloat16_t, Simd<float, N, kPow2>>>
+          class DBF16 = Repartition<hwy::bfloat16_t, Simd<float, N, kPow2>>>
 HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df32,
                                            VFromD<DBF16> a, VFromD<DBF16> b,
                                            const VF32 sum0, VF32& sum1) {
@@ -5659,6 +5803,9 @@
 #undef HWY_RVV_FOREACH_UI3264
 #undef HWY_RVV_FOREACH_UI64
 #undef HWY_RVV_IF_EMULATED_D
+#undef HWY_RVV_IF_CAN128_D
+#undef HWY_RVV_IF_GE128_D
+#undef HWY_RVV_IF_LT128_D
 #undef HWY_RVV_INSERT_VXRM
 #undef HWY_RVV_M
 #undef HWY_RVV_RETM_ARGM
diff --git a/hwy/ops/scalar-inl.h b/hwy/ops/scalar-inl.h
index 49cb26e..d9e86be 100644
--- a/hwy/ops/scalar-inl.h
+++ b/hwy/ops/scalar-inl.h
@@ -139,7 +139,7 @@
   using TFrom = TFromV<FromV>;
   using TTo = TFromD<D>;
   constexpr size_t kCopyLen = HWY_MIN(sizeof(TFrom), sizeof(TTo));
-  TTo to = TTo{0};
+  TTo to{};
   CopyBytes<kCopyLen>(&v.raw, &to);
   return VFromD<D>(to);
 }
@@ -496,12 +496,16 @@
 }
 
 // ------------------------------ RotateRight (ShiftRight)
-template <int kBits, typename T>
+template <int kBits, typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec1<T> RotateRight(const Vec1<T> v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
   constexpr size_t kSizeInBits = sizeof(T) * 8;
-  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift");
+  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
   if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v),
+
+  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
             ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
 }
 
@@ -1342,7 +1346,7 @@
 
 template <class ToT, class ToTypeTag, class FromT>
 HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) {
-  return static_cast<ToT>(val);
+  return ConvertScalarTo<ToT>(val);
 }
 
 template <class ToT>
@@ -1359,6 +1363,12 @@
 
 }  // namespace detail
 
+#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
+#undef HWY_NATIVE_PROMOTE_F16_TO_F64
+#else
+#define HWY_NATIVE_PROMOTE_F16_TO_F64
+#endif
+
 template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
 HWY_API Vec1<TTo> PromoteTo(DTo /* tag */, Vec1<TFrom> from) {
   static_assert(sizeof(TTo) > sizeof(TFrom), "Not promoting");
diff --git a/hwy/ops/wasm_128-inl.h b/hwy/ops/wasm_128-inl.h
index 481b470..6e09e08 100644
--- a/hwy/ops/wasm_128-inl.h
+++ b/hwy/ops/wasm_128-inl.h
@@ -654,12 +654,16 @@
 }
 
 // ------------------------------ RotateRight (ShiftRight, Or)
-template <int kBits, typename T, size_t N>
+template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
   constexpr size_t kSizeInBits = sizeof(T) * 8;
   static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
+
   if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v),
+  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
             ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
 }
 
@@ -3806,6 +3810,50 @@
   return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
 }
 
+// ------------------------------ InterleaveEven
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 2, 18, 4, 20, 6, 22,
+                                      8, 24, 10, 26, 12, 28, 14, 30)};
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 2, 10, 4, 12, 6, 14)};
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 2, 6)};
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return InterleaveLower(a, b);
+}
+
+// ------------------------------ InterleaveOdd
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 17, 3, 19, 5, 21, 7, 23,
+                                      9, 25, 11, 27, 13, 29, 15, 31)};
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 9, 3, 11, 5, 13, 7, 15)};
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 5, 3, 7)};
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  return InterleaveUpper(d, a, b);
+}
+
 // ------------------------------ OddEvenBlocks
 template <typename T, size_t N>
 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
diff --git a/hwy/ops/wasm_256-inl.h b/hwy/ops/wasm_256-inl.h
index 3a85f90..d5578ab 100644
--- a/hwy/ops/wasm_256-inl.h
+++ b/hwy/ops/wasm_256-inl.h
@@ -245,12 +245,17 @@
 }
 
 // ------------------------------ RotateRight (ShiftRight, Or)
-template <int kBits, typename T>
+template <int kBits, typename T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
 HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+
   constexpr size_t kSizeInBits = sizeof(T) * 8;
   static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
   if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
+
+  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
+            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
 }
 
 // ------------------------------ Shift lanes by same variable #bits
@@ -1368,6 +1373,24 @@
   return a;
 }
 
+// ------------------------------ InterleaveEven
+template <class D, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+  const Half<decltype(d)> dh;
+  a.v0 = InterleaveEven(dh, a.v0, b.v0);
+  a.v1 = InterleaveEven(dh, a.v1, b.v1);
+  return a;
+}
+
+// ------------------------------ InterleaveOdd
+template <class D, HWY_IF_V_SIZE_D(D, 32)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  const Half<decltype(d)> dh;
+  a.v0 = InterleaveOdd(dh, a.v0, b.v0);
+  a.v1 = InterleaveOdd(dh, a.v1, b.v1);
+  return a;
+}
+
 // ------------------------------ OddEvenBlocks
 template <typename T>
 HWY_API Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) {
diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h
index 7f777c8..521f82c 100644
--- a/hwy/ops/x86_128-inl.h
+++ b/hwy/ops/x86_128-inl.h
@@ -1072,6 +1072,101 @@
                       MH{static_cast<RawMH>(a.raw)});
 }
 
+// ------------------------------ Slide mask up/down
+#ifdef HWY_NATIVE_SLIDE_MASK
+#undef HWY_NATIVE_SLIDE_MASK
+#else
+#define HWY_NATIVE_SLIDE_MASK
+#endif
+
+template <class D, HWY_IF_LANES_LE_D(D, 8)>
+HWY_API MFromD<D> SlideMask1Up(D d, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+  constexpr size_t kN = MaxLanes(d);
+  constexpr unsigned kValidLanesMask = (1u << kN) - 1u;
+
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  MFromD<D> result_mask{
+      static_cast<RawM>(_kshiftli_mask8(static_cast<__mmask8>(m.raw), 1))};
+
+  if (kN < 8) {
+    result_mask =
+        And(result_mask, MFromD<D>{static_cast<RawM>(kValidLanesMask)});
+  }
+#else
+  MFromD<D> result_mask{
+      static_cast<RawM>((static_cast<unsigned>(m.raw) << 1) & kValidLanesMask)};
+#endif
+
+  return result_mask;
+}
+
+template <class D, HWY_IF_LANES_D(D, 16)>
+HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return MFromD<D>{
+      static_cast<RawM>(_kshiftli_mask16(static_cast<__mmask16>(m.raw), 1))};
+#else
+  return MFromD<D>{static_cast<RawM>(static_cast<unsigned>(m.raw) << 1)};
+#endif
+}
+
+template <class D, HWY_IF_LANES_LE_D(D, 8)>
+HWY_API MFromD<D> SlideMask1Down(D d, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+  constexpr size_t kN = MaxLanes(d);
+  constexpr unsigned kValidLanesMask = (1u << kN) - 1u;
+
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  if (kN < 8) {
+    m = And(m, MFromD<D>{static_cast<RawM>(kValidLanesMask)});
+  }
+
+  return MFromD<D>{
+      static_cast<RawM>(_kshiftri_mask8(static_cast<__mmask8>(m.raw), 1))};
+#else
+  return MFromD<D>{
+      static_cast<RawM>((static_cast<unsigned>(m.raw) & kValidLanesMask) >> 1)};
+#endif
+}
+
+template <class D, HWY_IF_LANES_D(D, 16)>
+HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return MFromD<D>{
+      static_cast<RawM>(_kshiftri_mask16(static_cast<__mmask16>(m.raw), 1))};
+#else
+  return MFromD<D>{
+      static_cast<RawM>((static_cast<unsigned>(m.raw) & 0xFFFFu) >> 1)};
+#endif
+}
+
+// Generic for all vector lengths
+template <class D>
+HWY_API MFromD<D> SlideMaskUpLanes(D d, MFromD<D> m, size_t amt) {
+  using RawM = decltype(MFromD<D>().raw);
+  constexpr size_t kN = MaxLanes(d);
+  constexpr uint64_t kValidLanesMask =
+      static_cast<uint64_t>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL);
+
+  return MFromD<D>{static_cast<RawM>(
+      (static_cast<uint64_t>(m.raw) << (amt & 63)) & kValidLanesMask)};
+}
+
+// Generic for all vector lengths
+template <class D>
+HWY_API MFromD<D> SlideMaskDownLanes(D d, MFromD<D> m, size_t amt) {
+  using RawM = decltype(MFromD<D>().raw);
+  constexpr size_t kN = MaxLanes(d);
+  constexpr uint64_t kValidLanesMask =
+      static_cast<uint64_t>(((kN < 64) ? (1ULL << kN) : 0ULL) - 1ULL);
+
+  return MFromD<D>{static_cast<RawM>(
+      (static_cast<uint64_t>(m.raw) & kValidLanesMask) >> (amt & 63))};
+}
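A worked example of the mask-bit arithmetic above, assuming 8 lanes so that kValidLanesMask is 0xFF: sliding the mask bits 0b00001111 up by two lanes yields 0b00111100, and sliding them down by two lanes yields 0b00000011.

static_assert(((0x0Fu << 2) & 0xFFu) == 0x3Cu, "slide up by 2 lanes");
static_assert(((0x0Fu & 0xFFu) >> 2) == 0x03u, "slide down by 2 lanes");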
+
 // ------------------------------ VecFromMask
 
 template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
@@ -4126,15 +4221,25 @@
 
 // ------------------------------ RotateRight (ShiftRight, Or)
 
-template <int kBits, typename T, size_t N,
-          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
-HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
-  constexpr size_t kSizeInBits = sizeof(T) * 8;
-  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
+// The U8 RotateRight implementation for AVX3_DL is now in x86_512-inl.h
+// because U8 RotateRight uses detail::GaloisAffine on AVX3_DL.
+
+#if HWY_TARGET > HWY_AVX3_DL
+template <int kBits, size_t N>
+HWY_API Vec128<uint8_t, N> RotateRight(const Vec128<uint8_t, N> v) {
+  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
   if (kBits == 0) return v;
-  // AVX3 does not support 8/16-bit.
-  return Or(ShiftRight<kBits>(v),
-            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
+  // AVX3 does not support 8-bit.
+  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
+}
+#endif
+
+template <int kBits, size_t N>
+HWY_API Vec128<uint16_t, N> RotateRight(const Vec128<uint16_t, N> v) {
+  static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
+  if (kBits == 0) return v;
+  // AVX3 does not support 16-bit.
+  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
 }
 
 template <int kBits, size_t N>
@@ -4159,6 +4264,71 @@
 #endif
 }
 
+// I8/I16/I32/I64 RotateRight is generic for all vector lengths
+template <int kBits, class V, HWY_IF_SIGNED_V(V)>
+HWY_API V RotateRight(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, RotateRight<kBits>(BitCast(du, v)));
+}
+
+// ------------------------------ Rol/Ror
+#if HWY_TARGET <= HWY_AVX3
+
+#ifdef HWY_NATIVE_ROL_ROR_32_64
+#undef HWY_NATIVE_ROL_ROR_32_64
+#else
+#define HWY_NATIVE_ROL_ROR_32_64
+#endif
+
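+// Rol/Ror rotate each lane of a by the amount in the corresponding lane of b;
+// the rotate amounts are effectively taken modulo the lane width.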
+template <class T, size_t N, HWY_IF_UI32(T)>
+HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_rolv_epi32(a.raw, b.raw)};
+}
+
+template <class T, size_t N, HWY_IF_UI32(T)>
+HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_rorv_epi32(a.raw, b.raw)};
+}
+
+template <class T, size_t N, HWY_IF_UI64(T)>
+HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_rolv_epi64(a.raw, b.raw)};
+}
+
+template <class T, size_t N, HWY_IF_UI64(T)>
+HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
+  return Vec128<T, N>{_mm_rorv_epi64(a.raw, b.raw)};
+}
+
+#endif
+
+// ------------------------------ RotateLeftSame/RotateRightSame
+
+#if HWY_TARGET <= HWY_AVX3
+
+#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
+#undef HWY_NATIVE_ROL_ROR_SAME_32_64
+#else
+#define HWY_NATIVE_ROL_ROR_SAME_32_64
+#endif
+
+// Generic for all vector lengths
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
+HWY_API V RotateLeftSame(V v, int bits) {
+  const DFromV<decltype(v)> d;
+  return Rol(v, Set(d, static_cast<TFromV<V>>(static_cast<unsigned>(bits))));
+}
+
+template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
+          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
+HWY_API V RotateRightSame(V v, int bits) {
+  const DFromV<decltype(v)> d;
+  return Ror(v, Set(d, static_cast<TFromV<V>>(static_cast<unsigned>(bits))));
+}
+#endif  // HWY_TARGET <= HWY_AVX3
+
 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
 
 template <size_t N>
@@ -8122,6 +8292,94 @@
 #endif
 }
 
+// -------------------------- InterleaveEven
+
+template <class D, HWY_IF_LANES_LE_D(D, 2)>
+HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+  return ConcatEven(d, b, a);
+}
+
+// I8/U8 InterleaveEven is generic for all vector lengths that are >= 4 bytes
+template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+  const Repartition<uint16_t, decltype(d)> du16;
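+  // ShiftLeft<8> within u16 lanes moves b's even bytes into the odd byte
+  // positions; OddEven then takes those odd bytes from b and the even bytes
+  // from a.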
+  return OddEven(BitCast(d, ShiftLeft<8>(BitCast(du16, b))), a);
+}
+
+// I16/U16 InterleaveEven is generic for all vector lengths that are >= 8 bytes
+template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+  const Repartition<uint32_t, decltype(d)> du32;
+  return OddEven(BitCast(d, ShiftLeft<16>(BitCast(du32, b))), a);
+}
+
+#if HWY_TARGET <= HWY_AVX3
+template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm_mask_shuffle_epi32(
+      a.raw, static_cast<__mmask8>(0x0A), b.raw,
+      static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
+}
+template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm_mask_shuffle_ps(a.raw, static_cast<__mmask8>(0x0A),
+                                       b.raw, b.raw, _MM_SHUFFLE(2, 2, 0, 0))};
+}
+#else
+template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToFloat<decltype(d)> df;
+  const auto b2_b0_a2_a0 = ConcatEven(df, BitCast(df, b), BitCast(df, a));
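+  // b2_b0_a2_a0 is {a0, a2, b0, b2}; the shuffle reorders it to
+  // {a0, b0, a2, b2}.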
+  return BitCast(
+      d, VFromD<decltype(df)>{_mm_shuffle_ps(b2_b0_a2_a0.raw, b2_b0_a2_a0.raw,
+                                             _MM_SHUFFLE(3, 1, 2, 0))});
+}
+#endif
+
+// -------------------------- InterleaveOdd
+
+template <class D, HWY_IF_LANES_LE_D(D, 2)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  return ConcatOdd(d, b, a);
+}
+
+// I8/U8 InterleaveOdd is generic for all vector lengths that are >= 4 bytes
+template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  const Repartition<uint16_t, decltype(d)> du16;
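+  // ShiftRight<8> within u16 lanes moves a's odd bytes into the even byte
+  // positions; OddEven then combines them with b's odd bytes.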
+  return OddEven(b, BitCast(d, ShiftRight<8>(BitCast(du16, a))));
+}
+
+// I16/U16 InterleaveOdd is generic for all vector lengths that are >= 8 bytes
+template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 2)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  const Repartition<uint32_t, decltype(d)> du32;
+  return OddEven(b, BitCast(d, ShiftRight<16>(BitCast(du32, a))));
+}
+
+#if HWY_TARGET <= HWY_AVX3
+template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm_mask_shuffle_epi32(
+      b.raw, static_cast<__mmask8>(0x05), a.raw,
+      static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
+}
+template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm_mask_shuffle_ps(b.raw, static_cast<__mmask8>(0x05),
+                                       a.raw, a.raw, _MM_SHUFFLE(3, 3, 1, 1))};
+}
+#else
+template <class D, HWY_IF_LANES_D(D, 4), HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToFloat<decltype(d)> df;
+  const auto b3_b1_a3_a1 = ConcatOdd(df, BitCast(df, b), BitCast(df, a));
+  return BitCast(
+      d, VFromD<decltype(df)>{_mm_shuffle_ps(b3_b1_a3_a1.raw, b3_b1_a3_a1.raw,
+                                             _MM_SHUFFLE(3, 1, 2, 0))});
+}
+#endif
+
 // ------------------------------ OddEvenBlocks
 template <typename T, size_t N>
 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
@@ -9065,6 +9323,21 @@
 
 #endif  // HWY_NATIVE_F16C
 
+#if HWY_HAVE_FLOAT16
+
+#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
+#undef HWY_NATIVE_PROMOTE_F16_TO_F64
+#else
+#define HWY_NATIVE_PROMOTE_F16_TO_F64
+#endif
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F64_D(D)>
+HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, VFromD<Rebind<float16_t, D>> v) {
+  return VFromD<D>{_mm_cvtph_pd(v.raw)};
+}
+
+#endif  // HWY_HAVE_FLOAT16
+
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
   const Rebind<uint16_t, decltype(df32)> du16;
@@ -9308,6 +9581,21 @@
 
 #endif  // F16C
 
+#if HWY_HAVE_FLOAT16
+
+#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
+#undef HWY_NATIVE_DEMOTE_F64_TO_F16
+#else
+#define HWY_NATIVE_DEMOTE_F64_TO_F16
+#endif
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
+HWY_API VFromD<D> DemoteTo(D /*df16*/, VFromD<Rebind<double, D>> v) {
+  return VFromD<D>{_mm_cvtpd_ph(v.raw)};
+}
+
+#endif  // HWY_HAVE_FLOAT16
+
 template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
 HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
   // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16.
diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h
index f15144f..5521f4a 100644
--- a/hwy/ops/x86_256-inl.h
+++ b/hwy/ops/x86_256-inl.h
@@ -1074,6 +1074,28 @@
   return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
 }
 
+template <class D, HWY_IF_LANES_D(D, 32)>
+HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return MFromD<D>{
+      static_cast<RawM>(_kshiftli_mask32(static_cast<__mmask32>(m.raw), 1))};
+#else
+  return MFromD<D>{static_cast<RawM>(static_cast<uint32_t>(m.raw) << 1)};
+#endif
+}
+
+template <class D, HWY_IF_LANES_D(D, 32)>
+HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return MFromD<D>{
+      static_cast<RawM>(_kshiftri_mask32(static_cast<__mmask32>(m.raw), 1))};
+#else
+  return MFromD<D>{static_cast<RawM>(static_cast<uint32_t>(m.raw) >> 1)};
+#endif
+}
+
 #else  // AVX2
 
 // ------------------------------ Mask
@@ -2249,14 +2271,25 @@
 
 // ------------------------------ RotateRight
 
-template <int kBits, typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
-HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
-  constexpr size_t kSizeInBits = sizeof(T) * 8;
-  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
+// The U8 RotateRight implementation for AVX3_DL is now in x86_512-inl.h,
+// because U8 RotateRight uses detail::GaloisAffine on AVX3_DL.
+
+#if HWY_TARGET > HWY_AVX3_DL
+template <int kBits>
+HWY_API Vec256<uint8_t> RotateRight(const Vec256<uint8_t> v) {
+  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
   if (kBits == 0) return v;
-  // AVX3 does not support 8/16-bit.
-  return Or(ShiftRight<kBits>(v),
-            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
+  // AVX3 does not support 8-bit rotates.
+  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
+}
+#endif
+
+template <int kBits>
+HWY_API Vec256<uint16_t> RotateRight(const Vec256<uint16_t> v) {
+  static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
+  if (kBits == 0) return v;
+  // AVX3 does not support 16-bit rotates.
+  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
 }
 
 template <int kBits>
@@ -2281,6 +2314,31 @@
 #endif
 }
 
+// ------------------------------ Rol/Ror
+#if HWY_TARGET <= HWY_AVX3
+
+template <class T, HWY_IF_UI32(T)>
+HWY_API Vec256<T> Rol(Vec256<T> a, Vec256<T> b) {
+  return Vec256<T>{_mm256_rolv_epi32(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI32(T)>
+HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
+  return Vec256<T>{_mm256_rorv_epi32(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec256<T> Rol(Vec256<T> a, Vec256<T> b) {
+  return Vec256<T>{_mm256_rolv_epi64(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec256<T> Ror(Vec256<T> a, Vec256<T> b) {
+  return Vec256<T>{_mm256_rorv_epi64(a.raw, b.raw)};
+}
+
+#endif
+
 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
 
 HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
@@ -5218,6 +5276,72 @@
   return Vec256<double>{_mm256_blend_pd(a.raw, b.raw, 5)};
 }
 
+// -------------------------- InterleaveEven
+
+#if HWY_TARGET <= HWY_AVX3
+template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm256_mask_shuffle_epi32(
+      a.raw, static_cast<__mmask8>(0xAA), b.raw,
+      static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
+}
+template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm256_mask_shuffle_ps(a.raw, static_cast<__mmask8>(0xAA),
+                                          b.raw, b.raw,
+                                          _MM_SHUFFLE(2, 2, 0, 0))};
+}
+#else
+template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToFloat<decltype(d)> df;
+  const VFromD<decltype(df)> b2_b0_a2_a0{_mm256_shuffle_ps(
+      BitCast(df, a).raw, BitCast(df, b).raw, _MM_SHUFFLE(2, 0, 2, 0))};
+  return BitCast(
+      d, VFromD<decltype(df)>{_mm256_shuffle_ps(
+             b2_b0_a2_a0.raw, b2_b0_a2_a0.raw, _MM_SHUFFLE(3, 1, 2, 0))});
+}
+#endif
+
+// I64/U64/F64 InterleaveEven is generic for vector lengths >= 32 bytes
+template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return InterleaveLower(a, b);
+}
+
+// -------------------------- InterleaveOdd
+
+#if HWY_TARGET <= HWY_AVX3
+template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm256_mask_shuffle_epi32(
+      b.raw, static_cast<__mmask8>(0x55), a.raw,
+      static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
+}
+template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm256_mask_shuffle_ps(b.raw, static_cast<__mmask8>(0x55),
+                                          a.raw, a.raw,
+                                          _MM_SHUFFLE(3, 3, 1, 1))};
+}
+#else
+template <class D, HWY_IF_LANES_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  const RebindToFloat<decltype(d)> df;
+  const VFromD<decltype(df)> b3_b1_a3_a1{_mm256_shuffle_ps(
+      BitCast(df, a).raw, BitCast(df, b).raw, _MM_SHUFFLE(3, 1, 3, 1))};
+  return BitCast(
+      d, VFromD<decltype(df)>{_mm256_shuffle_ps(
+             b3_b1_a3_a1.raw, b3_b1_a3_a1.raw, _MM_SHUFFLE(3, 1, 2, 0))});
+}
+#endif
+
+// I64/U64/F64 InterleaveOdd is generic for vector lengths >= 32 bytes
+template <class D, HWY_IF_LANES_GT_D(D, 2), HWY_IF_T_SIZE_D(D, 8)>
+HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
+  return InterleaveUpper(d, a, b);
+}
+
 // ------------------------------ OddEvenBlocks
 
 template <typename T, HWY_IF_NOT_FLOAT3264(T)>
@@ -6334,6 +6458,13 @@
 
 #endif  // HWY_DISABLE_F16C
 
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
+HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec256<double> v) {
+  return VFromD<D>{_mm256_cvtpd_ph(v.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
 HWY_API VFromD<D> DemoteTo(D dbf16, Vec256<float> v) {
   // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16.
@@ -6749,6 +6880,15 @@
 
 #endif  // HWY_DISABLE_F16C
 
+#if HWY_HAVE_FLOAT16
+
+template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F64_D(D)>
+HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, Vec64<float16_t> v) {
+  return VFromD<D>{_mm256_cvtph_pd(v.raw)};
+}
+
+#endif  // HWY_HAVE_FLOAT16
+
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> PromoteTo(D df32, Vec128<bfloat16_t> v) {
   const Rebind<uint16_t, decltype(df32)> du16;
diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h
index e84a074..2c7c45c 100644
--- a/hwy/ops/x86_512-inl.h
+++ b/hwy/ops/x86_512-inl.h
@@ -1417,14 +1417,41 @@
 
 // ------------------------------ RotateRight
 
-template <int kBits, typename T, HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
-HWY_API Vec512<T> RotateRight(const Vec512<T> v) {
-  constexpr size_t kSizeInBits = sizeof(T) * 8;
-  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
+#if HWY_TARGET <= HWY_AVX3_DL
+// U8 RotateRight is generic for all vector lengths on AVX3_DL
+template <int kBits, class V, HWY_IF_U8(TFromV<V>)>
+HWY_API V RotateRight(V v) {
+  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
+
+  const Repartition<uint64_t, DFromV<V>> du64;
   if (kBits == 0) return v;
-  // AVX3 does not support 8/16-bit.
-  return Or(ShiftRight<kBits>(v),
-            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
+
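+  // Build the GF(2) affine matrices for ShiftRight<kBits> and
+  // ShiftLeft<8 - kBits>, then OR them together so the byte rotate is a
+  // single GaloisAffine (GFNI) operation.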
+  constexpr uint64_t kShrMatrix =
+      (0x0102040810204080ULL << kBits) &
+      (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF));
+  constexpr int kShlBits = (-kBits) & 7;
+  constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) &
+                                  (0x0101010101010101ULL * (0xFF >> kShlBits));
+  constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix;
+
+  return detail::GaloisAffine(v, Set(du64, kMatrix));
+}
+#else   // HWY_TARGET > HWY_AVX3_DL
+template <int kBits>
+HWY_API Vec512<uint8_t> RotateRight(const Vec512<uint8_t> v) {
+  static_assert(0 <= kBits && kBits < 8, "Invalid shift count");
+  if (kBits == 0) return v;
+  // AVX3 does not support 8-bit rotates.
+  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(7, 8 - kBits)>(v));
+}
+#endif  // HWY_TARGET <= HWY_AVX3_DL
+
+template <int kBits>
+HWY_API Vec512<uint16_t> RotateRight(const Vec512<uint16_t> v) {
+  static_assert(0 <= kBits && kBits < 16, "Invalid shift count");
+  if (kBits == 0) return v;
+  // AVX3 does not support 16-bit rotates.
+  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(15, 16 - kBits)>(v));
 }
 
 template <int kBits>
@@ -1441,6 +1468,32 @@
   return Vec512<uint64_t>{_mm512_ror_epi64(v.raw, kBits)};
 }
 
+// ------------------------------ Rol/Ror
+
+#if HWY_TARGET <= HWY_AVX3
+
+template <class T, HWY_IF_UI32(T)>
+HWY_API Vec512<T> Rol(Vec512<T> a, Vec512<T> b) {
+  return Vec512<T>{_mm512_rolv_epi32(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI32(T)>
+HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
+  return Vec512<T>{_mm512_rorv_epi32(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec512<T> Rol(Vec512<T> a, Vec512<T> b) {
+  return Vec512<T>{_mm512_rolv_epi64(a.raw, b.raw)};
+}
+
+template <class T, HWY_IF_UI64(T)>
+HWY_API Vec512<T> Ror(Vec512<T> a, Vec512<T> b) {
+  return Vec512<T>{_mm512_rorv_epi64(a.raw, b.raw)};
+}
+
+#endif
+
 // ------------------------------ ShiftLeftSame
 
 // GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512
@@ -2874,6 +2927,28 @@
   return MFromD<D>{static_cast<decltype(MFromD<D>().raw)>(shifted_mask)};
 }
 
+template <class D, HWY_IF_LANES_D(D, 64)>
+HWY_API MFromD<D> SlideMask1Up(D /*d*/, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return MFromD<D>{
+      static_cast<RawM>(_kshiftli_mask64(static_cast<__mmask64>(m.raw), 1))};
+#else
+  return MFromD<D>{static_cast<RawM>(static_cast<uint64_t>(m.raw) << 1)};
+#endif
+}
+
+template <class D, HWY_IF_LANES_D(D, 64)>
+HWY_API MFromD<D> SlideMask1Down(D /*d*/, MFromD<D> m) {
+  using RawM = decltype(MFromD<D>().raw);
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+  return MFromD<D>{
+      static_cast<RawM>(_kshiftri_mask64(static_cast<__mmask64>(m.raw), 1))};
+#else
+  return MFromD<D>{static_cast<RawM>(static_cast<uint64_t>(m.raw) >> 1)};
+#endif
+}
+
 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
 
 HWY_API Vec512<int8_t> BroadcastSignBit(Vec512<int8_t> v) {
@@ -4625,6 +4700,35 @@
   return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a);
 }
 
+// -------------------------- InterleaveEven
+
+template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm512_mask_shuffle_epi32(
+      a.raw, static_cast<__mmask16>(0xAAAA), b.raw,
+      static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(2, 2, 0, 0)))};
+}
+template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm512_mask_shuffle_ps(a.raw, static_cast<__mmask16>(0xAAAA),
+                                          b.raw, b.raw,
+                                          _MM_SHUFFLE(2, 2, 0, 0))};
+}
+// -------------------------- InterleaveOdd
+
+template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_UI32_D(D)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm512_mask_shuffle_epi32(
+      b.raw, static_cast<__mmask16>(0x5555), a.raw,
+      static_cast<_MM_PERM_ENUM>(_MM_SHUFFLE(3, 3, 1, 1)))};
+}
+template <class D, HWY_IF_LANES_D(D, 16), HWY_IF_F32_D(D)>
+HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
+  return VFromD<D>{_mm512_mask_shuffle_ps(b.raw, static_cast<__mmask16>(0x5555),
+                                          a.raw, a.raw,
+                                          _MM_SHUFFLE(3, 3, 1, 1))};
+}
+
 // ------------------------------ OddEvenBlocks
 
 template <typename T>
@@ -5349,6 +5453,15 @@
 #endif  // HWY_HAVE_FLOAT16
 }
 
+#if HWY_HAVE_FLOAT16
+
+template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F64_D(D)>
+HWY_INLINE VFromD<D> PromoteTo(D /*tag*/, Vec128<float16_t> v) {
+  return VFromD<D>{_mm512_cvtph_pd(v.raw)};
+}
+
+#endif  // HWY_HAVE_FLOAT16
+
 template <class D, HWY_IF_V_SIZE_D(D, 64), HWY_IF_F32_D(D)>
 HWY_API VFromD<D> PromoteTo(D df32, Vec256<bfloat16_t> v) {
   const Rebind<uint16_t, decltype(df32)> du16;
@@ -5530,6 +5643,13 @@
   HWY_DIAGNOSTICS(pop)
 }
 
+#if HWY_HAVE_FLOAT16
+template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
+HWY_API VFromD<D> DemoteTo(D /*df16*/, Vec512<double> v) {
+  return VFromD<D>{_mm512_cvtpd_ph(v.raw)};
+}
+#endif  // HWY_HAVE_FLOAT16
+
 template <class D, HWY_IF_V_SIZE_D(D, 32), HWY_IF_BF16_D(D)>
 HWY_API VFromD<D> DemoteTo(D dbf16, Vec512<float> v) {
   // TODO(janwas): _mm512_cvtneps_pbh once we have avx512bf16.
diff --git a/hwy/stats.cc b/hwy/stats.cc
new file mode 100644
index 0000000..a00e379
--- /dev/null
+++ b/hwy/stats.cc
@@ -0,0 +1,120 @@
+// Copyright 2024 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/stats.h"
+
+#include <stdio.h>
+
+#include <algorithm>  // std::min
+#include <string>
+
+#include "hwy/base.h"  // HWY_ASSERT
+
+namespace hwy {
+
+void Stats::Assimilate(const Stats& other) {
+  const int64_t total_n = n_ + other.n_;
+  if (total_n == 0) return;  // Nothing to do; prevents div by zero.
+
+  min_ = std::min(min_, other.min_);
+  max_ = std::max(max_, other.max_);
+
+  product_ *= other.product_;
+
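+  // Merge the first four central moments of the two accumulators using the
+  // standard pairwise combination formulas, so per-thread Stats can be
+  // combined afterwards.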
+  const double product_n = n_ * other.n_;
+  const double n2 = n_ * n_;
+  const double other_n2 = other.n_ * other.n_;
+  const int64_t total_n2 = total_n * total_n;
+  const double total_n3 = static_cast<double>(total_n2) * total_n;
+  // Precompute reciprocal for speed - used at least twice.
+  const double inv_total_n = 1.0 / total_n;
+  const double inv_total_n2 = 1.0 / total_n2;
+
+  const double delta = other.m1_ - m1_;
+  const double delta2 = delta * delta;
+  const double delta3 = delta * delta2;
+  const double delta4 = delta2 * delta2;
+
+  m1_ = (n_ * m1_ + other.n_ * other.m1_) * inv_total_n;
+
+  const double new_m2 = m2_ + other.m2_ + delta2 * product_n * inv_total_n;
+
+  const double new_m3 =
+      m3_ + other.m3_ + delta3 * product_n * (n_ - other.n_) * inv_total_n2 +
+      3.0 * delta * (n_ * other.m2_ - other.n_ * m2_) * inv_total_n;
+
+  m4_ += other.m4_ +
+         delta4 * product_n * (n2 - product_n + other_n2) / total_n3 +
+         6.0 * delta2 * (n2 * other.m2_ + other_n2 * m2_) * inv_total_n2 +
+         4.0 * delta * (n_ * other.m3_ - other.n_ * m3_) * inv_total_n;
+
+  m2_ = new_m2;
+  m3_ = new_m3;
+  n_ = total_n;
+}
+
+std::string Stats::ToString(int exclude) const {
+  if (Count() == 0) return std::string("(none)");
+
+  char buf[300];
+  int pos = 0;
+  int ret;  // snprintf - bytes written or negative for error.
+
+  if ((exclude & kNoCount) == 0) {
+    ret = snprintf(buf + pos, sizeof(buf) - pos, "Count=%9zu ",
+                   static_cast<size_t>(Count()));
+    HWY_ASSERT(ret > 0);
+    pos += ret;
+  }
+
+  if ((exclude & kNoMeanSD) == 0) {
+    const float sd = StandardDeviation();
+    if (sd > 100) {
+      ret = snprintf(buf + pos, sizeof(buf) - pos, "Mean=%8.2E SD=%7.1E ",
+                     Mean(), sd);
+    } else {
+      ret = snprintf(buf + pos, sizeof(buf) - pos, "Mean=%8.6f SD=%7.5f ",
+                     Mean(), sd);
+    }
+    HWY_ASSERT(ret > 0);
+    pos += ret;
+  }
+
+  if ((exclude & kNoMinMax) == 0) {
+    ret = snprintf(buf + pos, sizeof(buf) - pos, "Min=%8.5e Max=%8.5e ", Min(),
+                   Max());
+    HWY_ASSERT(ret > 0);
+    pos += ret;
+  }
+
+  if ((exclude & kNoSkewKurt) == 0) {
+    ret = snprintf(buf + pos, sizeof(buf) - pos, "Skew=%5.2f Kurt=%7.2f ",
+                   Skewness(), Kurtosis());
+    HWY_ASSERT(ret > 0);
+    pos += ret;
+  }
+
+  if ((exclude & kNoGeomean) == 0) {
+    ret = snprintf(buf + pos, sizeof(buf) - pos, "GeoMean=%9.6f ",
+                   GeometricMean());
+    HWY_ASSERT(ret > 0);
+    pos += ret;
+  }
+
+  HWY_ASSERT(pos < static_cast<int>(sizeof(buf)));
+  return buf;
+}
+
+}  // namespace hwy
diff --git a/hwy/stats.h b/hwy/stats.h
new file mode 100644
index 0000000..5b7d4c4
--- /dev/null
+++ b/hwy/stats.h
@@ -0,0 +1,194 @@
+// Copyright 2024 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_STATS_H_
+#define HIGHWAY_HWY_STATS_H_
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+
+#include "hwy/base.h"  // HWY_ASSERT
+
+namespace hwy {
+
+// Thread-compatible.
+template <size_t N>
+class Bins {
+ public:
+  Bins() { Reset(); }
+
+  template <typename T>
+  void Notify(T bin) {
+    HWY_ASSERT(T{0} <= bin && bin < static_cast<T>(N));
+    counts_[static_cast<int32_t>(bin)]++;
+  }
+
+  void Assimilate(const Bins<N>& other) {
+    for (size_t i = 0; i < N; ++i) {
+      counts_[i] += other.counts_[i];
+    }
+  }
+
+  void Print(const char* caption) {
+    fprintf(stderr, "\n%s [%zu]\n", caption, N);
+    size_t last_nonzero = 0;
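+    // Find the last nonzero bin; the loop counts down and relies on unsigned
+    // wraparound of i to terminate after index 0.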
+    for (size_t i = N - 1; i < N; --i) {
+      if (counts_[i] != 0) {
+        last_nonzero = i;
+        break;
+      }
+    }
+    for (size_t i = 0; i <= last_nonzero; ++i) {
+      fprintf(stderr, "  %zu\n", counts_[i]);
+    }
+  }
+
+  void Reset() {
+    for (size_t i = 0; i < N; ++i) {
+      counts_[i] = 0;
+    }
+  }
+
+ private:
+  size_t counts_[N];
+};
+
+// Descriptive statistics of a variable (4 moments). Thread-compatible.
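+//
+// Example usage (`samples` stands for any range of float, for illustration):
+//   Stats stats;
+//   for (float x : samples) stats.Notify(x);
+//   printf("%s\n", stats.ToString().c_str());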
+class Stats {
+ public:
+  Stats() { Reset(); }
+
+  void Notify(const float x) {
+    ++n_;
+
+    min_ = std::min(min_, x);
+    max_ = std::max(max_, x);
+
+    product_ *= x;
+
+    // Online moments. Reference: https://goo.gl/9ha694
+    const double d = x - m1_;
+    const double d_div_n = d / n_;
+    const double d2n1_div_n = d * (n_ - 1) * d_div_n;
+    const int64_t n_poly = n_ * n_ - 3 * n_ + 3;
+    m1_ += d_div_n;
+    m4_ += d_div_n * (d_div_n * (d2n1_div_n * n_poly + 6.0 * m2_) - 4.0 * m3_);
+    m3_ += d_div_n * (d2n1_div_n * (n_ - 2) - 3.0 * m2_);
+    m2_ += d2n1_div_n;
+  }
+
+  void Assimilate(const Stats& other);
+
+  int64_t Count() const { return n_; }
+
+  float Min() const { return min_; }
+  float Max() const { return max_; }
+
+  double GeometricMean() const {
+    return n_ == 0 ? 0.0 : pow(product_, 1.0 / n_);
+  }
+
+  double Mean() const { return m1_; }
+  // Same as Mu2. Assumes n_ is large.
+  double SampleVariance() const {
+    return n_ == 0 ? 0.0 : m2_ / static_cast<int>(n_);
+  }
+  // Unbiased estimator for population variance even for smaller n_.
+  double Variance() const {
+    if (n_ == 0) return 0.0;
+    if (n_ == 1) return m2_;
+    return m2_ / static_cast<int>(n_ - 1);
+  }
+  double StandardDeviation() const { return std::sqrt(Variance()); }
+  // Near zero for normal distributions; if positive on a unimodal distribution,
+  // the right tail is fatter. Assumes n_ is large.
+  double SampleSkewness() const {
+    if (std::abs(m2_) < 1E-7) return 0.0;
+    return m3_ * std::sqrt(static_cast<double>(n_)) / std::pow(m2_, 1.5);
+  }
+  // Corrected for bias (same as Wikipedia and Minitab but not Excel).
+  double Skewness() const {
+    if (n_ == 0) return 0.0;
+    const double biased = SampleSkewness();
+    const double r = (n_ - 1.0) / n_;
+    return biased * std::pow(r, 1.5);
+  }
+  // Near zero for normal distributions; smaller values indicate fewer/smaller
+  // outliers and larger values indicate more/larger outliers. Assumes n_ is
+  // large.
+  double SampleKurtosis() const {
+    if (std::abs(m2_) < 1E-7) return 0.0;
+    return m4_ * n_ / (m2_ * m2_);
+  }
+  // Corrected for bias (same as Wikipedia and Minitab but not Excel).
+  double Kurtosis() const {
+    if (n_ == 0) return 0.0;
+    const double biased = SampleKurtosis();
+    const double r = (n_ - 1.0) / n_;
+    return biased * r * r;
+  }
+
+  // Central moments, useful for "method of moments"-based parameter estimation
+  // of a mixture of two Gaussians. Assumes Count() != 0.
+  double Mu1() const { return m1_; }
+  double Mu2() const { return m2_ / static_cast<int>(n_); }
+  double Mu3() const { return m3_ / static_cast<int>(n_); }
+  double Mu4() const { return m4_ / static_cast<int>(n_); }
+
+  // Which statistics to EXCLUDE in ToString
+  enum {
+    kNoCount = 1,
+    kNoMeanSD = 2,
+    kNoMinMax = 4,
+    kNoSkewKurt = 8,
+    kNoGeomean = 16
+  };
+  std::string ToString(int exclude = 0) const;
+
+  void Reset() {
+    n_ = 0;
+
+    min_ = hwy::HighestValue<float>();
+    max_ = hwy::LowestValue<float>();
+
+    product_ = 1.0;
+
+    m1_ = 0.0;
+    m2_ = 0.0;
+    m3_ = 0.0;
+    m4_ = 0.0;
+  }
+
+ private:
+  int64_t n_;  // signed for faster conversion + safe subtraction
+
+  float min_;
+  float max_;
+
+  double product_;  // for geomean
+
+  // Moments
+  double m1_;
+  double m2_;
+  double m3_;
+  double m4_;
+};
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_STATS_H_
diff --git a/hwy/targets.cc b/hwy/targets.cc
index b5e4cbc..87bc3d0 100644
--- a/hwy/targets.cc
+++ b/hwy/targets.cc
@@ -16,9 +16,11 @@
 #include "hwy/targets.h"
 
 #include <stdarg.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>  // abort / exit
 
+#include "hwy/detect_targets.h"
 #include "hwy/highway.h"
 #include "hwy/per_target.h"  // VectorBytes
 
@@ -287,38 +289,46 @@
   }
 #endif
 
-  // Clear bits if the OS does not support XSAVE - otherwise, registers
-  // are not preserved across context switches.
+  // Clear AVX2/AVX3 bits if the CPU or OS does not support XSAVE - otherwise,
+  // YMM/ZMM registers are not preserved across context switches.
+
+  // The lower 128 bits of XMM0-XMM15 are guaranteed to be preserved across
+  // context switches on x86_64.
+
+  // The following OSes are known to preserve the lower 128 bits of the XMM
+  // registers across context switches on x86 CPUs that support SSE (even in
+  // 32-bit mode):
+  // - Windows 2000 or later
+  // - Linux 2.4.0 or later
+  // - Mac OS X 10.4 or later
+  // - FreeBSD 4.4 or later
+  // - NetBSD 1.6 or later
+  // - OpenBSD 3.5 or later
+  // - UnixWare 7 Release 7.1.1 or later
+  // - Solaris 9 4/04 or later
+
   uint32_t abcd[4];
   Cpuid(1, 0, abcd);
+  const bool has_xsave = IsBitSet(abcd[2], 26);
   const bool has_osxsave = IsBitSet(abcd[2], 27);
-  if (has_osxsave) {
-    const uint32_t xcr0 = ReadXCR0();
-    const int64_t min_avx3 = HWY_AVX3 | HWY_AVX3_DL | HWY_AVX3_SPR;
-    const int64_t min_avx2 = HWY_AVX2 | min_avx3;
-    // XMM
-    if (!IsBitSet(xcr0, 1)) {
-#if HWY_ARCH_X86_64
-      // The HWY_SSE2, HWY_SSSE3, and HWY_SSE4 bits do not need to be
-      // cleared on x86_64, even if bit 1 of XCR0 is not set, as
-      // the lower 128 bits of XMM0-XMM15 are guaranteed to be
-      // preserved across context switches on x86_64
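+  // HWY_AVX2 plus all better targets (lower bit values), i.e. the AVX3
+  // variants.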
+  constexpr int64_t min_avx2 = HWY_AVX2 | (HWY_AVX2 - 1);
 
-      // Only clear the AVX2/AVX3 bits on x86_64 if bit 1 of XCR0 is not set
-      bits &= ~min_avx2;
-#else
-      bits &= ~(HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | min_avx2);
-#endif
-    }
-    // YMM
-    if (!IsBitSet(xcr0, 2)) {
+  if (has_xsave && has_osxsave) {
+    const uint32_t xcr0 = ReadXCR0();
+    constexpr int64_t min_avx3 = HWY_AVX3 | HWY_AVX3_DL | HWY_AVX3_SPR;
+    // XMM/YMM
+    if (!IsBitSet(xcr0, 1) || !IsBitSet(xcr0, 2)) {
+      // Clear the AVX2/AVX3 bits if XMM/YMM XSAVE is not enabled
       bits &= ~min_avx2;
     }
     // opmask, ZMM lo/hi
     if (!IsBitSet(xcr0, 5) || !IsBitSet(xcr0, 6) || !IsBitSet(xcr0, 7)) {
       bits &= ~min_avx3;
     }
-  }  // has_osxsave
+  } else {  // !has_xsave || !has_osxsave
+    // Clear the AVX2/AVX3 bits if the CPU or OS does not support XSAVE
+    bits &= ~min_avx2;
+  }
 
   // This is mainly to work around the slow Zen4 CompressStore. It's unclear
   // whether subsequent AMD models will be affected; assume yes.
diff --git a/hwy/tests/blockwise_test.cc b/hwy/tests/blockwise_test.cc
index f11ae9c..cdad7e6 100644
--- a/hwy/tests/blockwise_test.cc
+++ b/hwy/tests/blockwise_test.cc
@@ -224,10 +224,59 @@
   }
 };
 
+struct TestInterleaveEven {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto even_lanes = AllocateAligned<T>(N);
+    auto odd_lanes = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT(even_lanes && odd_lanes && expected);
+    for (size_t i = 0; i < N; ++i) {
+      even_lanes[i] = ConvertScalarTo<T>(2 * i + 0);
+      odd_lanes[i] = ConvertScalarTo<T>(2 * i + 1);
+    }
+    const auto even = Load(d, even_lanes.get());
+    const auto odd = Load(d, odd_lanes.get());
+
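+    // Lane i of InterleaveEven(even, odd) is even_lanes[i] for even i and
+    // odd_lanes[i - 1] for odd i, i.e. 2 * i - (i & 1).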
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = ConvertScalarTo<T>(2 * i - (i & 1));
+    }
+
+    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveEven(even, odd));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveEven(d, even, odd));
+  }
+};
+
+struct TestInterleaveOdd {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto even_lanes = AllocateAligned<T>(N);
+    auto odd_lanes = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT(even_lanes && odd_lanes && expected);
+    for (size_t i = 0; i < N; ++i) {
+      even_lanes[i] = ConvertScalarTo<T>(2 * i + 0);
+      odd_lanes[i] = ConvertScalarTo<T>(2 * i + 1);
+    }
+    const auto even = Load(d, even_lanes.get());
+    const auto odd = Load(d, odd_lanes.get());
+
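+    // Lane i of InterleaveOdd(even, odd) is even_lanes[i + 1] for even i and
+    // odd_lanes[i] for odd i, i.e. 2 * i - (i & 1) + 2.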
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = ConvertScalarTo<T>((2 * i) - (i & 1) + 2);
+    }
+
+    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveOdd(d, even, odd));
+  }
+};
+
 HWY_NOINLINE void TestAllInterleave() {
   // Not DemoteVectors because this cannot be supported by HWY_SCALAR.
   ForAllTypes(ForShrinkableVectors<TestInterleaveLower>());
   ForAllTypes(ForShrinkableVectors<TestInterleaveUpper>());
+  ForAllTypes(ForShrinkableVectors<TestInterleaveEven>());
+  ForAllTypes(ForShrinkableVectors<TestInterleaveOdd>());
 }
 
 struct TestZipLower {
diff --git a/hwy/tests/convert_test.cc b/hwy/tests/convert_test.cc
index ebb4beb..bb303a8 100644
--- a/hwy/tests/convert_test.cc
+++ b/hwy/tests/convert_test.cc
@@ -319,8 +319,8 @@
   to_i32div2(int16_t());
 
   const ForShrinkableVectors<TestPromoteOddEvenTo<float>, 1> to_f32div2;
-  to_f32div2(float16_t());
-  to_f32div2(bfloat16_t());
+  to_f32div2(hwy::float16_t());
+  to_f32div2(hwy::bfloat16_t());
 
 #if HWY_HAVE_INTEGER64
   const ForShrinkableVectors<TestPromoteOddEvenTo<uint64_t>, 1> to_u64div2;
@@ -446,6 +446,49 @@
 
 HWY_NOINLINE void TestAllF16() { ForDemoteVectors<TestF16>()(float()); }
 
+// This minimal interface is always supported, even if !HWY_HAVE_FLOAT16.
+struct TestF16FromF64 {
+  template <typename TF64, class DF64>
+  HWY_NOINLINE void operator()(TF64 /*t*/, DF64 df64) {
+#if HWY_HAVE_FLOAT64
+    size_t padded;
+    const size_t N = Lanes(df64);  // same count for f16 and f32
+    HWY_ASSERT(N != 0);
+
+    const Rebind<hwy::float16_t, DF64> df16;
+    const Rebind<float, DF64> df32;
+    const RebindToUnsigned<decltype(df64)> du64;
+    using VF16 = Vec<decltype(df16)>;
+    using VF32 = Vec<decltype(df32)>;
+    using VF64 = Vec<decltype(df64)>;
+    using VU64 = Vec<decltype(du64)>;
+
+    auto f32_in = F16TestCases(df32, padded);
+    const VU64 u64_zero =
+        Set(du64, static_cast<uint64_t>(Unpredictable1() - 1));
+    const VF64 f64_zero = BitCast(df64, u64_zero);
+    const VF16 f16_zero = ResizeBitCast(df16, u64_zero);
+
+    for (size_t i = 0; i < padded; i += N) {
+      const VF32 vf32 = Load(df32, f32_in.get() + i);
+      const VF16 vf16 = Or(DemoteTo(df16, vf32), f16_zero);
+      const VF64 vf64 = Or(PromoteTo(df64, vf32), f64_zero);
+
+      HWY_ASSERT_VEC_EQ(df16, vf16, DemoteTo(df16, vf64));
+      HWY_ASSERT_VEC_EQ(df64, vf64, PromoteTo(df64, vf16));
+    }
+#else
+    (void)df64;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllF16FromF64() {
+#if HWY_HAVE_FLOAT64
+  ForDemoteVectors<TestF16FromF64, 2>()(double());
+#endif
+}
+
 template <class D>
 AlignedFreeUniquePtr<float[]> BF16TestCases(D d, size_t& padded) {
   const float test_cases[] = {
@@ -843,6 +886,14 @@
   ForFloatTypes(ForPartialVectors<TestFloatFromUint>());
 }
 
+#undef HWY_F2I_INLINE
+#if HWY_TARGET == HWY_RVV
+// Workaround for incorrect rounding mode.
+#define HWY_F2I_INLINE HWY_NOINLINE
+#else
+#define HWY_F2I_INLINE HWY_INLINE
+#endif
+
 template <class TTo>
 class TestNonFiniteF2IConvertTo {
  private:
@@ -850,21 +901,21 @@
                 "TTo must be an integer type");
 
   template <class DF, HWY_IF_T_SIZE_LE_D(DF, sizeof(TTo) - 1)>
-  static HWY_INLINE VFromD<Rebind<TTo, DF>> DoF2IConvVec(DF /*df*/,
-                                                         VFromD<DF> v) {
-    return PromoteTo(Rebind<TTo, DF>(), v);
+  static HWY_F2I_INLINE VFromD<Rebind<TTo, DF>> DoF2IConvVec(DF df,
+                                                             VFromD<DF> v) {
+    return PromoteTo(Rebind<TTo, decltype(df)>(), v);
   }
 
   template <class DF, HWY_IF_T_SIZE_D(DF, sizeof(TTo))>
-  static HWY_INLINE VFromD<Rebind<TTo, DF>> DoF2IConvVec(DF /*df*/,
-                                                         VFromD<DF> v) {
-    return ConvertTo(Rebind<TTo, DF>(), v);
+  static HWY_F2I_INLINE VFromD<Rebind<TTo, DF>> DoF2IConvVec(DF df,
+                                                             VFromD<DF> v) {
+    return ConvertTo(Rebind<TTo, decltype(df)>(), v);
   }
 
   template <class DF, HWY_IF_T_SIZE_GT_D(DF, sizeof(TTo))>
-  static HWY_INLINE VFromD<Rebind<TTo, DF>> DoF2IConvVec(DF /*df*/,
-                                                         VFromD<DF> v) {
-    return DemoteTo(Rebind<TTo, DF>(), v);
+  static HWY_F2I_INLINE VFromD<Rebind<TTo, DF>> DoF2IConvVec(DF df,
+                                                             VFromD<DF> v) {
+    return DemoteTo(Rebind<TTo, decltype(df)>(), v);
   }
 
   template <class DF, HWY_IF_T_SIZE_LE_D(DF, sizeof(TTo) - 1)>
@@ -873,8 +924,8 @@
   }
 
   template <class DF, HWY_IF_T_SIZE_D(DF, sizeof(TTo))>
-  static HWY_INLINE Mask<Rebind<TTo, DF>> DoF2IConvMask(DF /*df*/, Mask<DF> m) {
-    return RebindMask(Rebind<TTo, DF>(), m);
+  static HWY_INLINE Mask<Rebind<TTo, DF>> DoF2IConvMask(DF df, Mask<DF> m) {
+    return RebindMask(Rebind<TTo, decltype(df)>(), m);
   }
 
   template <class DF, HWY_IF_T_SIZE_GT_D(DF, sizeof(TTo))>
@@ -1400,6 +1451,7 @@
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteUpperLowerTo);
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteOddEvenTo);
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16FromF64);
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16);
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
diff --git a/hwy/tests/dup128_vec_test.cc b/hwy/tests/dup128_vec_test.cc
index e9dd23f..221a173 100644
--- a/hwy/tests/dup128_vec_test.cc
+++ b/hwy/tests/dup128_vec_test.cc
@@ -70,12 +70,12 @@
   }
 
   template <class D, class T, HWY_IF_BF16_D(D)>
-  static HWY_INLINE bfloat16_t CastValueToLaneType(D /*d*/, T val) {
+  static HWY_INLINE hwy::bfloat16_t CastValueToLaneType(D /*d*/, T val) {
     return BF16FromF32(static_cast<float>(val));
   }
 
   template <class D, class T, HWY_IF_F16_D(D)>
-  static HWY_INLINE float16_t CastValueToLaneType(D /*d*/, T val) {
+  static HWY_INLINE hwy::float16_t CastValueToLaneType(D /*d*/, T val) {
     return F16FromF32(static_cast<float>(val));
   }
 
@@ -98,12 +98,12 @@
 #else
     const FixedTag<float, 8> df32;
 #endif
-    const Rebind<bfloat16_t, decltype(df32)> dbf16;
+    const Rebind<hwy::bfloat16_t, decltype(df32)> dbf16;
 
     const auto vbf16_iota = DemoteTo(dbf16, Iota(df32, start));
 #else
     const FixedTag<float, 4> df32;
-    const Repartition<bfloat16_t, decltype(df32)> dbf16;
+    const Repartition<hwy::bfloat16_t, decltype(df32)> dbf16;
 
     const auto vbf16_iota = OrderedDemote2To(
         dbf16, Iota(df32, start), Iota(df32, static_cast<float>(start) + 4.0f));
@@ -129,12 +129,12 @@
 #else
     const FixedTag<float, 8> df32;
 #endif
-    const Rebind<float16_t, decltype(df32)> df16;
+    const Rebind<hwy::float16_t, decltype(df32)> df16;
 
     const auto vf16_iota = DemoteTo(df16, Iota(df32, start));
 #else
     const FixedTag<float, 4> df32;
-    const Repartition<float16_t, decltype(df32)> df16;
+    const Repartition<hwy::float16_t, decltype(df32)> df16;
     const Half<decltype(df16)> dh_f16;
 
     const auto vf16_iota = Combine(
@@ -213,8 +213,8 @@
 HWY_NOINLINE void TestAllDup128VecFromValues() {
   const ForPartialVectors<TestDup128VecFromValues> func;
   ForIntegerTypes(func);
-  func(float16_t());
-  func(bfloat16_t());
+  func(hwy::float16_t());
+  func(hwy::bfloat16_t());
   ForFloat3264Types(func);
 }
 
diff --git a/hwy/tests/if_test.cc b/hwy/tests/if_test.cc
index e9e4554..6c3cf85 100644
--- a/hwy/tests/if_test.cc
+++ b/hwy/tests/if_test.cc
@@ -186,8 +186,9 @@
   static HWY_INLINE void TestMoreThan1LaneIfNegativeThenNegOrUndefIfZero(
       D /*d*/, Vec<D> /*v1*/, Vec<D> /*v2*/) {}
 #if HWY_TARGET != HWY_SCALAR
+  // NOINLINE works around a clang compiler bug for PPC9 partial vectors.
   template <class D, HWY_IF_LANES_GT_D(D, 1)>
-  static HWY_INLINE void TestMoreThan1LaneIfNegativeThenNegOrUndefIfZero(
+  static HWY_NOINLINE void TestMoreThan1LaneIfNegativeThenNegOrUndefIfZero(
       D d, Vec<D> v1, Vec<D> v2) {
 #if HWY_HAVE_SCALABLE
     if (Lanes(d) < 2) {
@@ -195,12 +196,12 @@
     }
 #endif
 
-    const auto v3 = InterleaveLower(d, v1, v1);
-    const auto v4 = InterleaveUpper(d, v1, v1);
-    const auto v5 = InterleaveLower(d, v1, v2);
-    const auto v6 = InterleaveUpper(d, v1, v2);
-    const auto v7 = InterleaveLower(d, v2, v1);
-    const auto v8 = InterleaveUpper(d, v2, v1);
+    const Vec<D> v3 = InterleaveLower(d, v1, v1);
+    const Vec<D> v4 = InterleaveUpper(d, v1, v1);
+    const Vec<D> v5 = InterleaveLower(d, v1, v2);
+    const Vec<D> v6 = InterleaveUpper(d, v1, v2);
+    const Vec<D> v7 = InterleaveLower(d, v2, v1);
+    const Vec<D> v8 = InterleaveUpper(d, v2, v1);
 
     HWY_ASSERT_VEC_EQ(d, v3, IfNegativeThenNegOrUndefIfZero(v3, v3));
     HWY_ASSERT_VEC_EQ(d, v4, IfNegativeThenNegOrUndefIfZero(v4, v4));
@@ -214,7 +215,7 @@
     HWY_ASSERT_VEC_EQ(d, v7, IfNegativeThenNegOrUndefIfZero(v3, v7));
     HWY_ASSERT_VEC_EQ(d, v8, IfNegativeThenNegOrUndefIfZero(v4, v8));
 
-    const auto zero = Zero(d);
+    const Vec<D> zero = Zero(d);
     HWY_ASSERT_VEC_EQ(d, zero, IfNegativeThenNegOrUndefIfZero(v3, zero));
     HWY_ASSERT_VEC_EQ(d, zero, IfNegativeThenNegOrUndefIfZero(v4, zero));
     HWY_ASSERT_VEC_EQ(d, zero, IfNegativeThenNegOrUndefIfZero(v5, zero));
diff --git a/hwy/tests/mask_slide_test.cc b/hwy/tests/mask_slide_test.cc
new file mode 100644
index 0000000..9cda31a
--- /dev/null
+++ b/hwy/tests/mask_slide_test.cc
@@ -0,0 +1,155 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/mask_slide_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestSlideMaskDownLanes {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    using TI = MakeSigned<T>;
+
+    const RebindToSigned<decltype(d)> di;
+
+    const size_t N = Lanes(d);
+    if (N < 2) {
+      return;
+    }
+
+    auto bool_lanes = AllocateAligned<TI>(N);
+    auto expected = AllocateAligned<TI>(N);
+    HWY_ASSERT(bool_lanes && expected);
+
+    // For all combinations of zero/nonzero state of subset of lanes:
+    const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
+
+    ZeroBytes(bool_lanes.get(), max_lanes * sizeof(TI));
+    for (size_t i = max_lanes; i < N; i++) {
+      bool_lanes[i] = TI(-1);
+    }
+
+    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
+      for (size_t i = 0; i < max_lanes; ++i) {
+        bool_lanes[i] = (code & (1ull << i)) ? TI(-1) : TI(0);
+      }
+
+      for (size_t i = 0; i < max_lanes; i++) {
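+        // Sliding the mask down by i moves lane j + i to lane j; the top i
+        // lanes become false.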
+        ZeroBytes(expected.get() + N - i, i * sizeof(TI));
+        for (size_t j = 0; j < N - i; j++) {
+          expected[j] = bool_lanes[j + i];
+        }
+
+        const auto src_mask =
+            MaskFromVec(BitCast(d, Load(di, bool_lanes.get())));
+        const auto expected_mask =
+            MaskFromVec(BitCast(d, Load(di, expected.get())));
+        const auto actual_mask = SlideMaskDownLanes(d, src_mask, i);
+        HWY_ASSERT_MASK_EQ(d, expected_mask, actual_mask);
+
+        if (i == 1) {
+          HWY_ASSERT_MASK_EQ(d, expected_mask, SlideMask1Down(d, src_mask));
+        }
+      }
+    }
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllSlideMaskDownLanes() {
+  ForAllTypes(ForPartialVectors<TestSlideMaskDownLanes>());
+}
+
+struct TestSlideMaskUpLanes {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    using TI = MakeSigned<T>;
+
+    const RebindToSigned<decltype(d)> di;
+
+    const size_t N = Lanes(d);
+    if (N < 2) {
+      return;
+    }
+
+    auto bool_lanes = AllocateAligned<TI>(N);
+    auto expected = AllocateAligned<TI>(N);
+    HWY_ASSERT(bool_lanes && expected);
+
+    // For all combinations of zero/nonzero state of subset of lanes:
+    const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
+
+    ZeroBytes(bool_lanes.get(), max_lanes * sizeof(TI));
+    for (size_t i = max_lanes; i < N; i++) {
+      bool_lanes[i] = TI(-1);
+    }
+
+    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
+      for (size_t i = 0; i < max_lanes; ++i) {
+        bool_lanes[i] = (code & (1ull << i)) ? TI(-1) : TI(0);
+      }
+
+      for (size_t i = 0; i < max_lanes; i++) {
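+        // Sliding the mask up by i moves lane j to lane j + i; the bottom i
+        // lanes become false.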
+        ZeroBytes(expected.get(), i * sizeof(TI));
+        for (size_t j = 0; j < N - i; j++) {
+          expected[j + i] = bool_lanes[j];
+        }
+
+        const auto src_mask =
+            MaskFromVec(BitCast(d, Load(di, bool_lanes.get())));
+        const auto expected_mask =
+            MaskFromVec(BitCast(d, Load(di, expected.get())));
+        const auto actual_mask = SlideMaskUpLanes(d, src_mask, i);
+        HWY_ASSERT_MASK_EQ(d, expected_mask, actual_mask);
+
+        if (i == 1) {
+          HWY_ASSERT_MASK_EQ(d, expected_mask, SlideMask1Up(d, src_mask));
+        }
+      }
+    }
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllSlideMaskUpLanes() {
+  ForAllTypes(ForPartialVectors<TestSlideMaskUpLanes>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyMaskSlideTest);
+HWY_EXPORT_AND_TEST_P(HwyMaskSlideTest, TestAllSlideMaskDownLanes);
+HWY_EXPORT_AND_TEST_P(HwyMaskSlideTest, TestAllSlideMaskUpLanes);
+}  // namespace hwy
+
+#endif
diff --git a/hwy/tests/mask_test.cc b/hwy/tests/mask_test.cc
index 96fab68..5ca52b9 100644
--- a/hwy/tests/mask_test.cc
+++ b/hwy/tests/mask_test.cc
@@ -33,67 +33,42 @@
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
 #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
     HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SCALAR
+    // For RVV, SVE and SCALAR, use the underlying native vector.
     const DFromV<Vec<D>> d2;
 #else
+    // Other targets are strongly-typed, but we can safely ResizeBitCast to the
+    // native vector. All targets have at least 128-bit vectors, but NEON also
+    // supports 64-bit vectors.
     constexpr size_t kMinD2Lanes =
         ((HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) ? 8
                                                                         : 16) /
         sizeof(T);
     const FixedTag<T, HWY_MAX(HWY_MAX_LANES_D(D), kMinD2Lanes)> d2;
 #endif
-
     static_assert(d2.MaxBytes() >= d.MaxBytes(),
                   "d2.MaxBytes() >= d.MaxBytes() should be true");
+    using V2 = Vec<decltype(d2)>;
 
-    const size_t N = Lanes(d);
-    const size_t N2 = Lanes(d2);
-    HWY_ASSERT(N2 >= N);
-
-    auto expected1 = AllocateAligned<T>(N);
-    auto expected2 = AllocateAligned<T>(N2);
-    auto actual1 = AllocateAligned<T>(N);
-    auto actual2 = AllocateAligned<T>(N2);
-    HWY_ASSERT(expected1 && expected2 && actual1 && actual2);
-
-    ZeroBytes(expected1.get(), sizeof(T) * N);
-    ZeroBytes(expected2.get(), sizeof(T) * N2);
-
-    memset(actual1.get(), 0xFF, sizeof(T) * N);
-    memset(actual2.get(), 0xFF, sizeof(T) * N2);
-
-    // If possible, use a compiler memory barrier to prevent the compiler from
-    // reordering the above ZeroBytes and memset operations with the Store
-    // operations below
-
-#if HWY_COMPILER_MSVC && !defined(HWY_NO_LIBCXX)
-    std::atomic_signal_fence(std::memory_order_seq_cst);
-#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANG
-    asm volatile("" ::: "memory");
-#endif
-
-    Store(VecFromMask(d, MaskFalse(d)), d, actual1.get());
-    HWY_ASSERT_ARRAY_EQ(expected1.get(), actual1.get(), N);
-
-    // All of the bits of MaskFalse(d) should be zero, including any bits past
-    // the first N lanes of MaskFalse(d)
-    Store(ResizeBitCast(d2, VecFromMask(d, MaskFalse(d))), d2, actual2.get());
-    HWY_ASSERT_ARRAY_EQ(expected2.get(), actual2.get(), N2);
-
-#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
-    HWY_ASSERT_VEC_EQ(d2, expected2.get(), VecFromMask(d2, MaskFalse(d)));
-#endif
-
+    // Various ways of checking that false masks are false.
     HWY_ASSERT(AllFalse(d, MaskFalse(d)));
+    HWY_ASSERT_EQ(0, CountTrue(d, MaskFalse(d)));
+    HWY_ASSERT_VEC_EQ(d, Zero(d), VecFromMask(d, MaskFalse(d)));
+
 #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
-    // Check that AllFalse(d2, MaskFalse(d)) is true on RVV/SVE targets
+    // For these targets, we can treat the result as if it were a vector of type
+    // `V2`. On SVE, vectors are always full (not fractional) and caps are only
+    // enforced by Highway ops. On RVV, LMUL must match but caps can also be
+    // ignored. For safety, MaskFalse also sets lanes >= `Lanes(d)` to false,
+    // and we verify that here.
     HWY_ASSERT(AllFalse(d2, MaskFalse(d)));
+    HWY_ASSERT_EQ(0, CountTrue(d2, MaskFalse(d)));
+    HWY_ASSERT_VEC_EQ(d2, Zero(d2), VecFromMask(d2, MaskFalse(d)));
 #endif
 
-    HWY_ASSERT_EQ(0, CountTrue(d, MaskFalse(d)));
-#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
-    // Check that CountTrue(d2, MaskFalse(d)) returns true on RVV/SVE targets
-    HWY_ASSERT_EQ(0, CountTrue(d2, MaskFalse(d)));
-#endif
+    // All targets support, and strongly-typed (non-scalable) targets require,
+    // ResizeBitCast before we compare to the 'native' underlying vector size.
+    const V2 actual2 = ResizeBitCast(d2, VecFromMask(d, MaskFalse(d)));
+    HWY_ASSERT_VEC_EQ(d2, Zero(d2), actual2);
   }
 };
 
diff --git a/hwy/tests/shift_test.cc b/hwy/tests/shift_test.cc
index 8011e84..a4f4afc 100644
--- a/hwy/tests/shift_test.cc
+++ b/hwy/tests/shift_test.cc
@@ -165,50 +165,144 @@
   }
 };
 
-struct TestRotateRight {
+struct TestRotateLeft {
   template <typename T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using TU = MakeUnsigned<T>;
+
     const size_t N = Lanes(d);
     auto expected = AllocateAligned<T>(N);
     HWY_ASSERT(expected);
 
     constexpr size_t kBits = sizeof(T) * 8;
-    const Vec<D> mask_shift = Set(d, static_cast<T>(kBits));
+    const Vec<D> mask_shift = Set(d, static_cast<T>(kBits - 1));
     // Cover as many bit positions as possible to test shifting out
     const Vec<D> values =
         Shl(Set(d, static_cast<T>(1)), And(Iota(d, 0), mask_shift));
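+    // values2 differs from values only in the sign bit, so each expected
+    // output below differs only in the bit to which the sign bit rotates.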
+    const Vec<D> values2 = Xor(values, SignBit(d));
 
     // Rotate by 0
-    HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values));
+    HWY_ASSERT_VEC_EQ(d, values, RotateLeft<0>(values));
+    HWY_ASSERT_VEC_EQ(d, values2, RotateLeft<0>(values2));
 
     // Rotate by 1
     Store(values, d, expected.get());
     for (size_t i = 0; i < N; ++i) {
       expected[i] =
-          ConvertScalarTo<T>((expected[i] >> 1) | (expected[i] << (kBits - 1)));
+          ConvertScalarTo<T>((static_cast<TU>(expected[i]) << 1) |
+                             (static_cast<TU>(expected[i]) >> (kBits - 1)));
     }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateLeft<1>(values));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = ConvertScalarTo<T>(expected[i] ^ static_cast<T>(1));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateLeft<1>(values2));
 
     // Rotate by half
     Store(values, d, expected.get());
     for (size_t i = 0; i < N; ++i) {
-      expected[i] = ConvertScalarTo<T>((expected[i] >> (kBits / 2)) |
-                                       (expected[i] << (kBits / 2)));
+      expected[i] =
+          ConvertScalarTo<T>((static_cast<TU>(expected[i]) << (kBits / 2)) |
+                             (static_cast<TU>(expected[i]) >> (kBits / 2)));
     }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateLeft<kBits / 2>(values));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = ConvertScalarTo<T>(
+          expected[i] ^ (static_cast<T>(1) << ((kBits / 2) - 1)));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateLeft<kBits / 2>(values2));
 
     // Rotate by max
     Store(values, d, expected.get());
     for (size_t i = 0; i < N; ++i) {
       expected[i] =
-          ConvertScalarTo<T>((expected[i] >> (kBits - 1)) | (expected[i] << 1));
+          ConvertScalarTo<T>((static_cast<TU>(expected[i]) << (kBits - 1)) |
+                             (static_cast<TU>(expected[i]) >> 1));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateLeft<kBits - 1>(values));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] =
+          ConvertScalarTo<T>(expected[i] ^ (static_cast<T>(1) << (kBits - 2)));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateLeft<kBits - 1>(values2));
+  }
+};
+
+HWY_NOINLINE void TestAllRotateLeft() {
+  ForIntegerTypes(ForPartialVectors<TestRotateLeft>());
+}
+
+struct TestRotateRight {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using TU = MakeUnsigned<T>;
+
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    HWY_ASSERT(expected);
+
+    constexpr size_t kBits = sizeof(T) * 8;
+    const Vec<D> mask_shift = Set(d, static_cast<T>(kBits - 1));
+    // Cover as many bit positions as possible to test shifting out
+    const Vec<D> values =
+        Shl(Set(d, static_cast<T>(1)), And(Iota(d, 0), mask_shift));
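+    // As in TestRotateLeft, values2 differs from values only in the sign bit.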
+    const Vec<D> values2 = Xor(values, SignBit(d));
+
+    // Rotate by 0
+    HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values));
+    HWY_ASSERT_VEC_EQ(d, values2, RotateRight<0>(values2));
+
+    // Rotate by 1
+    Store(values, d, expected.get());
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] =
+          ConvertScalarTo<T>((static_cast<TU>(expected[i]) >> 1) |
+                             (static_cast<TU>(expected[i]) << (kBits - 1)));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values));
+
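+    // The flipped sign bit lands in bit kBits - 2 after rotating right by 1.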
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] =
+          ConvertScalarTo<T>(expected[i] ^ (static_cast<T>(1) << (kBits - 2)));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values2));
+
+    // Rotate by half
+    Store(values, d, expected.get());
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] =
+          ConvertScalarTo<T>((static_cast<TU>(expected[i]) >> (kBits / 2)) |
+                             (static_cast<TU>(expected[i]) << (kBits / 2)));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = ConvertScalarTo<T>(
+          expected[i] ^ (static_cast<T>(1) << ((kBits / 2) - 1)));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values2));
+
+    // Rotate by max
+    Store(values, d, expected.get());
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] =
+          ConvertScalarTo<T>((static_cast<TU>(expected[i]) >> (kBits - 1)) |
+                             (static_cast<TU>(expected[i]) << 1));
     }
     HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = ConvertScalarTo<T>(expected[i] ^ static_cast<T>(1));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values2));
   }
 };
 
 HWY_NOINLINE void TestAllRotateRight() {
-  ForUnsignedTypes(ForPartialVectors<TestRotateRight>());
+  ForIntegerTypes(ForPartialVectors<TestRotateRight>());
 }
 
 struct TestVariableUnsignedRightShifts {
@@ -411,6 +505,118 @@
   ForSignedTypes(ForPartialVectors<TestVariableSignedRightShifts>());
 }
 
+struct TestVariableRotations {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using TU = MakeUnsigned<T>;
+
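+    // Arbitrary bit patterns; each lane is additionally XORed with its index.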
+    constexpr TU kBits1 = static_cast<TU>(0x7C29085C41482973ULL);
+    constexpr TU kBits2 = static_cast<TU>(0xD3C8835FBD1A89BAULL);
+
+    const auto viota0 = Iota(d, 0);
+    const auto va = Xor(Set(d, static_cast<T>(kBits1)), viota0);
+    const auto vb = Xor(Set(d, static_cast<T>(kBits2)), viota0);
+
+    const size_t N = Lanes(d);
+    auto expected1 = AllocateAligned<T>(N);
+    auto expected2 = AllocateAligned<T>(N);
+    auto expected3 = AllocateAligned<T>(N);
+    auto expected4 = AllocateAligned<T>(N);
+    HWY_ASSERT(expected1 && expected2 && expected3 && expected4);
+
+    constexpr size_t kBits = sizeof(T) * 8;
+
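+    // vrotate_amt1 starts at the lane index and advances by N per iteration;
+    // vrotate_amt2 is kBits minus that amount, i.e. the same rotation in the
+    // opposite direction.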
+    auto vrotate_amt1 = viota0;
+    auto vrotate_amt2 = Sub(Set(d, static_cast<T>(kBits)), viota0);
+    auto vrotate_amt_incr = Set(d, static_cast<T>(N));
+
+    const RebindToSigned<decltype(d)> di;
+
+    for (size_t i = 0; i < kBits; i += N) {
+      for (size_t j = 0; j < N; j++) {
+        const size_t shift_amt_1 = (i + j) & (kBits - 1);
+        const size_t shift_amt_2 = (size_t{0} - shift_amt_1) & (kBits - 1);
+
+        const TU val_a = static_cast<TU>(kBits1 ^ j);
+        const TU val_b = static_cast<TU>(kBits2 ^ j);
+
+        expected1[j] =
+            static_cast<T>((val_a << shift_amt_1) | (val_a >> shift_amt_2));
+        expected2[j] =
+            static_cast<T>((val_a >> shift_amt_1) | (val_a << shift_amt_2));
+        expected3[j] =
+            static_cast<T>((val_b << shift_amt_1) | (val_b >> shift_amt_2));
+        expected4[j] =
+            static_cast<T>((val_b >> shift_amt_1) | (val_b << shift_amt_2));
+      }
+
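+      // Negated amounts: rotating by -r equals rotating the other way by r.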
+      const auto vrotate_amt3 = BitCast(d, Neg(BitCast(di, vrotate_amt1)));
+      const auto vrotate_amt4 = BitCast(d, Neg(BitCast(di, vrotate_amt2)));
+
+      HWY_ASSERT_VEC_EQ(d, expected1.get(), Rol(va, vrotate_amt1));
+      HWY_ASSERT_VEC_EQ(d, expected2.get(), Ror(va, vrotate_amt1));
+      HWY_ASSERT_VEC_EQ(d, expected3.get(), Rol(vb, vrotate_amt1));
+      HWY_ASSERT_VEC_EQ(d, expected4.get(), Ror(vb, vrotate_amt1));
+
+      HWY_ASSERT_VEC_EQ(d, expected1.get(), Ror(va, vrotate_amt2));
+      HWY_ASSERT_VEC_EQ(d, expected2.get(), Rol(va, vrotate_amt2));
+      HWY_ASSERT_VEC_EQ(d, expected3.get(), Ror(vb, vrotate_amt2));
+      HWY_ASSERT_VEC_EQ(d, expected4.get(), Rol(vb, vrotate_amt2));
+
+      HWY_ASSERT_VEC_EQ(d, expected1.get(), Ror(va, vrotate_amt3));
+      HWY_ASSERT_VEC_EQ(d, expected2.get(), Rol(va, vrotate_amt3));
+      HWY_ASSERT_VEC_EQ(d, expected3.get(), Ror(vb, vrotate_amt3));
+      HWY_ASSERT_VEC_EQ(d, expected4.get(), Rol(vb, vrotate_amt3));
+
+      HWY_ASSERT_VEC_EQ(d, expected1.get(), Rol(va, vrotate_amt4));
+      HWY_ASSERT_VEC_EQ(d, expected2.get(), Ror(va, vrotate_amt4));
+      HWY_ASSERT_VEC_EQ(d, expected3.get(), Rol(vb, vrotate_amt4));
+      HWY_ASSERT_VEC_EQ(d, expected4.get(), Ror(vb, vrotate_amt4));
+
+      vrotate_amt1 = Add(vrotate_amt1, vrotate_amt_incr);
+      vrotate_amt2 = Sub(vrotate_amt2, vrotate_amt_incr);
+    }
+
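+    // The *Same variants rotate every lane by the same scalar amount; also
+    // verify that negative amounts and amounts of kBits wrap around.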
+    for (int i = 0; i < static_cast<int>(kBits); ++i) {
+      for (size_t j = 0; j < N; j++) {
+        const int shift_amt_2 =
+            static_cast<int>(static_cast<size_t>(-i) & (kBits - 1));
+
+        const TU val_a = static_cast<TU>(kBits1 ^ j);
+        const TU val_b = static_cast<TU>(kBits2 ^ j);
+
+        expected1[j] = static_cast<T>((val_a << i) | (val_a >> shift_amt_2));
+        expected2[j] = static_cast<T>((val_a >> i) | (val_a << shift_amt_2));
+        expected3[j] = static_cast<T>((val_b << i) | (val_b >> shift_amt_2));
+        expected4[j] = static_cast<T>((val_b >> i) | (val_b << shift_amt_2));
+      }
+
+      HWY_ASSERT_VEC_EQ(d, expected1.get(), RotateLeftSame(va, i));
+      HWY_ASSERT_VEC_EQ(d, expected2.get(), RotateRightSame(va, i));
+      HWY_ASSERT_VEC_EQ(d, expected3.get(), RotateLeftSame(vb, i));
+      HWY_ASSERT_VEC_EQ(d, expected4.get(), RotateRightSame(vb, i));
+
+      HWY_ASSERT_VEC_EQ(d, expected1.get(), RotateRightSame(va, -i));
+      HWY_ASSERT_VEC_EQ(d, expected2.get(), RotateLeftSame(va, -i));
+      HWY_ASSERT_VEC_EQ(d, expected3.get(), RotateRightSame(vb, -i));
+      HWY_ASSERT_VEC_EQ(d, expected4.get(), RotateLeftSame(vb, -i));
+
+      HWY_ASSERT_VEC_EQ(d, expected1.get(),
+                        RotateRightSame(va, static_cast<int>(kBits) - i));
+      HWY_ASSERT_VEC_EQ(d, expected2.get(),
+                        RotateLeftSame(va, static_cast<int>(kBits) - i));
+      HWY_ASSERT_VEC_EQ(d, expected3.get(),
+                        RotateRightSame(vb, static_cast<int>(kBits) - i));
+      HWY_ASSERT_VEC_EQ(d, expected4.get(),
+                        RotateLeftSame(vb, static_cast<int>(kBits) - i));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllVariableRotations() {
+  ForIntegerTypes(ForPartialVectors<TestVariableRotations>());
+}
+
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace hwy
@@ -422,7 +628,9 @@
 HWY_BEFORE_TEST(HwyShiftTest);
 HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllShifts);
 HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllVariableShifts);
+HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllRotateLeft);
 HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllRotateRight);
+HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllVariableRotations);
 }  // namespace hwy
 
 #endif
diff --git a/hwy/timer-inl.h b/hwy/timer-inl.h
index c286b0a..2e082fc 100644
--- a/hwy/timer-inl.h
+++ b/hwy/timer-inl.h
@@ -24,7 +24,6 @@
 #endif
 
 #include "hwy/highway.h"
-#include "hwy/timer.h"
 
 #if defined(_WIN32) || defined(_WIN64)
 #ifndef NOMINMAX
@@ -50,6 +49,7 @@
 #include <intrin.h>
 #endif
 
+#include <stdint.h>
 #include <time.h>  // clock_gettime
 
 HWY_BEFORE_NAMESPACE();
@@ -140,7 +140,7 @@
       // "cc" = flags modified by SHL.
       : "rdx", "memory", "cc");
 #elif HWY_ARCH_RVV
-  asm volatile("rdtime %0" : "=r"(t));
+  asm volatile("fence; rdtime %0" : "=r"(t));
 #elif defined(_WIN32) || defined(_WIN64)
   LARGE_INTEGER counter;
   (void)QueryPerformanceCounter(&counter);
diff --git a/run_tests.sh b/run_tests.sh
index 54bae91..10a2009 100755
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -14,9 +14,9 @@
 make -j && ctest -j && cd .. && rm -rf build
 
 #######################################
-echo DEBUG Clang 9
+echo DEBUG Clang 13
 rm -rf build_dbg && mkdir build_dbg && cd build_dbg
-CXX=clang++-9 CC=clang-9 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Debug
+CXX=clang++-13 CC=clang-13 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Debug
 make -j && ctest -j && cd .. && rm -rf build_dbg
 
 #######################################
@@ -26,7 +26,7 @@
 make -j && ctest -j && cd .. && rm -rf build_32
 
 #######################################
-for VER in 10 11 12; do
+for VER in 11 12 13; do
   echo GCC $VER
   rm -rf build_g$VER && mkdir build_g$VER && cd build_g$VER
   CC=gcc-$VER CXX=g++-$VER cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Release
@@ -75,5 +75,19 @@
 CC=powerpc64le-linux-gnu-gcc-12 CXX=powerpc64le-linux-gnu-g++-12 cmake .. -DCMAKE_BUILD_TYPE=Release -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_C_COMPILER_TARGET="powerpc64le-linux-gnu" -DCMAKE_CXX_COMPILER_TARGET="powerpc64le-linux-gnu" -DCMAKE_CROSSCOMPILING=true -DCMAKE_CXX_FLAGS='-mcpu=power10'
 clear && make -j && ctest -j && cd .. && rm -rf build_ppc10
 
+#######################################
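+# Cross-compile for s390x z14 with GCC 11 and run the tests under qemu-s390x.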
+echo Z14
+export QEMU_LD_PREFIX=/usr/s390x-linux-gnu
+rm -rf build_z14 && mkdir build_z14 && cd build_z14
+CC=s390x-linux-gnu-gcc-11 CXX=s390x-linux-gnu-g++-11 cmake .. -DCMAKE_C_COMPILER_TARGET="s390x-linux-gnu" -DCMAKE_CXX_COMPILER_TARGET="s390x-linux-gnu" -DCMAKE_C_FLAGS='-march=z14 -mzvector' -DCMAKE_CXX_FLAGS='-march=z14 -mzvector -DHWY_DISABLED_TARGETS="(HWY_SCALAR|HWY_EMU128|HWY_Z15)" -DHWY_COMPILE_ONLY_STATIC=1' -DCMAKE_CROSSCOMPILING=true -DCMAKE_CROSSCOMPILING_EMULATOR="/usr/bin/qemu-s390x;-L;/usr/s390x-linux-gnu"
+clear && make -j && ctest -j && cd .. && rm -rf build_z14
+
+#######################################
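+# Same, but targeting z15 and enabling vxeh2 in qemu.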
+echo Z15
+export QEMU_LD_PREFIX=/usr/s390x-linux-gnu
+rm -rf build_z15 && mkdir build_z15 && cd build_z15
+CC=s390x-linux-gnu-gcc-11 CXX=s390x-linux-gnu-g++-11 cmake .. -DCMAKE_C_COMPILER_TARGET="s390x-linux-gnu" -DCMAKE_CXX_COMPILER_TARGET="s390x-linux-gnu" -DCMAKE_C_FLAGS='-march=z15 -mzvector' -DCMAKE_CXX_FLAGS='-march=z15 -mzvector -DHWY_DISABLED_TARGETS="(HWY_SCALAR|HWY_EMU128|HWY_Z14)" -DHWY_COMPILE_ONLY_STATIC=1' -DCMAKE_CROSSCOMPILING=true -DCMAKE_CROSSCOMPILING_EMULATOR="/usr/bin/qemu-s390x;-cpu;max,vxeh2=on;-L;/usr/s390x-linux-gnu"
+clear && make -j && ctest -j && cd .. && rm -rf build_z15
+
 
 echo Success