Merge pull request #2128 from johnplatts:hwy_rvv_detect_fix_043024_2

PiperOrigin-RevId: 631058462
diff --git a/BUILD b/BUILD
index 8ff5f1e..5fda7b3 100644
--- a/BUILD
+++ b/BUILD
@@ -559,6 +559,11 @@
                 subdir + test + ".cc",
             ],
             copts = COPTS + HWY_TEST_COPTS,
+            # Fixes OOM for matvec_test on RVV.
+            exec_properties = select({
+                "@platforms//cpu:riscv64": {"mem": "16g"},
+                "//conditions:default": None,
+            }),
             features = select({
                 "@platforms//cpu:riscv64": ["fully_static_link"],
                 "//conditions:default": [],
@@ -579,6 +584,7 @@
                 "//conditions:default": False,
             }),
             local_defines = ["HWY_IS_TEST"],
+            # Placeholder for malloc, do not remove
             # for test_suite.
             tags = ["hwy_ops_test"],
             deps = HWY_TEST_DEPS + select({
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c8c3fe2..3f1ced8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,6 +79,9 @@
 # arise due to compiler/platform changes. Enable this in CI/tests.
 set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")
 
+# Experimental support for header-only builds
+set(HWY_CMAKE_HEADER_ONLY OFF CACHE BOOL "Change to header-only?")
+
 set(HWY_ENABLE_CONTRIB ON CACHE BOOL "Include contrib/")
 set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
 set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")
@@ -137,9 +140,7 @@
 endif()  # HWY_ENABLE_CONTRIB
 
 set(HWY_SOURCES
-    hwy/abort.cc
     hwy/abort.h
-    hwy/aligned_allocator.cc
     hwy/aligned_allocator.h
     hwy/base.h
     hwy/cache_control.h
@@ -148,7 +149,6 @@
     hwy/foreach_target.h
     hwy/highway_export.h
     hwy/highway.h
-    hwy/nanobenchmark.cc
     hwy/nanobenchmark.h
     hwy/ops/arm_neon-inl.h
     hwy/ops/arm_sve-inl.h
@@ -164,20 +164,28 @@
     hwy/ops/x86_128-inl.h
     hwy/ops/x86_256-inl.h
     hwy/ops/x86_512-inl.h
-    hwy/per_target.cc
     hwy/per_target.h
     hwy/print-inl.h
-    hwy/print.cc
     hwy/print.h
     hwy/profiler.h
     hwy/robust_statistics.h
-    hwy/targets.cc
     hwy/targets.h
     hwy/timer-inl.h
-    hwy/timer.cc
     hwy/timer.h
 )
 
+if (NOT HWY_CMAKE_HEADER_ONLY)
+  list(APPEND HWY_SOURCES
+    hwy/abort.cc
+    hwy/aligned_allocator.cc
+    hwy/nanobenchmark.cc
+    hwy/per_target.cc
+    hwy/print.cc
+    hwy/targets.cc
+    hwy/timer.cc
+  )
+endif()
+
 set(HWY_TEST_SOURCES
     hwy/tests/hwy_gtest.h
     hwy/tests/test_util-inl.h
@@ -367,6 +375,10 @@
 
 endif()  # !MSVC
 
+if (HWY_CMAKE_HEADER_ONLY)
+  list(APPEND HWY_FLAGS -DHWY_HEADER_ONLY)
+endif()
+
 include(CheckIncludeFile)
 check_include_file(sys/auxv.h  HAVE_SYS_AUXV_H)
 check_include_file(asm/hwcap.h HAVE_ASM_HWCAP_H)
diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md
index dd37635..94160fc 100644
--- a/g3doc/quick_reference.md
+++ b/g3doc/quick_reference.md
@@ -808,14 +808,23 @@
 
 *   <code>V **NegMulSub**(V a, V b, V c)</code>: returns `-a[i] * b[i] - c[i]`.
 
-*   <code>V **MulAddSub**(V a, V b, V c)</code>: returns `a[i] * b[i] - c[i]`
-    in the even lanes and `a[i] * b[i] + c[i]` in the odd lanes.
+*   <code>V **MulAddSub**(V a, V b, V c)</code>: returns `a[i] * b[i] - c[i]` in
+    the even lanes and `a[i] * b[i] + c[i]` in the odd lanes.
 
-    `MulAddSub(a, b, c)` is equivalent to
-    `OddEven(MulAdd(a, b, c), MulSub(a, b, c))` or
-    `MulAddSub(a, b, OddEven(c, Neg(c))`, but `MulSub(a, b, c)` is more
+    `MulAddSub(a, b, c)` is equivalent to `OddEven(MulAdd(a, b, c), MulSub(a, b,
+    c))` or `MulAddSub(a, b, OddEven(c, Neg(c)))`, but `MulAddSub(a, b, c)` is more
     efficient on some targets (including AVX2/AVX3).
 
+*   `V`: `bf16`, `D`: `RepartitionToWide<DFromV<V>>`, `VW`: `Vec<D>` \
+    <code>VW **MulEvenAdd**(D d, V a, V b, VW c)</code>: equivalent to and
+    potentially more efficient than `MulAdd(PromoteEvenTo(d, a),
+    PromoteEvenTo(d, b), c)`.
+
+*   `V`: `bf16`, `D`: `RepartitionToWide<DFromV<V>>`, `VW`: `Vec<D>` \
+    <code>VW **MulOddAdd**(D d, V a, V b, VW c)</code>: equivalent to and
+    potentially more efficient than `MulAdd(PromoteOddTo(d, a), PromoteOddTo(d,
+    b), c)`.
+
 #### Masked arithmetic
 
 All ops in this section return `no` for `mask=false` lanes, and suppress any
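
For illustration, a minimal usage sketch of the new `MulEvenAdd`/`MulOddAdd` ops (not part of this change): a bf16 dot product accumulated in f32, using static dispatch. `pa`, `pb` and `num` are hypothetical inputs; `num` is assumed to be a multiple of `Lanes(dbf)`.

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;  // static dispatch

float DotBF16(const hwy::bfloat16_t* HWY_RESTRICT pa,
              const hwy::bfloat16_t* HWY_RESTRICT pb, size_t num) {
  const hn::ScalableTag<float> df32;
  const hn::Repartition<hwy::bfloat16_t, decltype(df32)> dbf;
  auto sum_even = hn::Zero(df32);
  auto sum_odd = hn::Zero(df32);
  for (size_t i = 0; i < num; i += hn::Lanes(dbf)) {
    const auto a = hn::LoadU(dbf, pa + i);
    const auto b = hn::LoadU(dbf, pb + i);
    sum_even = hn::MulEvenAdd(df32, a, b, sum_even);  // even bf16 lanes
    sum_odd = hn::MulOddAdd(df32, a, b, sum_odd);     // odd bf16 lanes
  }
  // Lane order does not matter for a dot product; sum both accumulators.
  return hn::ReduceSum(df32, hn::Add(sum_even, sum_odd));
}
```
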
diff --git a/hwy/contrib/matvec/matvec_test.cc b/hwy/contrib/matvec/matvec_test.cc
index 51e6151..8b87634 100644
--- a/hwy/contrib/matvec/matvec_test.cc
+++ b/hwy/contrib/matvec/matvec_test.cc
@@ -206,6 +206,8 @@
     ThreadPool pool(HWY_MIN(num_threads, ThreadPool::MaxThreads()));
 
     Test<AdjustedReps(192), AdjustedReps(256)>(d, pool);
+// Fewer tests due to compiler OOM
+#if HWY_TARGET != HWY_RVV
     Test<40, AdjustedReps(512)>(d, pool);
     Test<AdjustedReps(1024), 50>(d, pool);
 
@@ -213,13 +215,17 @@
     if (sizeof(TFromD<D>) != 2 && sizeof(VecT) != 2) {
       Test<AdjustedReps(1536), AdjustedReps(1536)>(d, pool);
     }
+#endif  // HWY_TARGET != HWY_RVV
   }
 
  public:
   template <class T, class D>
   HWY_INLINE void operator()(T /*unused*/, D d) {
     CreatePoolAndTest(d, 13);
+// Fewer tests due to compiler OOM
+#if HWY_TARGET != HWY_RVV
     CreatePoolAndTest(d, 16);
+#endif
   }
 };
 
diff --git a/hwy/contrib/sort/vqsort-inl.h b/hwy/contrib/sort/vqsort-inl.h
index c5ccba7..523a215 100644
--- a/hwy/contrib/sort/vqsort-inl.h
+++ b/hwy/contrib/sort/vqsort-inl.h
@@ -1779,7 +1779,7 @@
     MaybePrintVector(d, "pivot", pivot, 0, st.LanesPerKey());
     MaybePrintVector(d, "second", second, 0, st.LanesPerKey());
 
-    Vec<D> third;
+    Vec<D> third = Zero(d);
     // Not supported for key-value types because two 'keys' may be equivalent
     // but not interchangeable (their values may differ).
     if (HWY_UNLIKELY(!st.IsKV() &&
diff --git a/hwy/contrib/thread_pool/topology.cc b/hwy/contrib/thread_pool/topology.cc
index 41c13f8..a24b8f6 100644
--- a/hwy/contrib/thread_pool/topology.cc
+++ b/hwy/contrib/thread_pool/topology.cc
@@ -15,6 +15,14 @@
 
 #include "hwy/contrib/thread_pool/topology.h"
 
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <map>
+#include <thread>  // NOLINT
+#include <vector>
+
 #include "hwy/detect_compiler_arch.h"  // HWY_OS_WIN
 
 #if HWY_OS_WIN
@@ -46,10 +54,11 @@
 #include <emscripten/threading.h>
 #endif
 
-#include <stddef.h>
-#include <stdio.h>
-
-#include <thread>  // NOLINT
+#if HWY_OS_LINUX
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#endif  // HWY_OS_LINUX
 
 #include "hwy/base.h"
 
@@ -64,26 +73,38 @@
 }
 
 HWY_DLLEXPORT size_t TotalLogicalProcessors() {
+  size_t lp = 0;
 #if HWY_ARCH_WASM
-  const int lp = emscripten_num_logical_cores();
-  HWY_ASSERT(lp < static_cast<int>(kMaxLogicalProcessors));
-  if (lp > 0) return static_cast<size_t>(lp);
+  const int num_cores = emscripten_num_logical_cores();
+  if (num_cores > 0) lp = static_cast<size_t>(num_cores);
 #else
-  const unsigned lp = std::thread::hardware_concurrency();
-  HWY_ASSERT(lp < static_cast<unsigned>(kMaxLogicalProcessors));
-  if (lp != 0) return static_cast<size_t>(lp);
+  const unsigned concurrency = std::thread::hardware_concurrency();
+  if (concurrency != 0) lp = static_cast<size_t>(concurrency);
 #endif
 
-  // WASM or C++ stdlib failed.
-  if (HWY_IS_DEBUG_BUILD) {
-    fprintf(
-        stderr,
-        "Unknown TotalLogicalProcessors. HWY_OS_: WIN=%d LINUX=%d APPLE=%d;\n"
-        "HWY_ARCH_: WASM=%d X86=%d PPC=%d ARM=%d RISCV=%d S390X=%d\n",
-        HWY_OS_WIN, HWY_OS_LINUX, HWY_OS_APPLE, HWY_ARCH_WASM, HWY_ARCH_X86,
-        HWY_ARCH_PPC, HWY_ARCH_ARM, HWY_ARCH_RISCV, HWY_ARCH_S390X);
+  // WASM or C++ stdlib failed to detect #CPUs.
+  if (lp == 0) {
+    if (HWY_IS_DEBUG_BUILD) {
+      fprintf(
+          stderr,
+          "Unknown TotalLogicalProcessors. HWY_OS_: WIN=%d LINUX=%d APPLE=%d;\n"
+          "HWY_ARCH_: WASM=%d X86=%d PPC=%d ARM=%d RISCV=%d S390X=%d\n",
+          HWY_OS_WIN, HWY_OS_LINUX, HWY_OS_APPLE, HWY_ARCH_WASM, HWY_ARCH_X86,
+          HWY_ARCH_PPC, HWY_ARCH_ARM, HWY_ARCH_RISCV, HWY_ARCH_S390X);
+    }
+    return 1;
   }
-  return 1;
+
+  // Warn that we are clamping.
+  if (lp > kMaxLogicalProcessors) {
+    if (HWY_IS_DEBUG_BUILD) {
+      fprintf(stderr, "OS reports %zu processors but clamping to %zu\n", lp,
+              kMaxLogicalProcessors);
+    }
+    lp = kMaxLogicalProcessors;
+  }
+
+  return lp;
 }
 
 #ifdef __ANDROID__
@@ -111,9 +132,17 @@
 #endif  // __ANDROID__
   if (err != 0) return false;
   for (size_t lp = 0; lp < kMaxLogicalProcessors; ++lp) {
+#if HWY_COMPILER_GCC_ACTUAL
+    // Workaround for GCC compiler warning with CPU_ISSET macro
+    HWY_DIAGNOSTICS(push)
+    HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
+#endif
     if (CPU_ISSET(static_cast<int>(lp), &set)) {
       lps.Set(lp);
     }
+#if HWY_COMPILER_GCC_ACTUAL
+    HWY_DIAGNOSTICS(pop)
+#endif
   }
   return true;
 #elif HWY_OS_FREEBSD
@@ -124,9 +153,17 @@
                                      sizeof(cpuset_t), &set);
   if (err != 0) return false;
   for (size_t lp = 0; lp < kMaxLogicalProcessors; ++lp) {
+#if HWY_COMPILER_GCC_ACTUAL
+    // Workaround for GCC compiler warning with CPU_ISSET macro
+    HWY_DIAGNOSTICS(push)
+    HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
+#endif
     if (CPU_ISSET(static_cast<int>(lp), &set)) {
       lps.Set(lp);
     }
+#if HWY_COMPILER_GCC_ACTUAL
+    HWY_DIAGNOSTICS(pop)
+#endif
   }
   return true;
 #else
@@ -144,7 +181,15 @@
 #elif HWY_OS_LINUX
   cpu_set_t set;
   CPU_ZERO(&set);
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for GCC compiler warning with CPU_SET macro
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
+#endif
   lps.Foreach([&set](size_t lp) { CPU_SET(static_cast<int>(lp), &set); });
+#if HWY_COMPILER_GCC_ACTUAL
+  HWY_DIAGNOSTICS(pop)
+#endif
   const pid_t pid = 0;  // current thread
 #ifdef __ANDROID__
   const int err = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &set);
@@ -156,7 +201,15 @@
 #elif HWY_OS_FREEBSD
   cpuset_t set;
   CPU_ZERO(&set);
+#if HWY_COMPILER_GCC_ACTUAL
+  // Workaround for GCC compiler warning with CPU_SET macro
+  HWY_DIAGNOSTICS(push)
+  HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
+#endif
   lps.Foreach([&set](size_t lp) { CPU_SET(static_cast<int>(lp), &set); });
+#if HWY_COMPILER_GCC_ACTUAL
+  HWY_DIAGNOSTICS(pop)
+#endif
   const pid_t pid = getpid();  // current thread
   const int err = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid,
                                      sizeof(cpuset_t), &set);
@@ -169,4 +222,224 @@
 #endif
 }
 
+#if HWY_OS_LINUX
+
+class File {
+ public:
+  explicit File(const char* path) {
+    for (;;) {
+      fd_ = open(path, O_RDONLY);
+      if (fd_ > 0) return;           // success
+      if (errno == EINTR) continue;  // signal: retry
+      if (errno == ENOENT) return;   // not found, give up
+      if (HWY_IS_DEBUG_BUILD) {
+        fprintf(stderr, "Unexpected error opening %s: %d\n", path, errno);
+      }
+      return;  // unknown error, give up
+    }
+  }
+
+  ~File() {
+    if (fd_ > 0) {
+      for (;;) {
+        const int ret = close(fd_);
+        if (ret == 0) break;           // success
+        if (errno == EINTR) continue;  // signal: retry
+        if (HWY_IS_DEBUG_BUILD) {
+          fprintf(stderr, "Unexpected error closing file: %d\n", errno);
+        }
+        return;  // unknown error, ignore
+      }
+    }
+  }
+
+  // Returns number of bytes read or 0 on failure.
+  size_t Read(char* buf200) const {
+    if (fd_ < 0) return 0;
+    size_t pos = 0;
+    for (;;) {
+      // read instead of pread, which might not work for sysfs.
+      const auto bytes_read = read(fd_, buf200 + pos, 200 - pos);
+      if (bytes_read == 0) {  // EOF: done
+        buf200[pos++] = '\0';
+        return pos;
+      }
+      if (bytes_read == -1) {
+        if (errno == EINTR) continue;  // signal: retry
+        if (HWY_IS_DEBUG_BUILD) {
+          fprintf(stderr, "Unexpected error reading file: %d\n", errno);
+        }
+        return 0;
+      }
+      pos += static_cast<size_t>(bytes_read);
+      HWY_ASSERT(pos <= 200);
+    }
+  }
+
+ private:
+  int fd_;
+};
+
+bool SimpleAtoi(const char* str, size_t len, size_t* out) {
+  size_t value = 0;
+  size_t digits_seen = 0;
+  // 9 digits cannot overflow even 32-bit size_t.
+  for (size_t i = 0; i < HWY_MIN(len, 9); ++i) {
+    if (str[i] < '0' || str[i] > '9') break;
+    value *= 10;
+    value += static_cast<size_t>(str[i] - '0');
+    ++digits_seen;
+  }
+  if (digits_seen == 0) {
+    *out = 0;
+    return false;
+  }
+  *out = value;
+  return true;
+}
+
+bool ReadSysfs(const char* format, size_t lp, size_t max, size_t* out) {
+  char path[200];
+  const int bytes_written = snprintf(path, sizeof(path), format, lp);
+  HWY_ASSERT(0 < bytes_written &&
+             bytes_written < static_cast<int>(sizeof(path) - 1));
+
+  const File file(path);
+  char buf200[200];
+  const size_t pos = file.Read(buf200);
+  if (pos == 0) return false;
+
+  if (!SimpleAtoi(buf200, pos, out)) return false;
+  if (*out > max) {
+    if (HWY_IS_DEBUG_BUILD) {
+      fprintf(stderr, "Value for %s = %zu > %zu\n", path, *out, max);
+    }
+    return false;
+  }
+  return true;
+}
+
+// Given a set of opaque values, returns a map of each value to the number of
+// values that precede it. Used to generate contiguous arrays in the same order.
+std::map<size_t, size_t> RanksFromSet(const BitSet4096<>& set) {
+  HWY_ASSERT(set.Count() != 0);
+  std::map<size_t, size_t> ranks;
+  size_t num = 0;
+  set.Foreach([&ranks, &num](size_t opaque) {
+    const bool inserted = ranks.insert({opaque, num++}).second;
+    HWY_ASSERT(inserted);
+  });
+  HWY_ASSERT(num == set.Count());
+  HWY_ASSERT(num == ranks.size());
+  return ranks;
+}
+
+const char* kPackage =
+    "/sys/devices/system/cpu/cpu%zu/topology/physical_package_id";
+const char* kCluster = "/sys/devices/system/cpu/cpu%zu/cache/index3/id";
+const char* kCore = "/sys/devices/system/cpu/cpu%zu/topology/core_id";
+
+// Returns 0 on error, or number of packages and initializes LP::package.
+size_t DetectPackages(std::vector<Topology::LP>& lps) {
+  // Package IDs are typically small indices, but could be arbitrary values.
+  // We assume they do not exceed kMaxLogicalProcessors and thus fit in a set.
+  BitSet4096<> package_set;
+  for (size_t lp = 0; lp < lps.size(); ++lp) {
+    size_t package;
+    if (!ReadSysfs(kPackage, lp, kMaxLogicalProcessors, &package)) return 0;
+    package_set.Set(package);
+    // Storing a set of lps belonging to each package requires more space/time
+    // than storing the package per lp and remapping below.
+    lps[lp].package = static_cast<uint16_t>(package);
+  }
+  HWY_ASSERT(package_set.Count() != 0);
+
+  // Remap the per-lp package to their rank.
+  std::map<size_t, size_t> ranks = RanksFromSet(package_set);
+  for (size_t lp = 0; lp < lps.size(); ++lp) {
+    lps[lp].package = static_cast<uint16_t>(ranks[lps[lp].package]);
+  }
+  return ranks.size();
+}
+
+// Stores the global cluster/core values separately for each package so we can
+// return per-package arrays.
+struct PerPackage {
+  // Use maximum possible set size because Arm values can exceed 1k.
+  BitSet4096<> cluster_set;
+  BitSet4096<> core_set;
+
+  std::map<size_t, size_t> smt_per_core;
+
+  size_t num_clusters;
+  size_t num_cores;
+};
+
+// Returns false, or fills per_package and initializes LP::cluster/core/smt.
+bool DetectClusterAndCore(std::vector<Topology::LP>& lps,
+                          std::vector<PerPackage>& per_package) {
+  for (size_t lp = 0; lp < lps.size(); ++lp) {
+    PerPackage& pp = per_package[lps[lp].package];
+    size_t cluster, core;
+    if (!ReadSysfs(kCluster, lp, kMaxLogicalProcessors, &cluster)) return false;
+    if (!ReadSysfs(kCore, lp, kMaxLogicalProcessors, &core)) return false;
+
+    // SMT ID is how many LP we have already seen assigned to the same core.
+    const size_t smt = pp.smt_per_core[core]++;
+    HWY_ASSERT(smt < 16);
+    lps[lp].smt = static_cast<uint8_t>(smt);  // already contiguous
+
+    // Certainly core, and likely also cluster, are HW-dependent opaque values.
+    // We assume they do not exceed kMaxLogicalProcessors and thus fit in a set.
+    pp.cluster_set.Set(cluster);
+    pp.core_set.Set(core);
+    // Temporary storage for remapping as in DetectPackages.
+    lps[lp].cluster = static_cast<uint16_t>(cluster);
+    lps[lp].core = static_cast<uint16_t>(core);
+  }
+
+  for (size_t p = 0; p < per_package.size(); ++p) {
+    PerPackage& pp = per_package[p];
+    std::map<size_t, size_t> cluster_ranks = RanksFromSet(pp.cluster_set);
+    std::map<size_t, size_t> core_ranks = RanksFromSet(pp.core_set);
+    // Remap *this package's* per-lp cluster/core to their ranks.
+    for (size_t lp = 0; lp < lps.size(); ++lp) {
+      if (lps[lp].package != p) continue;
+      lps[lp].cluster = static_cast<uint16_t>(cluster_ranks[lps[lp].cluster]);
+      lps[lp].core = static_cast<uint16_t>(core_ranks[lps[lp].core]);
+    }
+    pp.num_clusters = cluster_ranks.size();
+    pp.num_cores = core_ranks.size();
+  }
+
+  return true;
+}
+
+#endif  // HWY_OS_LINUX
+
+HWY_DLLEXPORT Topology::Topology() {
+#if HWY_OS_LINUX
+  lps.resize(TotalLogicalProcessors());
+
+  const size_t num_packages = DetectPackages(lps);
+  if (num_packages == 0) return;
+  std::vector<PerPackage> per_package(num_packages);
+  if (!DetectClusterAndCore(lps, per_package)) return;
+
+  // Allocate per-package/cluster/core vectors. This indicates to callers that
+  // detection succeeded.
+  packages.resize(num_packages);
+  for (size_t p = 0; p < num_packages; ++p) {
+    packages[p].clusters.resize(per_package[p].num_clusters);
+    packages[p].cores.resize(per_package[p].num_cores);
+  }
+
+  // Populate the per-cluster/core sets of LP.
+  for (size_t lp = 0; lp < lps.size(); ++lp) {
+    packages[lps[lp].package].clusters[lps[lp].cluster].lps.Set(lp);
+    packages[lps[lp].package].cores[lps[lp].core].lps.Set(lp);
+  }
+#endif
+}
+
 }  // namespace hwy
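
For context, the detection above maps arbitrary sysfs IDs (package/cluster/core) to dense ranks via `RanksFromSet` so they can index contiguous vectors. A self-contained sketch of that remapping idea, with made-up IDs and `std::set`/`std::map` standing in for `BitSet4096<>`:

```cpp
#include <cstdio>
#include <map>
#include <set>
#include <vector>

int main() {
  // E.g. physical_package_id values read from sysfs for four LPs.
  const std::vector<size_t> ids = {36, 36, 2, 2};
  const std::set<size_t> unique(ids.begin(), ids.end());
  std::map<size_t, size_t> rank;  // ID -> number of smaller IDs
  size_t next = 0;
  for (const size_t id : unique) rank[id] = next++;
  for (size_t lp = 0; lp < ids.size(); ++lp) {
    std::printf("lp %zu: id %zu -> package %zu\n", lp, ids[lp], rank[ids[lp]]);
  }
  // Prints packages 1, 1, 0, 0: two packages with dense indices.
  return 0;
}
```
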
diff --git a/hwy/contrib/thread_pool/topology.h b/hwy/contrib/thread_pool/topology.h
index 086cd2b..0a001c5 100644
--- a/hwy/contrib/thread_pool/topology.h
+++ b/hwy/contrib/thread_pool/topology.h
@@ -16,27 +16,16 @@
 #ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_
 #define HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_
 
-// OS-specific functions for thread affinity.
+// OS-specific functions for processor topology and thread affinity.
 
 #include <stddef.h>
 
+#include <vector>
+
 #include "hwy/base.h"
 
 namespace hwy {
 
-// Returns false if std::thread should not be used.
-HWY_DLLEXPORT bool HaveThreadingSupport();
-
-// Upper bound on logical processors, including hyperthreads, to enable
-// fixed-size arrays. This limit matches glibc.
-static constexpr size_t kMaxLogicalProcessors = 1024;
-
-// Returns 1 if unknown, otherwise the total number of logical processors
-// provided by the hardware. We assert this is at most `kMaxLogicalProcessors`.
-// These processors are not necessarily all usable; you can determine which are
-// via GetThreadAffinity().
-HWY_DLLEXPORT size_t TotalLogicalProcessors();
-
 // 64-bit specialization of std::bitset, which lacks Foreach.
 class BitSet64 {
  public:
@@ -84,17 +73,18 @@
   uint64_t bits_ = 0;
 };
 
-// Two-level bitset for Get/SetThreadAffinity.
-class LogicalProcessorSet {
+// Two-level bitset for up to kMaxSize <= 4096 values.
+template <size_t kMaxSize = 4096>
+class BitSet4096 {
  public:
-  // No harm if `lp` is already set.
-  void Set(size_t lp) {
-    HWY_DASSERT(lp < TotalLogicalProcessors());
-    const size_t idx = lp / 64;
-    const size_t mod = lp % 64;
+  // No harm if `i` is already set.
+  void Set(size_t i) {
+    HWY_DASSERT(i < kMaxSize);
+    const size_t idx = i / 64;
+    const size_t mod = i % 64;
     bits_[idx].Set(mod);
     nonzero_.Set(idx);
-    HWY_DASSERT(Get(lp));
+    HWY_DASSERT(Get(i));
   }
 
   // Equivalent to Set(i) for i in [0, 64) where (bits >> i) & 1. This does
@@ -104,28 +94,28 @@
     if (bits) nonzero_.Set(0);
   }
 
-  void Clear(size_t lp) {
-    HWY_DASSERT(lp < TotalLogicalProcessors());
-    const size_t idx = lp / 64;
-    const size_t mod = lp % 64;
+  void Clear(size_t i) {
+    HWY_DASSERT(i < kMaxSize);
+    const size_t idx = i / 64;
+    const size_t mod = i % 64;
     bits_[idx].Clear(mod);
     if (!bits_[idx].Any()) {
       nonzero_.Clear(idx);
     }
-    HWY_DASSERT(!Get(lp));
+    HWY_DASSERT(!Get(i));
   }
 
-  bool Get(size_t lp) const {
-    HWY_DASSERT(lp < TotalLogicalProcessors());
-    const size_t idx = lp / 64;
-    const size_t mod = lp % 64;
+  bool Get(size_t i) const {
+    HWY_DASSERT(i < kMaxSize);
+    const size_t idx = i / 64;
+    const size_t mod = i % 64;
     return bits_[idx].Get(mod);
   }
 
-  // Returns uint64_t(Get(lp)) << lp for lp in [0, 64).
+  // Returns uint64_t(Get(i)) << i for i in [0, 64).
   uint64_t Get64() const { return bits_[0].Get64(); }
 
-  // Calls func(lp) for each lp in the set.
+  // Calls func(i) for each i in the set.
   template <class Func>
   void Foreach(const Func& func) const {
     nonzero_.Foreach([&func, this](size_t idx) {
@@ -141,11 +131,20 @@
   }
 
  private:
-  static_assert(kMaxLogicalProcessors <= 64 * 64, "One BitSet64 insufficient");
+  static_assert(kMaxSize <= 64 * 64, "One BitSet64 insufficient");
   BitSet64 nonzero_;
-  BitSet64 bits_[kMaxLogicalProcessors / 64];
+  BitSet64 bits_[kMaxSize / 64];
 };
 
+// Returns false if std::thread should not be used.
+HWY_DLLEXPORT bool HaveThreadingSupport();
+
+// Upper bound on logical processors, including hyperthreads.
+static constexpr size_t kMaxLogicalProcessors = 1024;  // matches glibc
+
+// Set used by Get/SetThreadAffinity.
+using LogicalProcessorSet = BitSet4096<kMaxLogicalProcessors>;
+
 // Returns false, or sets `lps` to all logical processors which are online and
 // available to the current thread.
 HWY_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps);
@@ -164,6 +163,47 @@
   return SetThreadAffinity(lps);
 }
 
+// Returns 1 if unknown, otherwise the total number of logical processors
+// provided by the hardware, clamped to `kMaxLogicalProcessors`.
+// These processors are not necessarily all usable; you can determine which are
+// via GetThreadAffinity().
+HWY_DLLEXPORT size_t TotalLogicalProcessors();
+
+struct Topology {
+  // Caller must check packages.empty(); if so, do not use any fields.
+  HWY_DLLEXPORT Topology();
+
+  // Clique of cores with lower latency to each other. On Apple M1 these are
+  // four cores sharing an L2. On AMD these 'CCX' are up to eight cores sharing
+  // an L3 and a memory controller.
+  struct Cluster {
+    LogicalProcessorSet lps;
+  };
+
+  struct Core {
+    LogicalProcessorSet lps;
+  };
+
+  struct Package {
+    std::vector<Cluster> clusters;
+    std::vector<Core> cores;
+  };
+
+  std::vector<Package> packages;
+
+  // Several hundred instances, so prefer a compact representation.
+#pragma pack(push, 1)
+  struct LP {
+    uint16_t package = 0;
+    uint16_t cluster = 0;  // local to the package, not globally unique
+    uint16_t core = 0;     // local to the package, not globally unique
+    uint8_t smt = 0;       // local to the package and core
+    uint8_t reserved = 0;
+  };
+#pragma pack(pop)
+  std::vector<LP> lps;  // size() == TotalLogicalProcessors().
+};
+
 }  // namespace hwy
 
 #endif  // HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_
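
A usage sketch for the new `Topology` interface (illustrative, not from this change): report the detected shape and pin the calling thread to the first cluster. The function name is hypothetical.

```cpp
#include <cstdio>

#include "hwy/contrib/thread_pool/topology.h"

void ReportAndPin() {
  hwy::Topology topology;
  if (topology.packages.empty()) return;  // detection failed; fields unusable
  for (size_t p = 0; p < topology.packages.size(); ++p) {
    const hwy::Topology::Package& pkg = topology.packages[p];
    size_t lps_in_pkg = 0;
    for (const hwy::Topology::Core& core : pkg.cores) {
      lps_in_pkg += core.lps.Count();
    }
    std::fprintf(stderr, "package %zu: %zu clusters, %zu cores, %zu LPs\n", p,
                 pkg.clusters.size(), pkg.cores.size(), lps_in_pkg);
  }
  // Pin the calling thread to the logical processors of the first cluster.
  (void)hwy::SetThreadAffinity(topology.packages[0].clusters[0].lps);
}
```
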
diff --git a/hwy/contrib/thread_pool/topology_test.cc b/hwy/contrib/thread_pool/topology_test.cc
index 450e25d..c844791 100644
--- a/hwy/contrib/thread_pool/topology_test.cc
+++ b/hwy/contrib/thread_pool/topology_test.cc
@@ -25,6 +25,7 @@
 #include <utility>  // std::make_pair
 #include <vector>
 
+#include "hwy/base.h"
 #include "hwy/tests/hwy_gtest.h"
 #include "hwy/tests/test_util-inl.h"
 #include "hwy/tests/test_util.h"
@@ -188,13 +189,39 @@
 TEST(TopologyTest, TestNum) {
   const size_t total = TotalLogicalProcessors();
   fprintf(stderr, "TotalLogical %zu\n", total);
+
   LogicalProcessorSet lps;
   if (GetThreadAffinity(lps)) {
-    HWY_ASSERT(lps.Count() <= total);
     fprintf(stderr, "Active %zu\n", lps.Count());
+    HWY_ASSERT(lps.Count() <= total);
   }
 }
 
+TEST(TopologyTest, TestTopology) {
+  Topology topology;
+  if (topology.packages.empty()) return;
+
+  HWY_ASSERT(!topology.lps.empty());
+
+  size_t lps_by_cluster = 0;
+  size_t lps_by_core = 0;
+  for (size_t p = 0; p < topology.packages.size(); ++p) {
+    const Topology::Package& pkg = topology.packages[p];
+    HWY_ASSERT(!pkg.clusters.empty());
+    HWY_ASSERT(!pkg.cores.empty());
+    HWY_ASSERT(pkg.clusters.size() <= pkg.cores.size());
+
+    for (const Topology::Cluster& c : pkg.clusters) {
+      lps_by_cluster += c.lps.Count();
+    }
+    for (const Topology::Core& c : pkg.cores) {
+      lps_by_core += c.lps.Count();
+    }
+  }
+  HWY_ASSERT(lps_by_cluster == topology.lps.size());
+  HWY_ASSERT(lps_by_core == topology.lps.size());
+}
+
 }  // namespace
 }  // namespace hwy
 
diff --git a/hwy/detect_targets.h b/hwy/detect_targets.h
index 3338ace..ab74a5f 100644
--- a/hwy/detect_targets.h
+++ b/hwy/detect_targets.h
@@ -625,6 +625,10 @@
 #define HWY_ATTAINABLE_S390X 0
 #endif
 
+#if HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH
+#define HWY_ATTAINABLE_RISCV (HWY_RVV)
+#endif
+
 // Attainable means enabled and the compiler allows intrinsics (even when not
 // allowed to autovectorize). Used in 3 and 4.
 #if HWY_ARCH_X86
@@ -648,6 +652,9 @@
 #elif HWY_ARCH_S390X
 #define HWY_ATTAINABLE_TARGETS \
   HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_S390X)
+#elif HWY_ARCH_RISCV
+#define HWY_ATTAINABLE_TARGETS \
+  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_RISCV)
 #else
 #define HWY_ATTAINABLE_TARGETS (HWY_ENABLED_BASELINE)
 #endif  // HWY_ARCH_*
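
A hedged sketch of how this change can be observed (not from this patch): on RISC-V with runtime dispatch available, `HWY_RVV` can now appear in the supported-target set reported at runtime.

```cpp
#include <stdint.h>
#include <stdio.h>

#include "hwy/detect_targets.h"
#include "hwy/targets.h"

int main() {
  const int64_t targets = hwy::SupportedTargets();
  // HWY_RVV is a target bit; it is only set on RISC-V hardware whose RVV
  // support was detected at runtime.
  printf("RVV supported: %s\n", (targets & HWY_RVV) ? "yes" : "no");
  return 0;
}
```
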
diff --git a/hwy/ops/arm_neon-inl.h b/hwy/ops/arm_neon-inl.h
index 232a113..927a2ef 100644
--- a/hwy/ops/arm_neon-inl.h
+++ b/hwy/ops/arm_neon-inl.h
@@ -6908,6 +6908,18 @@
 
 #if HWY_NEON_HAVE_F32_TO_BF16C
 
+#ifdef HWY_NATIVE_MUL_EVEN_BF16
+#undef HWY_NATIVE_MUL_EVEN_BF16
+#else
+#define HWY_NATIVE_MUL_EVEN_BF16
+#endif
+
+#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#else
+#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#endif
+
 namespace detail {
 #if HWY_NEON_HAVE_BFLOAT16
 // If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
@@ -6937,6 +6949,20 @@
 }  // namespace detail
 
 template <class D, HWY_IF_V_SIZE_D(D, 16)>
+HWY_API Vec128<float> MulEvenAdd(D /*d32*/, Vec128<bfloat16_t> a,
+                                 Vec128<bfloat16_t> b, const Vec128<float> c) {
+  return Vec128<float>(vbfmlalbq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
+                                   detail::BitCastToRawNeonBF16(b.raw)));
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 16)>
+HWY_API Vec128<float> MulOddAdd(D /*d32*/, Vec128<bfloat16_t> a,
+                                 Vec128<bfloat16_t> b, const Vec128<float> c) {
+  return Vec128<float>(vbfmlaltq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
+                                   detail::BitCastToRawNeonBF16(b.raw)));
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 16)>
 HWY_API Vec128<float> ReorderWidenMulAccumulate(D /*d32*/, Vec128<bfloat16_t> a,
                                                 Vec128<bfloat16_t> b,
                                                 const Vec128<float> sum0,
@@ -6947,6 +6973,22 @@
 }
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> MulEvenAdd(
+    D /*d32*/, VFromD<Repartition<bfloat16_t, D>> a,
+    VFromD<Repartition<bfloat16_t, D>> b, const VFromD<D> c) {
+  return VFromD<D>(vbfmlalb_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
+                              detail::BitCastToRawNeonBF16(b.raw)));
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> MulOddAdd(
+    D /*d32*/, VFromD<Repartition<bfloat16_t, D>> a,
+    VFromD<Repartition<bfloat16_t, D>> b, const VFromD<D> c) {
+  return VFromD<D>(vbfmlalt_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
+                              detail::BitCastToRawNeonBF16(b.raw)));
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
 HWY_API VFromD<D> ReorderWidenMulAccumulate(
     D /*d32*/, VFromD<Repartition<bfloat16_t, D>> a,
     VFromD<Repartition<bfloat16_t, D>> b, const VFromD<D> sum0,
@@ -6955,19 +6997,6 @@
                               detail::BitCastToRawNeonBF16(b.raw)));
 }
 
-#else
-
-template <class DF, HWY_IF_F32_D(DF),
-          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
-HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
-                                             const VFromD<DF> sum0,
-                                             VFromD<DF>& sum1) {
-  // Lane order within sum0/1 is undefined, hence we can avoid the
-  // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
-  sum1 = MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), sum1);
-  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), sum0);
-}
-
 #endif  // HWY_NEON_HAVE_F32_TO_BF16C
 
 template <class D, HWY_IF_I32_D(D)>
diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h
index 36be563..7f4b79a 100644
--- a/hwy/ops/arm_sve-inl.h
+++ b/hwy/ops/arm_sve-inl.h
@@ -5800,23 +5800,32 @@
 
 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
 
-template <size_t N, int kPow2>
-HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df, VBF16 a,
-                                              VBF16 b, const svfloat32_t sum0,
-                                              svfloat32_t& sum1) {
 #if HWY_SVE_HAVE_BF16_FEATURE
-  (void)df;
-  sum1 = svbfmlalt_f32(sum1, a, b);
-  return svbfmlalb_f32(sum0, a, b);
+
+// NOTE: we currently do not use SVE BFDOT for bf16 ReorderWidenMulAccumulate
+// because, apparently unlike NEON, it uses round to odd unless the additional
+// FEAT_EBF16 feature is available and enabled.
+#ifdef HWY_NATIVE_MUL_EVEN_BF16
+#undef HWY_NATIVE_MUL_EVEN_BF16
 #else
-  // Lane order within sum0/1 is undefined, hence we can avoid the
-  // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
-  sum1 = MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), sum1);
-  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), sum0);
-#endif  // HWY_SVE_HAVE_BF16_FEATURE
+#define HWY_NATIVE_MUL_EVEN_BF16
+#endif
+
+template <size_t N, int kPow2>
+HWY_API svfloat32_t MulEvenAdd(Simd<float, N, kPow2> df, VBF16 a, VBF16 b,
+                               const svfloat32_t c) {
+  return svbfmlalb_f32(c, a, b);
 }
 
 template <size_t N, int kPow2>
+HWY_API svfloat32_t MulOddAdd(Simd<float, N, kPow2> df, VBF16 a, VBF16 b,
+                              const svfloat32_t c) {
+  return svbfmlalt_f32(c, a, b);
+}
+
+#endif  // HWY_SVE_HAVE_BF16_FEATURE
+
+template <size_t N, int kPow2>
 HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
                                             svint16_t a, svint16_t b,
                                             const svint32_t sum0,
diff --git a/hwy/ops/emu128-inl.h b/hwy/ops/emu128-inl.h
index 79216b4..c909e70 100644
--- a/hwy/ops/emu128-inl.h
+++ b/hwy/ops/emu128-inl.h
@@ -2811,16 +2811,6 @@
 
 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
 
-template <class DF, HWY_IF_F32_D(DF), class VBF>
-HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
-                                             const VFromD<DF> sum0,
-                                             VFromD<DF>& sum1) {
-  // Lane order within sum0/1 is undefined, hence we can avoid the
-  // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
-  sum1 = MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), sum1);
-  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), sum0);
-}
-
 template <class D, HWY_IF_UI32_D(D), class V16>
 HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, V16 a, V16 b,
                                             const VFromD<D> sum0,
diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h
index 890d6bb..3e57e3f 100644
--- a/hwy/ops/generic_ops-inl.h
+++ b/hwy/ops/generic_ops-inl.h
@@ -1,5 +1,6 @@
 // Copyright 2021 Google LLC
-// Copyright 2023,2024 Arm Limited and/or its affiliates <open-source-office@arm.com> 
+// Copyright 2023,2024 Arm Limited and/or
+// its affiliates <open-source-office@arm.com>
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-License-Identifier: BSD-3-Clause
 //
@@ -2802,9 +2803,23 @@
   static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
 
   VFromD<D> v = Zero(d);
-  for (size_t i = 0; i < MaxLanes(d); ++i) {
-    if (i < max_lanes_to_load)
-      v = InsertLane(v, i, base[ExtractLane(index, i)]);
+  for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) {
+    v = InsertLane(v, i, base[ExtractLane(index, i)]);
+  }
+  return v;
+}
+
+template <class D, typename T = TFromD<D>>
+HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
+                               VFromD<RebindToSigned<D>> index,
+                               const size_t max_lanes_to_load) {
+  const RebindToSigned<D> di;
+  using TI = TFromD<decltype(di)>;
+  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
+
+  VFromD<D> v = no;
+  for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) {
+    v = InsertLane(v, i, base[ExtractLane(index, i)]);
   }
   return v;
 }
@@ -2815,6 +2830,12 @@
                                const size_t max_lanes_to_load) {
   return MaskedGatherIndex(FirstN(d, max_lanes_to_load), d, base, index);
 }
+template <class D, typename T = TFromD<D>>
+HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
+                               VFromD<RebindToSigned<D>> index,
+                               const size_t max_lanes_to_load) {
+  return MaskedGatherIndexOr(no, FirstN(d, max_lanes_to_load), d, base, index);
+}
 #endif  // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
 
 // ------------------------------ Integer AbsDiff and SumsOf8AbsDiff
@@ -4924,6 +4945,56 @@
 
 #endif  // HWY_NATIVE_INT_DIV
 
+// ------------------------------ MulEvenAdd (PromoteEvenTo)
+
+// SVE with bf16 and NEON with bf16 override this.
+#if (defined(HWY_NATIVE_MUL_EVEN_BF16) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_MUL_EVEN_BF16
+#undef HWY_NATIVE_MUL_EVEN_BF16
+#else
+#define HWY_NATIVE_MUL_EVEN_BF16
+#endif
+
+template <class DF, HWY_IF_F32_D(DF),
+          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+HWY_API VFromD<DF> MulEvenAdd(DF df, VBF a, VBF b, VFromD<DF> c) {
+  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), c);
+}
+
+template <class DF, HWY_IF_F32_D(DF),
+          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+HWY_API VFromD<DF> MulOddAdd(DF df, VBF a, VBF b, VFromD<DF> c) {
+  return MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), c);
+}
+
+#endif  // HWY_NATIVE_MUL_EVEN_BF16
+
+// ------------------------------ ReorderWidenMulAccumulate (MulEvenAdd)
+
+// AVX3_SPR/ZEN4, and NEON with bf16 but not(!) SVE override this.
+#if (defined(HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16) == \
+     defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#else
+#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#endif
+
+template <class DF, HWY_IF_F32_D(DF),
+          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
+                                             VFromD<DF> sum0,
+                                             VFromD<DF>& sum1) {
+  // Lane order within sum0/1 is undefined, hence we can avoid the
+  // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
+  sum1 = MulOddAdd(df, a, b, sum1);
+  return MulEvenAdd(df, a, b, sum0);
+}
+
+#endif  // HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+
+// ------------------------------ WidenMulAccumulate
+
 #if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE) == defined(HWY_TARGET_TOGGLE))
 #ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
 #undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
@@ -4939,7 +5010,7 @@
   return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low);
 }
 
-#endif
+#endif  // HWY_NATIVE_WIDEN_MUL_ACCUMULATE
 
 #if 0
 #if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16) == defined(HWY_TARGET_TOGGLE))
@@ -4959,10 +5030,10 @@
   return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low);
 }
 
-#endif
+#endif  // HWY_HAVE_FLOAT16
 
-#endif
-#endif
+#endif  // HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
+#endif  // #if 0
 
 // ------------------------------ SatWidenMulPairwiseAdd
 
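
A small sketch of the new `GatherIndexNOr` (illustrative, not from this patch): lanes beyond `max_lanes_to_load` keep the `no` value. Data and function name are made up; static dispatch is assumed.

```cpp
#include <stdint.h>
#include <stdio.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;  // static dispatch

void GatherFirstThree() {
  const hn::CappedTag<float, 4> d;  // at most 4 lanes, for illustration
  const hn::RebindToSigned<decltype(d)> di;
  const float base[8] = {0.f, 10.f, 20.f, 30.f, 40.f, 50.f, 60.f, 70.f};
  const int32_t idx[4] = {7, 3, 1, 0};
  const auto sentinel = hn::Set(d, -1.0f);
  const auto v = hn::GatherIndexNOr(sentinel, d, base, hn::LoadU(di, idx),
                                    /*max_lanes_to_load=*/3);
  float out[4] = {-1.f, -1.f, -1.f, -1.f};
  hn::StoreU(v, d, out);
  // With 4 lanes: 70 30 10 -1 (the last lane keeps the sentinel).
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
}
```
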
diff --git a/hwy/ops/ppc_vsx-inl.h b/hwy/ops/ppc_vsx-inl.h
index 9360ee8..03a20ce 100644
--- a/hwy/ops/ppc_vsx-inl.h
+++ b/hwy/ops/ppc_vsx-inl.h
@@ -3458,17 +3458,6 @@
 
 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
 
-template <class DF, HWY_IF_F32_D(DF),
-          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
-HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
-                                             VFromD<DF> sum0,
-                                             VFromD<DF>& sum1) {
-  // Lane order within sum0/1 is undefined, hence we can avoid the
-  // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
-  sum1 = MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), sum1);
-  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), sum0);
-}
-
 // Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
 template <class D32, HWY_IF_UI32_D(D32),
           class V16 = VFromD<RepartitionToNarrow<D32>>>
diff --git a/hwy/ops/rvv-inl.h b/hwy/ops/rvv-inl.h
index 1700e67..63a6ec2 100644
--- a/hwy/ops/rvv-inl.h
+++ b/hwy/ops/rvv-inl.h
@@ -5573,19 +5573,6 @@
 
 namespace detail {
 
-// Non-overloaded wrapper function so we can define DF in template args.
-template <size_t N, int kPow2, class DF = Simd<float, N, kPow2>,
-          class VF = VFromD<DF>,
-          class DBF16 = Repartition<hwy::bfloat16_t, Simd<float, N, kPow2>>>
-HWY_API VF ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df,
-                                         VFromD<DBF16> a, VFromD<DBF16> b,
-                                         const VF sum0, VF& sum1) {
-  // Lane order within sum0/1 is undefined, hence we can avoid the
-  // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
-  sum1 = MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), sum1);
-  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), sum0);
-}
-
 #define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,    \
                            SHIFT, MLEN, NAME, OP)                              \
   template <size_t N>                                                          \
@@ -5662,12 +5649,6 @@
 }  // namespace detail
 
 template <size_t N, int kPow2, class VN, class VW>
-HWY_API VW ReorderWidenMulAccumulate(Simd<float, N, kPow2> d32, VN a, VN b,
-                                     const VW sum0, VW& sum1) {
-  return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1);
-}
-
-template <size_t N, int kPow2, class VN, class VW>
 HWY_API VW ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32, VN a, VN b,
                                      const VW sum0, VW& sum1) {
   return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1);
diff --git a/hwy/ops/scalar-inl.h b/hwy/ops/scalar-inl.h
index b156458..4e2c2ed 100644
--- a/hwy/ops/scalar-inl.h
+++ b/hwy/ops/scalar-inl.h
@@ -2067,6 +2067,12 @@
 
 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
 
+#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#else
+#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#endif
+
 template <class D32, HWY_IF_F32_D(D32)>
 HWY_API Vec1<float> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<bfloat16_t> a,
                                               Vec1<bfloat16_t> b,
diff --git a/hwy/ops/wasm_128-inl.h b/hwy/ops/wasm_128-inl.h
index 5df6f7a..0185e3d 100644
--- a/hwy/ops/wasm_128-inl.h
+++ b/hwy/ops/wasm_128-inl.h
@@ -5834,17 +5834,6 @@
 
 // ------------------------------ ReorderWidenMulAccumulate
 
-template <class DF, HWY_IF_F32_D(DF),
-          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
-HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
-                                             const VFromD<DF> sum0,
-                                             VFromD<DF>& sum1) {
-  // Lane order within sum0/1 is undefined, hence we can avoid the
-  // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
-  sum1 = MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), sum1);
-  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), sum0);
-}
-
 template <class D32, HWY_IF_UI32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
           class V16 = VFromD<RepartitionToNarrow<D32>>>
 HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d32, V16 a, V16 b,
diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h
index 9140749..aa6a43e 100644
--- a/hwy/ops/x86_128-inl.h
+++ b/hwy/ops/x86_128-inl.h
@@ -5990,20 +5990,14 @@
 }  // namespace detail
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
-HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
+HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
                                VFromD<RebindToSigned<D>> offsets) {
-  const RebindToSigned<decltype(d)> di;
-  (void)di;  // for HWY_DASSERT
-  HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
   return detail::NativeGather128<1>(base, offsets);
 }
 
 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>>
-HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
+HWY_API VFromD<D> GatherIndex(D /*d*/, const T* HWY_RESTRICT base,
                               VFromD<RebindToSigned<D>> indices) {
-  const RebindToSigned<decltype(d)> di;
-  (void)di;  // for HWY_DASSERT
-  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
   return detail::NativeGather128<sizeof(T)>(base, indices);
 }
 
@@ -6014,9 +6008,6 @@
   // For partial vectors, ensure upper mask lanes are zero to prevent faults.
   if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
 
-  const RebindToSigned<decltype(d)> di;
-  (void)di;  // for HWY_DASSERT
-  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
   return detail::NativeMaskedGatherOr128<sizeof(T)>(no, m, base, indices);
 }
 
@@ -9537,25 +9528,20 @@
 // ------------------------------ ReorderWidenMulAccumulate (PromoteEvenTo)
 
 #if HWY_NATIVE_DOT_BF16
+
+#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#else
+#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#endif
+
 template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_LE_D(DF, 16),
           class VBF = VFromD<Repartition<bfloat16_t, DF>>>
 HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b,
                                              const VFromD<DF> sum0,
                                              VFromD<DF>& /*sum1*/) {
-  return VFromD<DF>{_mm_dpbf16_ps(sum0.raw, reinterpret_cast<__m128bh>(a.raw), reinterpret_cast<__m128bh>(b.raw))};
-}
-#else
-
-// Generic for all vector lengths.
-template <class DF, HWY_IF_F32_D(DF),
-          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
-HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
-                                             const VFromD<DF> sum0,
-                                             VFromD<DF>& sum1) {
-  // Lane order within sum0/1 is undefined, hence we can avoid the
-  // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
-  sum1 = MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), sum1);
-  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), sum0);
+  return VFromD<DF>{_mm_dpbf16_ps(sum0.raw, reinterpret_cast<__m128bh>(a.raw),
+                                  reinterpret_cast<__m128bh>(b.raw))};
 }
 
 #endif  // HWY_NATIVE_DOT_BF16
diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h
index c99d7c9..e890408 100644
--- a/hwy/ops/x86_256-inl.h
+++ b/hwy/ops/x86_256-inl.h
@@ -3822,20 +3822,14 @@
 }  // namespace detail
 
 template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
+HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
                                VFromD<RebindToSigned<D>> offsets) {
-  const RebindToSigned<decltype(d)> di;
-  (void)di;  // for HWY_DASSERT
-  HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
   return detail::NativeGather256<1>(base, offsets);
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
+HWY_API VFromD<D> GatherIndex(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
                               VFromD<RebindToSigned<D>> indices) {
-  const RebindToSigned<decltype(d)> di;
-  (void)di;  // for HWY_DASSERT
-  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
   return detail::NativeGather256<sizeof(TFromD<D>)>(base, indices);
 }
 
@@ -3908,12 +3902,9 @@
 }  // namespace detail
 
 template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
+HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D /*d*/,
                                       const TFromD<D>* HWY_RESTRICT base,
                                       VFromD<RebindToSigned<D>> indices) {
-  const RebindToSigned<decltype(d)> di;
-  (void)di;  // for HWY_DASSERT
-  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
   return detail::NativeMaskedGatherOr256<sizeof(TFromD<D>)>(no, m, base,
                                                             indices);
 }
diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h
index b212ce9..7aede9d 100644
--- a/hwy/ops/x86_512-inl.h
+++ b/hwy/ops/x86_512-inl.h
@@ -3484,30 +3484,21 @@
 }  // namespace detail
 
 template <class D, HWY_IF_V_SIZE_D(D, 64)>
-HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
+HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
                                VFromD<RebindToSigned<D>> offsets) {
-  const RebindToSigned<decltype(d)> di;
-  (void)di;  // for HWY_DASSERT
-  HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
   return detail::NativeGather512<1>(base, offsets);
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 64)>
-HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
+HWY_API VFromD<D> GatherIndex(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
                               VFromD<RebindToSigned<D>> indices) {
-  const RebindToSigned<decltype(d)> di;
-  (void)di;  // for HWY_DASSERT
-  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
   return detail::NativeGather512<sizeof(TFromD<D>)>(base, indices);
 }
 
 template <class D, HWY_IF_V_SIZE_D(D, 64)>
-HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
+HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D /*d*/,
                                       const TFromD<D>* HWY_RESTRICT base,
                                       VFromD<RebindToSigned<D>> indices) {
-  const RebindToSigned<decltype(d)> di;
-  (void)di;  // for HWY_DASSERT
-  HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
   return detail::NativeMaskedGatherOr512<sizeof(TFromD<D>)>(no, m, base,
                                                             indices);
 }
diff --git a/hwy/tests/widen_mul_test.cc b/hwy/tests/widen_mul_test.cc
index 1ebeafb..d6e04c3 100644
--- a/hwy/tests/widen_mul_test.cc
+++ b/hwy/tests/widen_mul_test.cc
@@ -20,6 +20,7 @@
 #define HWY_TARGET_INCLUDE "tests/widen_mul_test.cc"
 #include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
+#include "hwy/nanobenchmark.h"  // Unpredictable1
 #include "hwy/tests/test_util-inl.h"
 
 HWY_BEFORE_NAMESPACE();
@@ -49,6 +50,7 @@
 
     // delta[p] := p all others zero.
     auto delta_w = AllocateAligned<TW>(NN);
+    HWY_ASSERT(delta_w);
     for (size_t p = 0; p < NN; ++p) {
       // Workaround for incorrect Clang wasm codegen: re-initialize the entire
       // array rather than zero-initialize once and then set lane p to p.
@@ -418,16 +420,86 @@
 }
 
 #ifndef HWY_NATIVE_DOT_BF16
-#error "Update set_macros-inl to set this required macro"
+#error "Update set_macros-inl.h to set this required macro"
 #endif
 
+struct TestMulEvenAdd {
+  // Must be inlined on aarch64 for bf16, else clang crashes.
+  template <typename TN, class DN>
+  HWY_INLINE void operator()(TN /*unused*/, DN dn) {
+    using TW = MakeWide<TN>;
+    const RepartitionToWide<DN> dw;
+    using VW = Vec<decltype(dw)>;
+    using VN = Vec<decltype(dn)>;
+    const size_t NN = Lanes(dn);
+
+    const VW f0 = Zero(dw);
+    const VW f1 = Set(dw, TW{1});
+    const VN bf0 = Zero(dn);
+    // Cannot Set() bfloat16_t directly.
+    const VN bf1 = ReorderDemote2To(dn, f1, f1);
+
+    // Any input zero => both outputs zero
+    HWY_ASSERT_VEC_EQ(dw, f0, MulEvenAdd(dw, bf0, bf0, f0));
+    HWY_ASSERT_VEC_EQ(dw, f0, MulEvenAdd(dw, bf0, bf1, f0));
+    HWY_ASSERT_VEC_EQ(dw, f0, MulEvenAdd(dw, bf1, bf0, f0));
+    HWY_ASSERT_VEC_EQ(dw, f0, MulOddAdd(dw, bf0, bf0, f0));
+    HWY_ASSERT_VEC_EQ(dw, f0, MulOddAdd(dw, bf0, bf1, f0));
+    HWY_ASSERT_VEC_EQ(dw, f0, MulOddAdd(dw, bf1, bf0, f0));
+
+    // delta[p] := 1, all others zero. For each p: Mul(delta, 1, 0) == 1.
+    auto delta_w = AllocateAligned<TW>(NN);
+    HWY_ASSERT(delta_w);
+    for (size_t p = 0; p < NN; ++p) {
+      // Workaround for incorrect Clang wasm codegen: re-initialize the entire
+      // array rather than zero-initialize once and then toggle lane p.
+      for (size_t i = 0; i < NN; ++i) {
+        delta_w[i] = static_cast<TW>(i == p ? Unpredictable1() : 0);
+      }
+      const VW delta0 = Load(dw, delta_w.get());
+      const VW delta1 = Load(dw, delta_w.get() + NN / 2);
+      const VN delta = OrderedDemote2To(dn, delta0, delta1);
+
+      {
+        const VW sum_e = MulEvenAdd(dw, delta, bf1, f0);
+        const VW sum_o = MulOddAdd(dw, delta, bf1, f0);
+        HWY_ASSERT_VEC_EQ(dw, p & 1 ? f0 : f1, SumOfLanes(dw, sum_e));
+        HWY_ASSERT_VEC_EQ(dw, p & 1 ? f1 : f0, SumOfLanes(dw, sum_o));
+      }
+      // Swapped arg order
+      {
+        const VW sum_e = MulEvenAdd(dw, bf1, delta, f0);
+        const VW sum_o = MulOddAdd(dw, bf1, delta, f0);
+        HWY_ASSERT_VEC_EQ(dw, p & 1 ? f0 : f1, SumOfLanes(dw, sum_e));
+        HWY_ASSERT_VEC_EQ(dw, p & 1 ? f1 : f0, SumOfLanes(dw, sum_o));
+      }
+      // Start with nonzero sum
+      {
+        const VW sum_e = MulEvenAdd(dw, delta, bf1, Add(delta0, delta1));
+        const VW sum_o = MulOddAdd(dw, delta, bf1, Add(delta0, delta1));
+        HWY_ASSERT_EQ(TW{3}, ReduceSum(dw, Add(sum_e, sum_o)));
+      }
+
+      // Start with nonzero sum and swap arg order
+      {
+        const VW sum_e = MulEvenAdd(dw, bf1, delta, Add(delta0, delta1));
+        const VW sum_o = MulOddAdd(dw, bf1, delta, Add(delta0, delta1));
+        HWY_ASSERT_EQ(TW{3}, ReduceSum(dw, Add(sum_e, sum_o)));
+      }
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllMulEvenAdd() {
+  ForShrinkableVectors<TestMulEvenAdd>()(bfloat16_t());
+}
+
 struct TestReorderWidenMulAccumulate {
   // Must be inlined on aarch64 for bf16, else clang crashes.
   template <typename TN, class DN>
   HWY_INLINE void operator()(TN /*unused*/, DN dn) {
     using TW = MakeWide<TN>;
     const RepartitionToWide<DN> dw;
-    const Half<DN> dnh;
     using VW = Vec<decltype(dw)>;
     using VN = Vec<decltype(dn)>;
     const size_t NN = Lanes(dn);
@@ -452,6 +524,7 @@
 
     // delta[p] := 1, all others zero. For each p: Dot(delta, all-ones) == 1.
     auto delta_w = AllocateAligned<TW>(NN);
+    HWY_ASSERT(delta_w);
     for (size_t p = 0; p < NN; ++p) {
       // Workaround for incorrect Clang wasm codegen: re-initialize the entire
       // array rather than zero-initialize once and then toggle lane p.
@@ -475,15 +548,15 @@
       }
       // Start with nonzero sum0 or sum1
       {
-        VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
-        sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
+        VW sum0 = delta0;
+        sum1 = delta1;
         sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, sum0, sum1);
         HWY_ASSERT_EQ(TW{2}, ReduceSum(dw, Add(sum0, sum1)));
       }
       // Start with nonzero sum0 or sum1, and swap arg order
       {
-        VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
-        sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
+        VW sum0 = delta0;
+        sum1 = delta1;
         sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, sum0, sum1);
         HWY_ASSERT_EQ(TW{2}, ReduceSum(dw, Add(sum0, sum1)));
       }
@@ -524,6 +597,7 @@
 
     // delta[p] := 1, all others zero.
     auto delta_w = AllocateAligned<TW>(NN);
+    HWY_ASSERT(delta_w);
     for (size_t p = 0; p < NN; ++p) {
       // Workaround for incorrect Clang wasm codegen: re-initialize the entire
       // array rather than zero-initialize once and then toggle lane p.
@@ -619,6 +693,7 @@
     const size_t NW = Lanes(dw);
 
     const auto expected = AllocateAligned<TW>(NW);
+    HWY_ASSERT(expected);
     for (size_t iw = 0; iw < NW; ++iw) {
       const size_t in = iw * 2;  // even, odd is +1
       const size_t a0 = 1 + in;
@@ -757,6 +832,7 @@
 HWY_EXPORT_AND_TEST_P(HwyWidenMulTest, TestAllSatWidenMulPairwiseAdd);
 HWY_EXPORT_AND_TEST_P(HwyWidenMulTest, TestAllSatWidenMulPairwiseAccumulate);
 HWY_EXPORT_AND_TEST_P(HwyWidenMulTest, TestAllSatWidenMulAccumFixedPoint);
+HWY_EXPORT_AND_TEST_P(HwyWidenMulTest, TestAllMulEvenAdd);
 HWY_EXPORT_AND_TEST_P(HwyWidenMulTest, TestAllReorderWidenMulAccumulate);
 HWY_EXPORT_AND_TEST_P(HwyWidenMulTest, TestAllRearrangeToOddPlusEven);
 HWY_EXPORT_AND_TEST_P(HwyWidenMulTest, TestAllSumOfMulQuadAccumulate);