Merge pull request #2128 from johnplatts:hwy_rvv_detect_fix_043024_2
PiperOrigin-RevId: 631058462
diff --git a/BUILD b/BUILD
index 8ff5f1e..5fda7b3 100644
--- a/BUILD
+++ b/BUILD
@@ -559,6 +559,11 @@
subdir + test + ".cc",
],
copts = COPTS + HWY_TEST_COPTS,
+ # Fixes OOM for matvec_test on RVV.
+ exec_properties = select({
+ "@platforms//cpu:riscv64": {"mem": "16g"},
+ "//conditions:default": None,
+ }),
features = select({
"@platforms//cpu:riscv64": ["fully_static_link"],
"//conditions:default": [],
@@ -579,6 +584,7 @@
"//conditions:default": False,
}),
local_defines = ["HWY_IS_TEST"],
+ # Placeholder for malloc, do not remove
# for test_suite.
tags = ["hwy_ops_test"],
deps = HWY_TEST_DEPS + select({
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c8c3fe2..3f1ced8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,6 +79,9 @@
# arise due to compiler/platform changes. Enable this in CI/tests.
set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")
+# Experimental support for header-only builds
+set(HWY_CMAKE_HEADER_ONLY OFF CACHE BOOL "Change to header-only?")
+
set(HWY_ENABLE_CONTRIB ON CACHE BOOL "Include contrib/")
set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")
@@ -137,9 +140,7 @@
endif() # HWY_ENABLE_CONTRIB
set(HWY_SOURCES
- hwy/abort.cc
hwy/abort.h
- hwy/aligned_allocator.cc
hwy/aligned_allocator.h
hwy/base.h
hwy/cache_control.h
@@ -148,7 +149,6 @@
hwy/foreach_target.h
hwy/highway_export.h
hwy/highway.h
- hwy/nanobenchmark.cc
hwy/nanobenchmark.h
hwy/ops/arm_neon-inl.h
hwy/ops/arm_sve-inl.h
@@ -164,20 +164,28 @@
hwy/ops/x86_128-inl.h
hwy/ops/x86_256-inl.h
hwy/ops/x86_512-inl.h
- hwy/per_target.cc
hwy/per_target.h
hwy/print-inl.h
- hwy/print.cc
hwy/print.h
hwy/profiler.h
hwy/robust_statistics.h
- hwy/targets.cc
hwy/targets.h
hwy/timer-inl.h
- hwy/timer.cc
hwy/timer.h
)
+if (NOT HWY_CMAKE_HEADER_ONLY)
+ list(APPEND HWY_SOURCES
+ hwy/abort.cc
+ hwy/aligned_allocator.cc
+ hwy/nanobenchmark.cc
+ hwy/per_target.cc
+ hwy/print.cc
+ hwy/targets.cc
+ hwy/timer.cc
+ )
+endif()
+
set(HWY_TEST_SOURCES
hwy/tests/hwy_gtest.h
hwy/tests/test_util-inl.h
@@ -367,6 +375,10 @@
endif() # !MSVC
+if (HWY_CMAKE_HEADER_ONLY)
+ list(APPEND HWY_FLAGS -DHWY_HEADER_ONLY)
+endif()
+
include(CheckIncludeFile)
check_include_file(sys/auxv.h HAVE_SYS_AUXV_H)
check_include_file(asm/hwcap.h HAVE_ASM_HWCAP_H)
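
The new HWY_CMAKE_HEADER_ONLY option only adds -DHWY_HEADER_ONLY and drops the
.cc files from HWY_SOURCES; below is a minimal consumer sketch, assuming that
define is all a translation unit needs (file and function names are
illustrative, not part of this patch):

// example.cc -- hypothetical consumer of the experimental header-only mode.
// Assumes -DHWY_HEADER_ONLY (as appended to HWY_FLAGS above) and that the
// definitions from the skipped .cc files are then provided by the headers.
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"

int main() {
  // AllocateAligned is normally defined in aligned_allocator.cc, which the
  // header-only branch no longer compiles.
  auto mem = hwy::AllocateAligned<float>(64);
  return mem ? 0 : 1;
}
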
diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md
index dd37635..94160fc 100644
--- a/g3doc/quick_reference.md
+++ b/g3doc/quick_reference.md
@@ -808,14 +808,23 @@
* <code>V **NegMulSub**(V a, V b, V c)</code>: returns `-a[i] * b[i] - c[i]`.
-* <code>V **MulAddSub**(V a, V b, V c)</code>: returns `a[i] * b[i] - c[i]`
- in the even lanes and `a[i] * b[i] + c[i]` in the odd lanes.
+* <code>V **MulAddSub**(V a, V b, V c)</code>: returns `a[i] * b[i] - c[i]` in
+ the even lanes and `a[i] * b[i] + c[i]` in the odd lanes.
- `MulAddSub(a, b, c)` is equivalent to
- `OddEven(MulAdd(a, b, c), MulSub(a, b, c))` or
- `MulAddSub(a, b, OddEven(c, Neg(c))`, but `MulSub(a, b, c)` is more
+ `MulAddSub(a, b, c)` is equivalent to `OddEven(MulAdd(a, b, c), MulSub(a, b,
+  c))` or `MulAddSub(a, b, OddEven(c, Neg(c)))`, but `MulAddSub(a, b, c)` is more
efficient on some targets (including AVX2/AVX3).
+* `V`: `bf16`, `D`: `RepartitionToWide<DFromV<V>>`, `VW`: `Vec<D>` \
+ <code>VW **MulEvenAdd**(D d, V a, V b, VW c)</code>: equivalent to and
+ potentially more efficient than `MulAdd(PromoteEvenTo(d, a),
+ PromoteEvenTo(d, b), c)`.
+
+* `V`: `bf16`, `D`: `RepartitionToWide<DFromV<V>>`, `VW`: `Vec<D>` \
+ <code>VW **MulOddAdd**(D d, V a, V b, VW c)</code>: equivalent to and
+ potentially more efficient than `MulAdd(PromoteOddTo(d, a), PromoteOddTo(d,
+ b), c)`.
+
#### Masked arithmetic
All ops in this section return `no` for `mask=false` lanes, and suppress any
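
A minimal usage sketch for the newly documented MulEvenAdd/MulOddAdd ops,
following the stated types (`V`: `bf16`, `D`: `RepartitionToWide<DFromV<V>>`,
`VW`: `Vec<D>`); the per-target boilerplate (foreach_target.h,
HWY_BEFORE_NAMESPACE) is omitted and the function name is illustrative:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Accumulates a[i] * b[i] over bf16 inputs into f32 partial sums, then
// reduces. Even/odd lanes go to separate accumulators, as documented above.
// Assumes num is a multiple of the vector length (illustrative only).
HWY_ATTR float DotBF16(const hwy::bfloat16_t* HWY_RESTRICT a,
                       const hwy::bfloat16_t* HWY_RESTRICT b, size_t num) {
  const hn::ScalableTag<hwy::bfloat16_t> dbf;
  const hn::RepartitionToWide<decltype(dbf)> df;  // f32, half as many lanes
  auto sum_even = hn::Zero(df);
  auto sum_odd = hn::Zero(df);
  for (size_t i = 0; i < num; i += hn::Lanes(dbf)) {
    const auto va = hn::Load(dbf, a + i);
    const auto vb = hn::Load(dbf, b + i);
    sum_even = hn::MulEvenAdd(df, va, vb, sum_even);
    sum_odd = hn::MulOddAdd(df, va, vb, sum_odd);
  }
  return hn::ReduceSum(df, hn::Add(sum_even, sum_odd));
}
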
diff --git a/hwy/contrib/matvec/matvec_test.cc b/hwy/contrib/matvec/matvec_test.cc
index 51e6151..8b87634 100644
--- a/hwy/contrib/matvec/matvec_test.cc
+++ b/hwy/contrib/matvec/matvec_test.cc
@@ -206,6 +206,8 @@
ThreadPool pool(HWY_MIN(num_threads, ThreadPool::MaxThreads()));
Test<AdjustedReps(192), AdjustedReps(256)>(d, pool);
+// Fewer tests due to compiler OOM
+#if HWY_TARGET != HWY_RVV
Test<40, AdjustedReps(512)>(d, pool);
Test<AdjustedReps(1024), 50>(d, pool);
@@ -213,13 +215,17 @@
if (sizeof(TFromD<D>) != 2 && sizeof(VecT) != 2) {
Test<AdjustedReps(1536), AdjustedReps(1536)>(d, pool);
}
+#endif // HWY_TARGET != HWY_RVV
}
public:
template <class T, class D>
HWY_INLINE void operator()(T /*unused*/, D d) {
CreatePoolAndTest(d, 13);
+// Fewer tests due to compiler OOM
+#if HWY_TARGET != HWY_RVV
CreatePoolAndTest(d, 16);
+#endif
}
};
diff --git a/hwy/contrib/sort/vqsort-inl.h b/hwy/contrib/sort/vqsort-inl.h
index c5ccba7..523a215 100644
--- a/hwy/contrib/sort/vqsort-inl.h
+++ b/hwy/contrib/sort/vqsort-inl.h
@@ -1779,7 +1779,7 @@
MaybePrintVector(d, "pivot", pivot, 0, st.LanesPerKey());
MaybePrintVector(d, "second", second, 0, st.LanesPerKey());
- Vec<D> third;
+ Vec<D> third = Zero(d);
// Not supported for key-value types because two 'keys' may be equivalent
// but not interchangeable (their values may differ).
if (HWY_UNLIKELY(!st.IsKV() &&
diff --git a/hwy/contrib/thread_pool/topology.cc b/hwy/contrib/thread_pool/topology.cc
index 41c13f8..a24b8f6 100644
--- a/hwy/contrib/thread_pool/topology.cc
+++ b/hwy/contrib/thread_pool/topology.cc
@@ -15,6 +15,14 @@
#include "hwy/contrib/thread_pool/topology.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <map>
+#include <thread> // NOLINT
+#include <vector>
+
#include "hwy/detect_compiler_arch.h" // HWY_OS_WIN
#if HWY_OS_WIN
@@ -46,10 +54,11 @@
#include <emscripten/threading.h>
#endif
-#include <stddef.h>
-#include <stdio.h>
-
-#include <thread> // NOLINT
+#if HWY_OS_LINUX
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#endif // HWY_OS_LINUX
#include "hwy/base.h"
@@ -64,26 +73,38 @@
}
HWY_DLLEXPORT size_t TotalLogicalProcessors() {
+ size_t lp = 0;
#if HWY_ARCH_WASM
- const int lp = emscripten_num_logical_cores();
- HWY_ASSERT(lp < static_cast<int>(kMaxLogicalProcessors));
- if (lp > 0) return static_cast<size_t>(lp);
+ const int num_cores = emscripten_num_logical_cores();
+ if (num_cores > 0) lp = static_cast<size_t>(num_cores);
#else
- const unsigned lp = std::thread::hardware_concurrency();
- HWY_ASSERT(lp < static_cast<unsigned>(kMaxLogicalProcessors));
- if (lp != 0) return static_cast<size_t>(lp);
+ const unsigned concurrency = std::thread::hardware_concurrency();
+ if (concurrency != 0) lp = static_cast<size_t>(concurrency);
#endif
- // WASM or C++ stdlib failed.
- if (HWY_IS_DEBUG_BUILD) {
- fprintf(
- stderr,
- "Unknown TotalLogicalProcessors. HWY_OS_: WIN=%d LINUX=%d APPLE=%d;\n"
- "HWY_ARCH_: WASM=%d X86=%d PPC=%d ARM=%d RISCV=%d S390X=%d\n",
- HWY_OS_WIN, HWY_OS_LINUX, HWY_OS_APPLE, HWY_ARCH_WASM, HWY_ARCH_X86,
- HWY_ARCH_PPC, HWY_ARCH_ARM, HWY_ARCH_RISCV, HWY_ARCH_S390X);
+ // WASM or C++ stdlib failed to detect #CPUs.
+ if (lp == 0) {
+ if (HWY_IS_DEBUG_BUILD) {
+ fprintf(
+ stderr,
+ "Unknown TotalLogicalProcessors. HWY_OS_: WIN=%d LINUX=%d APPLE=%d;\n"
+ "HWY_ARCH_: WASM=%d X86=%d PPC=%d ARM=%d RISCV=%d S390X=%d\n",
+ HWY_OS_WIN, HWY_OS_LINUX, HWY_OS_APPLE, HWY_ARCH_WASM, HWY_ARCH_X86,
+ HWY_ARCH_PPC, HWY_ARCH_ARM, HWY_ARCH_RISCV, HWY_ARCH_S390X);
+ }
+ return 1;
}
- return 1;
+
+ // Warn that we are clamping.
+ if (lp > kMaxLogicalProcessors) {
+ if (HWY_IS_DEBUG_BUILD) {
+ fprintf(stderr, "OS reports %zu processors but clamping to %zu\n", lp,
+ kMaxLogicalProcessors);
+ }
+ lp = kMaxLogicalProcessors;
+ }
+
+ return lp;
}
#ifdef __ANDROID__
@@ -111,9 +132,17 @@
#endif // __ANDROID__
if (err != 0) return false;
for (size_t lp = 0; lp < kMaxLogicalProcessors; ++lp) {
+#if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for GCC compiler warning with CPU_ISSET macro
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
+#endif
if (CPU_ISSET(static_cast<int>(lp), &set)) {
lps.Set(lp);
}
+#if HWY_COMPILER_GCC_ACTUAL
+ HWY_DIAGNOSTICS(pop)
+#endif
}
return true;
#elif HWY_OS_FREEBSD
@@ -124,9 +153,17 @@
sizeof(cpuset_t), &set);
if (err != 0) return false;
for (size_t lp = 0; lp < kMaxLogicalProcessors; ++lp) {
+#if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for GCC compiler warning with CPU_ISSET macro
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
+#endif
if (CPU_ISSET(static_cast<int>(lp), &set)) {
lps.Set(lp);
}
+#if HWY_COMPILER_GCC_ACTUAL
+ HWY_DIAGNOSTICS(pop)
+#endif
}
return true;
#else
@@ -144,7 +181,15 @@
#elif HWY_OS_LINUX
cpu_set_t set;
CPU_ZERO(&set);
+#if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for GCC compiler warning with CPU_SET macro
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
+#endif
lps.Foreach([&set](size_t lp) { CPU_SET(static_cast<int>(lp), &set); });
+#if HWY_COMPILER_GCC_ACTUAL
+ HWY_DIAGNOSTICS(pop)
+#endif
const pid_t pid = 0; // current thread
#ifdef __ANDROID__
const int err = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &set);
@@ -156,7 +201,15 @@
#elif HWY_OS_FREEBSD
cpuset_t set;
CPU_ZERO(&set);
+#if HWY_COMPILER_GCC_ACTUAL
+ // Workaround for GCC compiler warning with CPU_SET macro
+ HWY_DIAGNOSTICS(push)
+ HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion")
+#endif
lps.Foreach([&set](size_t lp) { CPU_SET(static_cast<int>(lp), &set); });
+#if HWY_COMPILER_GCC_ACTUAL
+ HWY_DIAGNOSTICS(pop)
+#endif
const pid_t pid = getpid(); // current thread
const int err = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid,
sizeof(cpuset_t), &set);
@@ -169,4 +222,224 @@
#endif
}
+#if HWY_OS_LINUX
+
+class File {
+ public:
+ explicit File(const char* path) {
+ for (;;) {
+ fd_ = open(path, O_RDONLY);
+ if (fd_ > 0) return; // success
+ if (errno == EINTR) continue; // signal: retry
+ if (errno == ENOENT) return; // not found, give up
+ if (HWY_IS_DEBUG_BUILD) {
+ fprintf(stderr, "Unexpected error opening %s: %d\n", path, errno);
+ }
+ return; // unknown error, give up
+ }
+ }
+
+ ~File() {
+ if (fd_ > 0) {
+ for (;;) {
+ const int ret = close(fd_);
+ if (ret == 0) break; // success
+ if (errno == EINTR) continue; // signal: retry
+ if (HWY_IS_DEBUG_BUILD) {
+ fprintf(stderr, "Unexpected error closing file: %d\n", errno);
+ }
+ return; // unknown error, ignore
+ }
+ }
+ }
+
+ // Returns number of bytes read or 0 on failure.
+ size_t Read(char* buf200) const {
+ if (fd_ < 0) return 0;
+ size_t pos = 0;
+ for (;;) {
+ // read instead of pread, which might not work for sysfs.
+ const auto bytes_read = read(fd_, buf200 + pos, 200 - pos);
+ if (bytes_read == 0) { // EOF: done
+ buf200[pos++] = '\0';
+ return pos;
+ }
+ if (bytes_read == -1) {
+ if (errno == EINTR) continue; // signal: retry
+ if (HWY_IS_DEBUG_BUILD) {
+ fprintf(stderr, "Unexpected error reading file: %d\n", errno);
+ }
+ return 0;
+ }
+ pos += static_cast<size_t>(bytes_read);
+ HWY_ASSERT(pos <= 200);
+ }
+ }
+
+ private:
+ int fd_;
+};
+
+bool SimpleAtoi(const char* str, size_t len, size_t* out) {
+ size_t value = 0;
+ size_t digits_seen = 0;
+ // 9 digits cannot overflow even 32-bit size_t.
+ for (size_t i = 0; i < HWY_MIN(len, 9); ++i) {
+ if (str[i] < '0' || str[i] > '9') break;
+ value *= 10;
+ value += static_cast<size_t>(str[i] - '0');
+ ++digits_seen;
+ }
+ if (digits_seen == 0) {
+ *out = 0;
+ return false;
+ }
+ *out = value;
+ return true;
+}
+
+bool ReadSysfs(const char* format, size_t lp, size_t max, size_t* out) {
+ char path[200];
+ const int bytes_written = snprintf(path, sizeof(path), format, lp);
+ HWY_ASSERT(0 < bytes_written &&
+ bytes_written < static_cast<int>(sizeof(path) - 1));
+
+ const File file(path);
+ char buf200[200];
+ const size_t pos = file.Read(buf200);
+ if (pos == 0) return false;
+
+ if (!SimpleAtoi(buf200, pos, out)) return false;
+ if (*out > max) {
+ if (HWY_IS_DEBUG_BUILD) {
+ fprintf(stderr, "Value for %s = %zu > %zu\n", path, *out, max);
+ }
+ return false;
+ }
+ return true;
+}
+
+// Given a set of opaque values, returns a map of each value to the number of
+// values that precede it. Used to generate contiguous arrays in the same order.
+std::map<size_t, size_t> RanksFromSet(const BitSet4096<>& set) {
+ HWY_ASSERT(set.Count() != 0);
+ std::map<size_t, size_t> ranks;
+ size_t num = 0;
+ set.Foreach([&ranks, &num](size_t opaque) {
+ const bool inserted = ranks.insert({opaque, num++}).second;
+ HWY_ASSERT(inserted);
+ });
+ HWY_ASSERT(num == set.Count());
+ HWY_ASSERT(num == ranks.size());
+ return ranks;
+}
+
+const char* kPackage =
+ "/sys/devices/system/cpu/cpu%zu/topology/physical_package_id";
+const char* kCluster = "/sys/devices/system/cpu/cpu%zu/cache/index3/id";
+const char* kCore = "/sys/devices/system/cpu/cpu%zu/topology/core_id";
+
+// Returns 0 on error, or number of packages and initializes LP::package.
+size_t DetectPackages(std::vector<Topology::LP>& lps) {
+ // Packages are typically an index, but could actually be an arbitrary value.
+ // We assume they do not exceed kMaxLogicalProcessors and thus fit in a set.
+ BitSet4096<> package_set;
+ for (size_t lp = 0; lp < lps.size(); ++lp) {
+ size_t package;
+ if (!ReadSysfs(kPackage, lp, kMaxLogicalProcessors, &package)) return 0;
+ package_set.Set(package);
+ // Storing a set of lps belonging to each package requires more space/time
+ // than storing the package per lp and remapping below.
+ lps[lp].package = static_cast<uint16_t>(package);
+ }
+ HWY_ASSERT(package_set.Count() != 0);
+
+ // Remap the per-lp package to their rank.
+ std::map<size_t, size_t> ranks = RanksFromSet(package_set);
+ for (size_t lp = 0; lp < lps.size(); ++lp) {
+ lps[lp].package = static_cast<uint16_t>(ranks[lps[lp].package]);
+ }
+ return ranks.size();
+}
+
+// Stores the global cluster/core values separately for each package so we can
+// return per-package arrays.
+struct PerPackage {
+ // Use maximum possible set size because Arm values can exceed 1k.
+ BitSet4096<> cluster_set;
+ BitSet4096<> core_set;
+
+ std::map<size_t, size_t> smt_per_core;
+
+ size_t num_clusters;
+ size_t num_cores;
+};
+
+// Returns false, or fills per_package and initializes LP::cluster/core/smt.
+bool DetectClusterAndCore(std::vector<Topology::LP>& lps,
+ std::vector<PerPackage>& per_package) {
+ for (size_t lp = 0; lp < lps.size(); ++lp) {
+ PerPackage& pp = per_package[lps[lp].package];
+ size_t cluster, core;
+ if (!ReadSysfs(kCluster, lp, kMaxLogicalProcessors, &cluster)) return false;
+ if (!ReadSysfs(kCore, lp, kMaxLogicalProcessors, &core)) return false;
+
+    // SMT ID is how many LPs we have already seen assigned to the same core.
+ const size_t smt = pp.smt_per_core[core]++;
+ HWY_ASSERT(smt < 16);
+ lps[lp].smt = static_cast<uint8_t>(smt); // already contiguous
+
+ // Certainly core, and likely also cluster, are HW-dependent opaque values.
+ // We assume they do not exceed kMaxLogicalProcessors and thus fit in a set.
+ pp.cluster_set.Set(cluster);
+ pp.core_set.Set(core);
+ // Temporary storage for remapping as in DetectPackages.
+ lps[lp].cluster = static_cast<uint16_t>(cluster);
+ lps[lp].core = static_cast<uint16_t>(core);
+ }
+
+ for (size_t p = 0; p < per_package.size(); ++p) {
+ PerPackage& pp = per_package[p];
+ std::map<size_t, size_t> cluster_ranks = RanksFromSet(pp.cluster_set);
+ std::map<size_t, size_t> core_ranks = RanksFromSet(pp.core_set);
+    // Remap *this package's* per-lp cluster/core to their ranks.
+ for (size_t lp = 0; lp < lps.size(); ++lp) {
+ if (lps[lp].package != p) continue;
+ lps[lp].cluster = static_cast<uint16_t>(cluster_ranks[lps[lp].cluster]);
+ lps[lp].core = static_cast<uint16_t>(core_ranks[lps[lp].core]);
+ }
+ pp.num_clusters = cluster_ranks.size();
+ pp.num_cores = core_ranks.size();
+ }
+
+ return true;
+}
+
+#endif // HWY_OS_LINUX
+
+HWY_DLLEXPORT Topology::Topology() {
+#if HWY_OS_LINUX
+ lps.resize(TotalLogicalProcessors());
+
+ const size_t num_packages = DetectPackages(lps);
+ if (num_packages == 0) return;
+ std::vector<PerPackage> per_package(num_packages);
+ if (!DetectClusterAndCore(lps, per_package)) return;
+
+ // Allocate per-package/cluster/core vectors. This indicates to callers that
+ // detection succeeded.
+ packages.resize(num_packages);
+ for (size_t p = 0; p < num_packages; ++p) {
+ packages[p].clusters.resize(per_package[p].num_clusters);
+ packages[p].cores.resize(per_package[p].num_cores);
+ }
+
+ // Populate the per-cluster/core sets of LP.
+ for (size_t lp = 0; lp < lps.size(); ++lp) {
+ packages[lps[lp].package].clusters[lps[lp].cluster].lps.Set(lp);
+ packages[lps[lp].package].cores[lps[lp].core].lps.Set(lp);
+ }
+#endif
+}
+
} // namespace hwy
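
The package/cluster/core detection above hinges on remapping arbitrary sysfs
IDs to contiguous ranks; an illustrative stand-alone sketch of that idea
(plain std::set instead of BitSet4096; helper name hypothetical):

#include <cstddef>
#include <map>
#include <set>

// Same contract as RanksFromSet above: each value maps to the number of
// values preceding it, so e.g. opaque package IDs {0, 8, 512} become the
// contiguous indices {0, 1, 2} used to size and index the packages vector.
std::map<std::size_t, std::size_t> RanksFromSorted(
    const std::set<std::size_t>& values) {
  std::map<std::size_t, std::size_t> ranks;
  std::size_t rank = 0;
  for (const std::size_t v : values) ranks[v] = rank++;
  return ranks;
}
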
diff --git a/hwy/contrib/thread_pool/topology.h b/hwy/contrib/thread_pool/topology.h
index 086cd2b..0a001c5 100644
--- a/hwy/contrib/thread_pool/topology.h
+++ b/hwy/contrib/thread_pool/topology.h
@@ -16,27 +16,16 @@
#ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_
#define HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_
-// OS-specific functions for thread affinity.
+// OS-specific functions for processor topology and thread affinity.
#include <stddef.h>
+#include <vector>
+
#include "hwy/base.h"
namespace hwy {
-// Returns false if std::thread should not be used.
-HWY_DLLEXPORT bool HaveThreadingSupport();
-
-// Upper bound on logical processors, including hyperthreads, to enable
-// fixed-size arrays. This limit matches glibc.
-static constexpr size_t kMaxLogicalProcessors = 1024;
-
-// Returns 1 if unknown, otherwise the total number of logical processors
-// provided by the hardware. We assert this is at most `kMaxLogicalProcessors`.
-// These processors are not necessarily all usable; you can determine which are
-// via GetThreadAffinity().
-HWY_DLLEXPORT size_t TotalLogicalProcessors();
-
// 64-bit specialization of std::bitset, which lacks Foreach.
class BitSet64 {
public:
@@ -84,17 +73,18 @@
uint64_t bits_ = 0;
};
-// Two-level bitset for Get/SetThreadAffinity.
-class LogicalProcessorSet {
+// Two-level bitset for up to kMaxSize <= 4096 values.
+template <size_t kMaxSize = 4096>
+class BitSet4096 {
public:
- // No harm if `lp` is already set.
- void Set(size_t lp) {
- HWY_DASSERT(lp < TotalLogicalProcessors());
- const size_t idx = lp / 64;
- const size_t mod = lp % 64;
+ // No harm if `i` is already set.
+ void Set(size_t i) {
+ HWY_DASSERT(i < kMaxSize);
+ const size_t idx = i / 64;
+ const size_t mod = i % 64;
bits_[idx].Set(mod);
nonzero_.Set(idx);
- HWY_DASSERT(Get(lp));
+ HWY_DASSERT(Get(i));
}
// Equivalent to Set(i) for i in [0, 64) where (bits >> i) & 1. This does
@@ -104,28 +94,28 @@
if (bits) nonzero_.Set(0);
}
- void Clear(size_t lp) {
- HWY_DASSERT(lp < TotalLogicalProcessors());
- const size_t idx = lp / 64;
- const size_t mod = lp % 64;
+ void Clear(size_t i) {
+ HWY_DASSERT(i < kMaxSize);
+ const size_t idx = i / 64;
+ const size_t mod = i % 64;
bits_[idx].Clear(mod);
if (!bits_[idx].Any()) {
nonzero_.Clear(idx);
}
- HWY_DASSERT(!Get(lp));
+ HWY_DASSERT(!Get(i));
}
- bool Get(size_t lp) const {
- HWY_DASSERT(lp < TotalLogicalProcessors());
- const size_t idx = lp / 64;
- const size_t mod = lp % 64;
+ bool Get(size_t i) const {
+ HWY_DASSERT(i < kMaxSize);
+ const size_t idx = i / 64;
+ const size_t mod = i % 64;
return bits_[idx].Get(mod);
}
- // Returns uint64_t(Get(lp)) << lp for lp in [0, 64).
+ // Returns uint64_t(Get(i)) << i for i in [0, 64).
uint64_t Get64() const { return bits_[0].Get64(); }
- // Calls func(lp) for each lp in the set.
+ // Calls func(i) for each i in the set.
template <class Func>
void Foreach(const Func& func) const {
nonzero_.Foreach([&func, this](size_t idx) {
@@ -141,11 +131,20 @@
}
private:
- static_assert(kMaxLogicalProcessors <= 64 * 64, "One BitSet64 insufficient");
+ static_assert(kMaxSize <= 64 * 64, "One BitSet64 insufficient");
BitSet64 nonzero_;
- BitSet64 bits_[kMaxLogicalProcessors / 64];
+ BitSet64 bits_[kMaxSize / 64];
};
+// Returns false if std::thread should not be used.
+HWY_DLLEXPORT bool HaveThreadingSupport();
+
+// Upper bound on logical processors, including hyperthreads.
+static constexpr size_t kMaxLogicalProcessors = 1024; // matches glibc
+
+// Set used by Get/SetThreadAffinity.
+using LogicalProcessorSet = BitSet4096<kMaxLogicalProcessors>;
+
// Returns false, or sets `lps` to all logical processors which are online and
// available to the current thread.
HWY_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps);
@@ -164,6 +163,47 @@
return SetThreadAffinity(lps);
}
+// Returns 1 if unknown, otherwise the total number of logical processors
+// provided by the hardware, clamped to `kMaxLogicalProcessors`.
+// These processors are not necessarily all usable; you can determine which are
+// via GetThreadAffinity().
+HWY_DLLEXPORT size_t TotalLogicalProcessors();
+
+struct Topology {
+ // Caller must check packages.empty(); if so, do not use any fields.
+ HWY_DLLEXPORT Topology();
+
+ // Clique of cores with lower latency to each other. On Apple M1 these are
+ // four cores sharing an L2. On AMD these 'CCX' are up to eight cores sharing
+ // an L3 and a memory controller.
+ struct Cluster {
+ LogicalProcessorSet lps;
+ };
+
+ struct Core {
+ LogicalProcessorSet lps;
+ };
+
+ struct Package {
+ std::vector<Cluster> clusters;
+ std::vector<Core> cores;
+ };
+
+ std::vector<Package> packages;
+
+ // Several hundred instances, so prefer a compact representation.
+#pragma pack(push, 1)
+ struct LP {
+ uint16_t package = 0;
+ uint16_t cluster = 0; // local to the package, not globally unique
+ uint16_t core = 0; // local to the package, not globally unique
+ uint8_t smt = 0; // local to the package and core
+ uint8_t reserved = 0;
+ };
+#pragma pack(pop)
+ std::vector<LP> lps; // size() == TotalLogicalProcessors().
+};
+
} // namespace hwy
#endif // HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_
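
A short caller-side sketch of the new Topology API; per the constructor
comment above, an empty `packages` vector signals that detection failed and
no other fields should be used:

#include <stdio.h>

#include "hwy/contrib/thread_pool/topology.h"

int main() {
  hwy::Topology topology;
  if (topology.packages.empty()) {
    fprintf(stderr, "Topology detection unavailable on this platform.\n");
    return 0;
  }
  for (size_t p = 0; p < topology.packages.size(); ++p) {
    const hwy::Topology::Package& pkg = topology.packages[p];
    fprintf(stderr, "package %zu: %zu clusters, %zu cores\n", p,
            pkg.clusters.size(), pkg.cores.size());
  }
  return 0;
}
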
diff --git a/hwy/contrib/thread_pool/topology_test.cc b/hwy/contrib/thread_pool/topology_test.cc
index 450e25d..c844791 100644
--- a/hwy/contrib/thread_pool/topology_test.cc
+++ b/hwy/contrib/thread_pool/topology_test.cc
@@ -25,6 +25,7 @@
#include <utility> // std::make_pair
#include <vector>
+#include "hwy/base.h"
#include "hwy/tests/hwy_gtest.h"
#include "hwy/tests/test_util-inl.h"
#include "hwy/tests/test_util.h"
@@ -188,13 +189,39 @@
TEST(TopologyTest, TestNum) {
const size_t total = TotalLogicalProcessors();
fprintf(stderr, "TotalLogical %zu\n", total);
+
LogicalProcessorSet lps;
if (GetThreadAffinity(lps)) {
- HWY_ASSERT(lps.Count() <= total);
fprintf(stderr, "Active %zu\n", lps.Count());
+ HWY_ASSERT(lps.Count() <= total);
}
}
+TEST(TopologyTest, TestTopology) {
+ Topology topology;
+ if (topology.packages.empty()) return;
+
+ HWY_ASSERT(!topology.lps.empty());
+
+ size_t lps_by_cluster = 0;
+ size_t lps_by_core = 0;
+ for (size_t p = 0; p < topology.packages.size(); ++p) {
+ const Topology::Package& pkg = topology.packages[p];
+ HWY_ASSERT(!pkg.clusters.empty());
+ HWY_ASSERT(!pkg.cores.empty());
+ HWY_ASSERT(pkg.clusters.size() <= pkg.cores.size());
+
+ for (const Topology::Cluster& c : pkg.clusters) {
+ lps_by_cluster += c.lps.Count();
+ }
+ for (const Topology::Core& c : pkg.cores) {
+ lps_by_core += c.lps.Count();
+ }
+ }
+ HWY_ASSERT(lps_by_cluster == topology.lps.size());
+ HWY_ASSERT(lps_by_core == topology.lps.size());
+}
+
} // namespace
} // namespace hwy
diff --git a/hwy/detect_targets.h b/hwy/detect_targets.h
index 3338ace..ab74a5f 100644
--- a/hwy/detect_targets.h
+++ b/hwy/detect_targets.h
@@ -625,6 +625,10 @@
#define HWY_ATTAINABLE_S390X 0
#endif
+#if HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH
+#define HWY_ATTAINABLE_RISCV (HWY_RVV)
+#endif
+
// Attainable means enabled and the compiler allows intrinsics (even when not
// allowed to autovectorize). Used in 3 and 4.
#if HWY_ARCH_X86
@@ -648,6 +652,9 @@
#elif HWY_ARCH_S390X
#define HWY_ATTAINABLE_TARGETS \
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_S390X)
+#elif HWY_ARCH_RISCV
+#define HWY_ATTAINABLE_TARGETS \
+ HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_RISCV)
#else
#define HWY_ATTAINABLE_TARGETS (HWY_ENABLED_BASELINE)
#endif // HWY_ARCH_*
diff --git a/hwy/ops/arm_neon-inl.h b/hwy/ops/arm_neon-inl.h
index 232a113..927a2ef 100644
--- a/hwy/ops/arm_neon-inl.h
+++ b/hwy/ops/arm_neon-inl.h
@@ -6908,6 +6908,18 @@
#if HWY_NEON_HAVE_F32_TO_BF16C
+#ifdef HWY_NATIVE_MUL_EVEN_BF16
+#undef HWY_NATIVE_MUL_EVEN_BF16
+#else
+#define HWY_NATIVE_MUL_EVEN_BF16
+#endif
+
+#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#else
+#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#endif
+
namespace detail {
#if HWY_NEON_HAVE_BFLOAT16
// If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
@@ -6937,6 +6949,20 @@
} // namespace detail
template <class D, HWY_IF_V_SIZE_D(D, 16)>
+HWY_API Vec128<float> MulEvenAdd(D /*d32*/, Vec128<bfloat16_t> a,
+ Vec128<bfloat16_t> b, const Vec128<float> c) {
+ return Vec128<float>(vbfmlalbq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
+ detail::BitCastToRawNeonBF16(b.raw)));
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 16)>
+HWY_API Vec128<float> MulOddAdd(D /*d32*/, Vec128<bfloat16_t> a,
+ Vec128<bfloat16_t> b, const Vec128<float> c) {
+ return Vec128<float>(vbfmlaltq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
+ detail::BitCastToRawNeonBF16(b.raw)));
+}
+
+template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API Vec128<float> ReorderWidenMulAccumulate(D /*d32*/, Vec128<bfloat16_t> a,
Vec128<bfloat16_t> b,
const Vec128<float> sum0,
@@ -6947,6 +6973,22 @@
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> MulEvenAdd(
+ D /*d32*/, VFromD<Repartition<bfloat16_t, D>> a,
+ VFromD<Repartition<bfloat16_t, D>> b, const VFromD<D> c) {
+ return VFromD<D>(vbfmlalb_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
+ detail::BitCastToRawNeonBF16(b.raw)));
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
+HWY_API VFromD<D> MulOddAdd(
+ D /*d32*/, VFromD<Repartition<bfloat16_t, D>> a,
+ VFromD<Repartition<bfloat16_t, D>> b, const VFromD<D> c) {
+ return VFromD<D>(vbfmlalt_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
+ detail::BitCastToRawNeonBF16(b.raw)));
+}
+
+template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ReorderWidenMulAccumulate(
D /*d32*/, VFromD<Repartition<bfloat16_t, D>> a,
VFromD<Repartition<bfloat16_t, D>> b, const VFromD<D> sum0,
@@ -6955,19 +6997,6 @@
detail::BitCastToRawNeonBF16(b.raw)));
}
-#else
-
-template <class DF, HWY_IF_F32_D(DF),
- class VBF = VFromD<Repartition<bfloat16_t, DF>>>
-HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
- const VFromD<DF> sum0,
- VFromD<DF>& sum1) {
- // Lane order within sum0/1 is undefined, hence we can avoid the
- // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
- sum1 = MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), sum1);
- return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), sum0);
-}
-
#endif // HWY_NEON_HAVE_F32_TO_BF16C
template <class D, HWY_IF_I32_D(D)>
diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h
index 36be563..7f4b79a 100644
--- a/hwy/ops/arm_sve-inl.h
+++ b/hwy/ops/arm_sve-inl.h
@@ -5800,23 +5800,32 @@
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-template <size_t N, int kPow2>
-HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df, VBF16 a,
- VBF16 b, const svfloat32_t sum0,
- svfloat32_t& sum1) {
#if HWY_SVE_HAVE_BF16_FEATURE
- (void)df;
- sum1 = svbfmlalt_f32(sum1, a, b);
- return svbfmlalb_f32(sum0, a, b);
+
+// NOTE: we currently do not use SVE BFDOT for bf16 ReorderWidenMulAccumulate
+// because, apparently unlike NEON, it uses round to odd unless the additional
+// FEAT_EBF16 feature is available and enabled.
+#ifdef HWY_NATIVE_MUL_EVEN_BF16
+#undef HWY_NATIVE_MUL_EVEN_BF16
#else
- // Lane order within sum0/1 is undefined, hence we can avoid the
- // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
- sum1 = MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), sum1);
- return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), sum0);
-#endif // HWY_SVE_HAVE_BF16_FEATURE
+#define HWY_NATIVE_MUL_EVEN_BF16
+#endif
+
+template <size_t N, int kPow2>
+HWY_API svfloat32_t MulEvenAdd(Simd<float, N, kPow2> df, VBF16 a, VBF16 b,
+ const svfloat32_t c) {
+ return svbfmlalb_f32(c, a, b);
}
template <size_t N, int kPow2>
+HWY_API svfloat32_t MulOddAdd(Simd<float, N, kPow2> df, VBF16 a, VBF16 b,
+ const svfloat32_t c) {
+ return svbfmlalt_f32(c, a, b);
+}
+
+#endif // HWY_SVE_HAVE_BF16_FEATURE
+
+template <size_t N, int kPow2>
HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
svint16_t a, svint16_t b,
const svint32_t sum0,
diff --git a/hwy/ops/emu128-inl.h b/hwy/ops/emu128-inl.h
index 79216b4..c909e70 100644
--- a/hwy/ops/emu128-inl.h
+++ b/hwy/ops/emu128-inl.h
@@ -2811,16 +2811,6 @@
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-template <class DF, HWY_IF_F32_D(DF), class VBF>
-HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
- const VFromD<DF> sum0,
- VFromD<DF>& sum1) {
- // Lane order within sum0/1 is undefined, hence we can avoid the
- // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
- sum1 = MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), sum1);
- return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), sum0);
-}
-
template <class D, HWY_IF_UI32_D(D), class V16>
HWY_API VFromD<D> ReorderWidenMulAccumulate(D d32, V16 a, V16 b,
const VFromD<D> sum0,
diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h
index 890d6bb..3e57e3f 100644
--- a/hwy/ops/generic_ops-inl.h
+++ b/hwy/ops/generic_ops-inl.h
@@ -1,5 +1,6 @@
// Copyright 2021 Google LLC
-// Copyright 2023,2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// Copyright 2023,2024 Arm Limited and/or
+// its affiliates <open-source-office@arm.com>
// SPDX-License-Identifier: Apache-2.0
// SPDX-License-Identifier: BSD-3-Clause
//
@@ -2802,9 +2803,23 @@
static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
VFromD<D> v = Zero(d);
- for (size_t i = 0; i < MaxLanes(d); ++i) {
- if (i < max_lanes_to_load)
- v = InsertLane(v, i, base[ExtractLane(index, i)]);
+ for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) {
+ v = InsertLane(v, i, base[ExtractLane(index, i)]);
+ }
+ return v;
+}
+
+template <class D, typename T = TFromD<D>>
+HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
+ VFromD<RebindToSigned<D>> index,
+ const size_t max_lanes_to_load) {
+ const RebindToSigned<D> di;
+ using TI = TFromD<decltype(di)>;
+ static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");
+
+ VFromD<D> v = no;
+ for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) {
+ v = InsertLane(v, i, base[ExtractLane(index, i)]);
}
return v;
}
@@ -2815,6 +2830,12 @@
const size_t max_lanes_to_load) {
return MaskedGatherIndex(FirstN(d, max_lanes_to_load), d, base, index);
}
+template <class D, typename T = TFromD<D>>
+HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
+ VFromD<RebindToSigned<D>> index,
+ const size_t max_lanes_to_load) {
+ return MaskedGatherIndexOr(no, FirstN(d, max_lanes_to_load), d, base, index);
+}
#endif // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
// ------------------------------ Integer AbsDiff and SumsOf8AbsDiff
@@ -4924,6 +4945,56 @@
#endif // HWY_NATIVE_INT_DIV
+// ------------------------------ MulEvenAdd (PromoteEvenTo)
+
+// SVE with bf16 and NEON with bf16 override this.
+#if (defined(HWY_NATIVE_MUL_EVEN_BF16) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_MUL_EVEN_BF16
+#undef HWY_NATIVE_MUL_EVEN_BF16
+#else
+#define HWY_NATIVE_MUL_EVEN_BF16
+#endif
+
+template <class DF, HWY_IF_F32_D(DF),
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+HWY_API VFromD<DF> MulEvenAdd(DF df, VBF a, VBF b, VFromD<DF> c) {
+ return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), c);
+}
+
+template <class DF, HWY_IF_F32_D(DF),
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+HWY_API VFromD<DF> MulOddAdd(DF df, VBF a, VBF b, VFromD<DF> c) {
+ return MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), c);
+}
+
+#endif // HWY_NATIVE_MUL_EVEN_BF16
+
+// ------------------------------ ReorderWidenMulAccumulate (MulEvenAdd)
+
+// AVX3_SPR/ZEN4, and NEON with bf16 but not(!) SVE override this.
+#if (defined(HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16) == \
+ defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#else
+#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#endif
+
+template <class DF, HWY_IF_F32_D(DF),
+ class VBF = VFromD<Repartition<bfloat16_t, DF>>>
+HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
+ VFromD<DF> sum0,
+ VFromD<DF>& sum1) {
+ // Lane order within sum0/1 is undefined, hence we can avoid the
+ // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
+ sum1 = MulOddAdd(df, a, b, sum1);
+ return MulEvenAdd(df, a, b, sum0);
+}
+
+#endif // HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+
+// ------------------------------ WidenMulAccumulate
+
#if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
#undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
@@ -4939,7 +5010,7 @@
return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low);
}
-#endif
+#endif // HWY_NATIVE_WIDEN_MUL_ACCUMULATE
#if 0
#if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16) == defined(HWY_TARGET_TOGGLE))
@@ -4959,10 +5030,10 @@
return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low);
}
-#endif
+#endif // HWY_HAVE_FLOAT16
-#endif
-#endif
+#endif // HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
+#endif // #if 0
// ------------------------------ SatWidenMulPairwiseAdd
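
A minimal sketch of the new GatherIndexNOr op defined above: lanes at or
beyond max_lanes_to_load keep the value from `no` rather than zero. The
function name and constants are illustrative, and the usual per-target
boilerplate is omitted:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR void GatherPrefix(const float* HWY_RESTRICT base,
                           float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  const hn::RebindToSigned<decltype(d)> di;
  const auto index = hn::Iota(di, 0);  // gather base[0], base[1], ...
  const auto no = hn::Set(d, -1.0f);   // filler for the remaining lanes
  // Only the first 3 lanes are loaded; the rest stay at -1.
  const auto v =
      hn::GatherIndexNOr(no, d, base, index, /*max_lanes_to_load=*/3);
  hn::StoreU(v, d, out);
}
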
diff --git a/hwy/ops/ppc_vsx-inl.h b/hwy/ops/ppc_vsx-inl.h
index 9360ee8..03a20ce 100644
--- a/hwy/ops/ppc_vsx-inl.h
+++ b/hwy/ops/ppc_vsx-inl.h
@@ -3458,17 +3458,6 @@
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-template <class DF, HWY_IF_F32_D(DF),
- class VBF = VFromD<Repartition<bfloat16_t, DF>>>
-HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
- VFromD<DF> sum0,
- VFromD<DF>& sum1) {
- // Lane order within sum0/1 is undefined, hence we can avoid the
- // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
- sum1 = MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), sum1);
- return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), sum0);
-}
-
// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe.
template <class D32, HWY_IF_UI32_D(D32),
class V16 = VFromD<RepartitionToNarrow<D32>>>
diff --git a/hwy/ops/rvv-inl.h b/hwy/ops/rvv-inl.h
index 1700e67..63a6ec2 100644
--- a/hwy/ops/rvv-inl.h
+++ b/hwy/ops/rvv-inl.h
@@ -5573,19 +5573,6 @@
namespace detail {
-// Non-overloaded wrapper function so we can define DF in template args.
-template <size_t N, int kPow2, class DF = Simd<float, N, kPow2>,
- class VF = VFromD<DF>,
- class DBF16 = Repartition<hwy::bfloat16_t, Simd<float, N, kPow2>>>
-HWY_API VF ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df,
- VFromD<DBF16> a, VFromD<DBF16> b,
- const VF sum0, VF& sum1) {
- // Lane order within sum0/1 is undefined, hence we can avoid the
- // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
- sum1 = MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), sum1);
- return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), sum0);
-}
-
#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
SHIFT, MLEN, NAME, OP) \
template <size_t N> \
@@ -5662,12 +5649,6 @@
} // namespace detail
template <size_t N, int kPow2, class VN, class VW>
-HWY_API VW ReorderWidenMulAccumulate(Simd<float, N, kPow2> d32, VN a, VN b,
- const VW sum0, VW& sum1) {
- return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1);
-}
-
-template <size_t N, int kPow2, class VN, class VW>
HWY_API VW ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32, VN a, VN b,
const VW sum0, VW& sum1) {
return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1);
diff --git a/hwy/ops/scalar-inl.h b/hwy/ops/scalar-inl.h
index b156458..4e2c2ed 100644
--- a/hwy/ops/scalar-inl.h
+++ b/hwy/ops/scalar-inl.h
@@ -2067,6 +2067,12 @@
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
+#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#else
+#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#endif
+
template <class D32, HWY_IF_F32_D(D32)>
HWY_API Vec1<float> ReorderWidenMulAccumulate(D32 /* tag */, Vec1<bfloat16_t> a,
Vec1<bfloat16_t> b,
diff --git a/hwy/ops/wasm_128-inl.h b/hwy/ops/wasm_128-inl.h
index 5df6f7a..0185e3d 100644
--- a/hwy/ops/wasm_128-inl.h
+++ b/hwy/ops/wasm_128-inl.h
@@ -5834,17 +5834,6 @@
// ------------------------------ ReorderWidenMulAccumulate
-template <class DF, HWY_IF_F32_D(DF),
- class VBF = VFromD<Repartition<bfloat16_t, DF>>>
-HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
- const VFromD<DF> sum0,
- VFromD<DF>& sum1) {
- // Lane order within sum0/1 is undefined, hence we can avoid the
- // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
- sum1 = MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), sum1);
- return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), sum0);
-}
-
template <class D32, HWY_IF_UI32_D(D32), HWY_IF_V_SIZE_LE_D(D32, 16),
class V16 = VFromD<RepartitionToNarrow<D32>>>
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 d32, V16 a, V16 b,
diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h
index 9140749..aa6a43e 100644
--- a/hwy/ops/x86_128-inl.h
+++ b/hwy/ops/x86_128-inl.h
@@ -5990,20 +5990,14 @@
} // namespace detail
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
-HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
+HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> offsets) {
- const RebindToSigned<decltype(d)> di;
- (void)di; // for HWY_DASSERT
- HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
return detail::NativeGather128<1>(base, offsets);
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), typename T = TFromD<D>>
-HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
+HWY_API VFromD<D> GatherIndex(D /*d*/, const T* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> indices) {
- const RebindToSigned<decltype(d)> di;
- (void)di; // for HWY_DASSERT
- HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
return detail::NativeGather128<sizeof(T)>(base, indices);
}
@@ -6014,9 +6008,6 @@
// For partial vectors, ensure upper mask lanes are zero to prevent faults.
if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d)));
- const RebindToSigned<decltype(d)> di;
- (void)di; // for HWY_DASSERT
- HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
return detail::NativeMaskedGatherOr128<sizeof(T)>(no, m, base, indices);
}
@@ -9537,25 +9528,20 @@
// ------------------------------ ReorderWidenMulAccumulate (PromoteEvenTo)
#if HWY_NATIVE_DOT_BF16
+
+#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#else
+#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
+#endif
+
template <class DF, HWY_IF_F32_D(DF), HWY_IF_V_SIZE_LE_D(DF, 16),
class VBF = VFromD<Repartition<bfloat16_t, DF>>>
HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF /*df*/, VBF a, VBF b,
const VFromD<DF> sum0,
VFromD<DF>& /*sum1*/) {
- return VFromD<DF>{_mm_dpbf16_ps(sum0.raw, reinterpret_cast<__m128bh>(a.raw), reinterpret_cast<__m128bh>(b.raw))};
-}
-#else
-
-// Generic for all vector lengths.
-template <class DF, HWY_IF_F32_D(DF),
- class VBF = VFromD<Repartition<bfloat16_t, DF>>>
-HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
- const VFromD<DF> sum0,
- VFromD<DF>& sum1) {
- // Lane order within sum0/1 is undefined, hence we can avoid the
- // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
- sum1 = MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), sum1);
- return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), sum0);
+ return VFromD<DF>{_mm_dpbf16_ps(sum0.raw, reinterpret_cast<__m128bh>(a.raw),
+ reinterpret_cast<__m128bh>(b.raw))};
}
#endif // HWY_NATIVE_DOT_BF16
diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h
index c99d7c9..e890408 100644
--- a/hwy/ops/x86_256-inl.h
+++ b/hwy/ops/x86_256-inl.h
@@ -3822,20 +3822,14 @@
} // namespace detail
template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
+HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> offsets) {
- const RebindToSigned<decltype(d)> di;
- (void)di; // for HWY_DASSERT
- HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
return detail::NativeGather256<1>(base, offsets);
}
template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
+HWY_API VFromD<D> GatherIndex(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> indices) {
- const RebindToSigned<decltype(d)> di;
- (void)di; // for HWY_DASSERT
- HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
return detail::NativeGather256<sizeof(TFromD<D>)>(base, indices);
}
@@ -3908,12 +3902,9 @@
} // namespace detail
template <class D, HWY_IF_V_SIZE_D(D, 32)>
-HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
+HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D /*d*/,
const TFromD<D>* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> indices) {
- const RebindToSigned<decltype(d)> di;
- (void)di; // for HWY_DASSERT
- HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
return detail::NativeMaskedGatherOr256<sizeof(TFromD<D>)>(no, m, base,
indices);
}
diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h
index b212ce9..7aede9d 100644
--- a/hwy/ops/x86_512-inl.h
+++ b/hwy/ops/x86_512-inl.h
@@ -3484,30 +3484,21 @@
} // namespace detail
template <class D, HWY_IF_V_SIZE_D(D, 64)>
-HWY_API VFromD<D> GatherOffset(D d, const TFromD<D>* HWY_RESTRICT base,
+HWY_API VFromD<D> GatherOffset(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> offsets) {
- const RebindToSigned<decltype(d)> di;
- (void)di; // for HWY_DASSERT
- HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di))));
return detail::NativeGather512<1>(base, offsets);
}
template <class D, HWY_IF_V_SIZE_D(D, 64)>
-HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
+HWY_API VFromD<D> GatherIndex(D /*d*/, const TFromD<D>* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> indices) {
- const RebindToSigned<decltype(d)> di;
- (void)di; // for HWY_DASSERT
- HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
return detail::NativeGather512<sizeof(TFromD<D>)>(base, indices);
}
template <class D, HWY_IF_V_SIZE_D(D, 64)>
-HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
+HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D /*d*/,
const TFromD<D>* HWY_RESTRICT base,
VFromD<RebindToSigned<D>> indices) {
- const RebindToSigned<decltype(d)> di;
- (void)di; // for HWY_DASSERT
- HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di))));
return detail::NativeMaskedGatherOr512<sizeof(TFromD<D>)>(no, m, base,
indices);
}
diff --git a/hwy/tests/widen_mul_test.cc b/hwy/tests/widen_mul_test.cc
index 1ebeafb..d6e04c3 100644
--- a/hwy/tests/widen_mul_test.cc
+++ b/hwy/tests/widen_mul_test.cc
@@ -20,6 +20,7 @@
#define HWY_TARGET_INCLUDE "tests/widen_mul_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
+#include "hwy/nanobenchmark.h" // Unpredictable1
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
@@ -49,6 +50,7 @@
// delta[p] := p all others zero.
auto delta_w = AllocateAligned<TW>(NN);
+ HWY_ASSERT(delta_w);
for (size_t p = 0; p < NN; ++p) {
// Workaround for incorrect Clang wasm codegen: re-initialize the entire
// array rather than zero-initialize once and then set lane p to p.
@@ -418,16 +420,86 @@
}
#ifndef HWY_NATIVE_DOT_BF16
-#error "Update set_macros-inl to set this required macro"
+#error "Update set_macros-inl.h to set this required macro"
#endif
+struct TestMulEvenAdd {
+ // Must be inlined on aarch64 for bf16, else clang crashes.
+ template <typename TN, class DN>
+ HWY_INLINE void operator()(TN /*unused*/, DN dn) {
+ using TW = MakeWide<TN>;
+ const RepartitionToWide<DN> dw;
+ using VW = Vec<decltype(dw)>;
+ using VN = Vec<decltype(dn)>;
+ const size_t NN = Lanes(dn);
+
+ const VW f0 = Zero(dw);
+ const VW f1 = Set(dw, TW{1});
+ const VN bf0 = Zero(dn);
+ // Cannot Set() bfloat16_t directly.
+ const VN bf1 = ReorderDemote2To(dn, f1, f1);
+
+ // Any input zero => both outputs zero
+ HWY_ASSERT_VEC_EQ(dw, f0, MulEvenAdd(dw, bf0, bf0, f0));
+ HWY_ASSERT_VEC_EQ(dw, f0, MulEvenAdd(dw, bf0, bf1, f0));
+ HWY_ASSERT_VEC_EQ(dw, f0, MulEvenAdd(dw, bf1, bf0, f0));
+ HWY_ASSERT_VEC_EQ(dw, f0, MulOddAdd(dw, bf0, bf0, f0));
+ HWY_ASSERT_VEC_EQ(dw, f0, MulOddAdd(dw, bf0, bf1, f0));
+ HWY_ASSERT_VEC_EQ(dw, f0, MulOddAdd(dw, bf1, bf0, f0));
+
+ // delta[p] := 1, all others zero. For each p: Mul(delta, 1, 0) == 1.
+ auto delta_w = AllocateAligned<TW>(NN);
+ HWY_ASSERT(delta_w);
+ for (size_t p = 0; p < NN; ++p) {
+ // Workaround for incorrect Clang wasm codegen: re-initialize the entire
+ // array rather than zero-initialize once and then toggle lane p.
+ for (size_t i = 0; i < NN; ++i) {
+ delta_w[i] = static_cast<TW>(i == p ? Unpredictable1() : 0);
+ }
+ const VW delta0 = Load(dw, delta_w.get());
+ const VW delta1 = Load(dw, delta_w.get() + NN / 2);
+ const VN delta = OrderedDemote2To(dn, delta0, delta1);
+
+ {
+ const VW sum_e = MulEvenAdd(dw, delta, bf1, f0);
+ const VW sum_o = MulOddAdd(dw, delta, bf1, f0);
+ HWY_ASSERT_VEC_EQ(dw, p & 1 ? f0 : f1, SumOfLanes(dw, sum_e));
+ HWY_ASSERT_VEC_EQ(dw, p & 1 ? f1 : f0, SumOfLanes(dw, sum_o));
+ }
+ // Swapped arg order
+ {
+ const VW sum_e = MulEvenAdd(dw, bf1, delta, f0);
+ const VW sum_o = MulOddAdd(dw, bf1, delta, f0);
+ HWY_ASSERT_VEC_EQ(dw, p & 1 ? f0 : f1, SumOfLanes(dw, sum_e));
+ HWY_ASSERT_VEC_EQ(dw, p & 1 ? f1 : f0, SumOfLanes(dw, sum_o));
+ }
+ // Start with nonzero sum
+ {
+ const VW sum_e = MulEvenAdd(dw, delta, bf1, Add(delta0, delta1));
+ const VW sum_o = MulOddAdd(dw, delta, bf1, Add(delta0, delta1));
+ HWY_ASSERT_EQ(TW{3}, ReduceSum(dw, Add(sum_e, sum_o)));
+ }
+
+ // Start with nonzero sum and swap arg order
+ {
+ const VW sum_e = MulEvenAdd(dw, bf1, delta, Add(delta0, delta1));
+ const VW sum_o = MulOddAdd(dw, bf1, delta, Add(delta0, delta1));
+ HWY_ASSERT_EQ(TW{3}, ReduceSum(dw, Add(sum_e, sum_o)));
+ }
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllMulEvenAdd() {
+ ForShrinkableVectors<TestMulEvenAdd>()(bfloat16_t());
+}
+
struct TestReorderWidenMulAccumulate {
// Must be inlined on aarch64 for bf16, else clang crashes.
template <typename TN, class DN>
HWY_INLINE void operator()(TN /*unused*/, DN dn) {
using TW = MakeWide<TN>;
const RepartitionToWide<DN> dw;
- const Half<DN> dnh;
using VW = Vec<decltype(dw)>;
using VN = Vec<decltype(dn)>;
const size_t NN = Lanes(dn);
@@ -452,6 +524,7 @@
// delta[p] := 1, all others zero. For each p: Dot(delta, all-ones) == 1.
auto delta_w = AllocateAligned<TW>(NN);
+ HWY_ASSERT(delta_w);
for (size_t p = 0; p < NN; ++p) {
// Workaround for incorrect Clang wasm codegen: re-initialize the entire
// array rather than zero-initialize once and then toggle lane p.
@@ -475,15 +548,15 @@
}
// Start with nonzero sum0 or sum1
{
- VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
- sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
+ VW sum0 = delta0;
+ sum1 = delta1;
sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, sum0, sum1);
HWY_ASSERT_EQ(TW{2}, ReduceSum(dw, Add(sum0, sum1)));
}
// Start with nonzero sum0 or sum1, and swap arg order
{
- VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
- sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
+ VW sum0 = delta0;
+ sum1 = delta1;
sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, sum0, sum1);
HWY_ASSERT_EQ(TW{2}, ReduceSum(dw, Add(sum0, sum1)));
}
@@ -524,6 +597,7 @@
// delta[p] := 1, all others zero.
auto delta_w = AllocateAligned<TW>(NN);
+ HWY_ASSERT(delta_w);
for (size_t p = 0; p < NN; ++p) {
// Workaround for incorrect Clang wasm codegen: re-initialize the entire
// array rather than zero-initialize once and then toggle lane p.
@@ -619,6 +693,7 @@
const size_t NW = Lanes(dw);
const auto expected = AllocateAligned<TW>(NW);
+ HWY_ASSERT(expected);
for (size_t iw = 0; iw < NW; ++iw) {
const size_t in = iw * 2; // even, odd is +1
const size_t a0 = 1 + in;
@@ -757,6 +832,7 @@
HWY_EXPORT_AND_TEST_P(HwyWidenMulTest, TestAllSatWidenMulPairwiseAdd);
HWY_EXPORT_AND_TEST_P(HwyWidenMulTest, TestAllSatWidenMulPairwiseAccumulate);
HWY_EXPORT_AND_TEST_P(HwyWidenMulTest, TestAllSatWidenMulAccumFixedPoint);
+HWY_EXPORT_AND_TEST_P(HwyWidenMulTest, TestAllMulEvenAdd);
HWY_EXPORT_AND_TEST_P(HwyWidenMulTest, TestAllReorderWidenMulAccumulate);
HWY_EXPORT_AND_TEST_P(HwyWidenMulTest, TestAllRearrangeToOddPlusEven);
HWY_EXPORT_AND_TEST_P(HwyWidenMulTest, TestAllSumOfMulQuadAccumulate);