#include "ruy/ctx.h"
#include <functional>
#include "ruy/check_macros.h"
#include "ruy/cpuinfo.h"
#include "ruy/ctx_impl.h"
#include "ruy/have_built_path_for.h"
#include "ruy/path.h"
#include "ruy/platform.h"
#include "ruy/prepacked_cache.h"
namespace ruy {
// Read-only view of this object as its CtxImpl implementation type.
const CtxImpl& Ctx::impl() const {
  const Ctx& self = *this;
  return static_cast<const CtxImpl&>(self);
}
// Mutable view of this object as its CtxImpl implementation type.
CtxImpl* Ctx::mutable_impl() {
  Ctx* self = this;
  return static_cast<CtxImpl*>(self);
}
// Returns the stored `last_used_path_` value (written by SelectPath()).
Path Ctx::last_used_path() const {
  const CtxImpl& self = impl();
  return self.last_used_path_;
}
// Returns the stored `explicit_tuning_` value.
Tuning Ctx::explicit_tuning() const {
  const CtxImpl& self = impl();
  return self.explicit_tuning_;
}
// Stores `value` as the explicit tuning setting.
void Ctx::set_explicit_tuning(Tuning value) {
  CtxImpl* self = mutable_impl();
  self->explicit_tuning_ = value;
}
// Read-only access to the ThreadPool member held by the implementation.
const ThreadPool& Ctx::thread_pool() const {
  const CtxImpl& self = impl();
  return self.thread_pool_;
}
// Mutable access to the ThreadPool member held by the implementation.
ThreadPool* Ctx::mutable_thread_pool() {
  CtxImpl* self = mutable_impl();
  return &self->thread_pool_;
}
int Ctx::max_num_threads() const { return impl().max_num_threads_; }
void Ctx::set_max_num_threads(int value) {
mutable_impl()->max_num_threads_ = value;
// Overrides the set of runtime-enabled paths with `paths`. The bits in
// kNonArchPaths are always OR-ed in, so they cannot be disabled here.
void Ctx::SetRuntimeEnabledPaths(Path paths) {
  const Path enabled = paths | kNonArchPaths;
  mutable_impl()->runtime_enabled_paths_ = enabled;
}
// Mutable access to the CpuInfo member held by the implementation.
CpuInfo* Ctx::mutable_cpuinfo() {
  CtxImpl* self = mutable_impl();
  return &self->cpuinfo_;
}
namespace {

// For each Path bit set in `paths_to_detect`, performs runtime detection and
// sets the corresponding bit in the return value if and only if it is
// supported. Path bits that are not set in the input `paths_to_detect` value
// are left not set in the return value, with the exception of the bits in
// kNonArchPaths, which are always set.
//
// `cpuinfo` is queried for the CPU features that need runtime detection.
Path DetectRuntimeSupportedPaths(Path paths_to_detect, CpuInfo* cpuinfo) {
  // Paths in kNonArchPaths are always implicitly supported.
  // Further logic below may add more bits to `result`.
  Path result = kNonArchPaths;

  // Conditionally sets the `path` bit in `result`, if reported as supported
  // by the `is_supported` predicate. The predicate is only evaluated when
  // `path` was actually requested in `paths_to_detect`.
  auto maybe_add = [&](Path path, std::function<bool(void)> is_supported) {
    if ((paths_to_detect & path) != Path::kNone) {
      if (is_supported()) {
        result = result | path;
      }
    }
  };

  // NEON is unconditionally available on ARM64.
  // On ARM32 it's technically possible for it to be unavailable, but we've
  // always chosen to just crash on such devices. We could reevaluate that,
  // however for non-NEON devices to be actually supported, we would need to
  // address also compiler-generated NEON code. That would mean to remove
  // -mfpu=neon from ruy_copts and only use this flag in select NEON translation
  // units, and implement have_built_path_for_neon, similar to the x86 SIMD
  // paths.
  maybe_add(Path::kNeon, []() { return true; });

  // NEON dotprod requires runtime detection, however unlike the x86 SIMD paths
  // it still does not require have_built_path_for because we unconditionally
  // build it at the moment. That is largely because we have had to machine
  // encode dotprod instructions, so we don't actually rely on toolchain support
  // for them.
  maybe_add(Path::kNeonDotprod, [=]() { return cpuinfo->NeonDotprod(); });

  // x86 SIMD paths currently require both runtime detection, and detection of
  // whether we're building the path at all.
  // NOTE(review): each Path enumerator below is paired by name with its
  // HaveBuiltPathFor*() / CpuInfo query -- confirm against path.h.
  maybe_add(Path::kSse42,
            [=]() { return HaveBuiltPathForSse42() && cpuinfo->Sse42(); });
  maybe_add(Path::kAvx2,
            [=]() { return HaveBuiltPathForAvx2() && cpuinfo->Avx2(); });
  maybe_add(Path::kAvx512,
            [=]() { return HaveBuiltPathForAvx512() && cpuinfo->Avx512(); });
  maybe_add(Path::kAvxVnni,
            [=]() { return HaveBuiltPathForAvxVnni() && cpuinfo->AvxVnni(); });

  // Sanity checks
  // Every bit of kNonArchPaths must be present in the result...
  RUY_DCHECK_EQ(kNonArchPaths & ~result, Path::kNone);
  // ...and the result must not contain any bit that was neither requested nor
  // part of kNonArchPaths.
  RUY_DCHECK_EQ(result & ~(kNonArchPaths | paths_to_detect), Path::kNone);
  return result;
}

}  // namespace
// Returns the set of runtime-enabled paths, running detection over kAllPaths
// on first use and caching the outcome in `runtime_enabled_paths_`.
Path Ctx::GetRuntimeEnabledPaths() {
  // Bound by reference because the cached value is updated in place.
  Path& enabled_paths = mutable_impl()->runtime_enabled_paths_;
  // Path::kNone is the initial state, meaning that detection has not been
  // performed yet.
  const bool detection_pending = (enabled_paths == Path::kNone);
  if (detection_pending) {
    enabled_paths = DetectRuntimeSupportedPaths(kAllPaths, mutable_cpuinfo());
  }
  return enabled_paths;
}
// Picks the most significant path that is both in `compiled_paths` and
// enabled at runtime, records it as the last used path, and returns it.
Path Ctx::SelectPath(Path compiled_paths) {
  const Path candidates = compiled_paths & GetRuntimeEnabledPaths();
  const Path selected = GetMostSignificantPath(candidates);
  mutable_impl()->last_used_path_ = selected;
  return selected;
}
// Grows the list of per-thread resources until it holds at least
// `thread_count` entries. Existing entries are left untouched.
void Ctx::EnsureThreadSpecificResources(int thread_count) {
  auto& per_thread = mutable_impl()->thread_specific_resources_;
  for (int have = static_cast<int>(per_thread.size()); have < thread_count;
       ++have) {
    per_thread.emplace_back(new ThreadSpecificResource);
  }
  RUY_DCHECK_LE(thread_count, static_cast<int>(per_thread.size()));
}
// Returns the TuningResolver belonging to the thread at `thread_index`.
// `thread_index` must be below the current number of per-thread resources
// (debug-checked); see EnsureThreadSpecificResources().
TuningResolver* Ctx::GetThreadSpecificTuningResolver(int thread_index) const {
  const auto& per_thread = impl().thread_specific_resources_;
  const int available = static_cast<int>(per_thread.size());
  RUY_DCHECK_LT(thread_index, available);
  auto& resource = *per_thread[thread_index];
  return &resource.tuning_resolver;
}
// Returns the Allocator belonging to the thread at `thread_index`.
// `thread_index` must be below the current number of per-thread resources
// (debug-checked); see EnsureThreadSpecificResources().
Allocator* Ctx::GetThreadSpecificAllocator(int thread_index) const {
  const auto& per_thread = impl().thread_specific_resources_;
  const int available = static_cast<int>(per_thread.size());
  RUY_DCHECK_LT(thread_index, available);
  auto& resource = *per_thread[thread_index];
  return &resource.allocator;
}
// Returns the main Allocator, creating it on first use.
Allocator* Ctx::GetMainAllocator() {
  auto& slot = mutable_impl()->main_allocator_;
  if (!slot) {
    slot.reset(new Allocator);
  }
  return slot.get();
}
// Returns the PrepackedCache, creating it on first use (or on first use after
// ClearPrepackedCache()).
PrepackedCache* Ctx::GetPrepackedCache() {
  auto& slot = mutable_impl()->prepacked_cache_;
  if (!slot) {
    slot.reset(new PrepackedCache);
  }
  return slot.get();
}
// Resolves the Tuning using the TuningResolver stored at thread index 0.
Tuning Ctx::GetMainThreadTuning() {
  return GetThreadSpecificTuningResolver(0)->Resolve();
}
// Drops the current PrepackedCache, if any. GetPrepackedCache() creates a
// fresh one on its next call.
void Ctx::ClearPrepackedCache() {
  mutable_impl()->prepacked_cache_.reset();
}
} // namespace ruy