Minor cleanup in the TrMulTask main loop:
Shortened a comment to the length it deserves: in this, the most important loop in ruy, conciseness matters. The optimization it describes is not important enough to warrant more, and the shortened comment says enough about it.
Moved local variable definitions to their point of first use. This makes it clear that their values are not carried over across loop iterations (a generic illustration of the pattern is sketched below).
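As a minimal, hypothetical C++ sketch of why this scoping helps (not ruy code, and assuming nothing about the actual TrMulTask internals):

    #include <cstdio>
    #include <vector>

    // 'square' lives only within one iteration, so it visibly cannot carry a
    // value over from a previous iteration, and the compiler enforces that.
    int SumOfSquares(const std::vector<int>& values) {
      int total = 0;
      for (int v : values) {
        const int square = v * v;  // defined right where it is used
        total += square;
      }
      return total;
    }

    int main() {
      std::printf("%d\n", SumOfSquares({1, 2, 3}));  // prints 14
      return 0;
    }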
PiperOrigin-RevId: 314399108
diff --git a/ruy/BUILD b/ruy/BUILD
index 9bd40dc..6db6a4d 100644
--- a/ruy/BUILD
+++ b/ruy/BUILD
@@ -92,6 +92,7 @@
cc_test(
name = "wait_test",
srcs = ["wait_test.cc"],
+ copts = ruy_copts(),
linkopts = ruy_linkopts_thread_standard_library(),
deps = [
":gtest_wrapper",
@@ -110,6 +111,7 @@
cc_test(
name = "size_util_test",
srcs = ["size_util_test.cc"],
+ copts = ruy_copts(),
deps = [
":gtest_wrapper",
":size_util",
@@ -162,6 +164,7 @@
cc_test(
name = "tune_test",
srcs = ["tune_test.cc"],
+ copts = ruy_copts(),
deps = [
":gtest_wrapper",
":tune",
@@ -171,6 +174,7 @@
cc_test(
name = "prepacked_cache_test",
srcs = ["prepacked_cache_test.cc"],
+ copts = ruy_copts(),
deps = [
":context",
":context_get_ctx",
@@ -210,6 +214,7 @@
cc_test(
name = "allocator_test",
srcs = ["allocator_test.cc"],
+ copts = ruy_copts(),
deps = [
":allocator",
":gtest_wrapper",
@@ -234,6 +239,7 @@
copts = ruy_copts(),
deps = [
":check_macros",
+ ":cpu_cache_params",
":opt_set",
":side_pair",
":size_util",
@@ -244,11 +250,13 @@
cc_test(
name = "block_map_test",
srcs = ["block_map_test.cc"],
+ copts = ruy_copts(),
deps = [
":block_map",
- ":cpu_cache_size",
+ ":cpu_cache_params",
":gtest_wrapper",
":path",
+ ":platform",
":side_pair",
],
)
@@ -290,6 +298,12 @@
)
cc_library(
+ name = "cpu_cache_params",
+ hdrs = ["cpu_cache_params.h"],
+ copts = ruy_copts(),
+)
+
+cc_library(
name = "cpuinfo",
srcs = [
"cpuinfo.cc",
@@ -305,8 +319,11 @@
"-Wno-undef",
],
}),
- deps = [":platform"] + select({
- # cpuinfo does not build on ppc.
+ deps = [
+ ":platform",
+ ":check_macros",
+ ":cpu_cache_params",
+ ] + select({
":ppc": [],
":fuchsia": [],
"//conditions:default": ["@cpuinfo"],
@@ -325,16 +342,6 @@
)
cc_library(
- name = "cpu_cache_size",
- hdrs = ["cpu_cache_size.h"],
- copts = ruy_copts(),
- deps = [
- ":path",
- ":platform",
- ],
-)
-
-cc_library(
name = "matrix",
hdrs = ["matrix.h"],
copts = ruy_copts(),
@@ -345,6 +352,7 @@
cc_test(
name = "matrix_test",
srcs = ["matrix_test.cc"],
+ copts = ruy_copts(),
deps = [
":gtest_wrapper",
":matrix",
@@ -357,7 +365,6 @@
copts = ruy_copts(),
visibility = ["//visibility:public"],
deps = [
- ":cpu_cache_size",
":matrix",
],
)
@@ -365,6 +372,7 @@
cc_test(
name = "mul_params_test",
srcs = ["mul_params_test.cc"],
+ copts = ruy_copts(),
deps = [
":gtest_wrapper",
":mul_params",
@@ -772,6 +780,7 @@
cc_test(
name = "context_test",
srcs = ["context_test.cc"],
+ copts = ruy_copts(),
deps = [
":context",
":gtest_wrapper",
@@ -833,6 +842,7 @@
cc_test(
name = "ctx_test",
srcs = ["ctx_test.cc"],
+ copts = ruy_copts(),
deps = [
":ctx",
":gtest_wrapper",
@@ -862,6 +872,8 @@
":block_map",
":check_macros",
":common",
+ ":cpu_cache_params",
+ ":cpuinfo",
":ctx",
":mat",
":matrix",
diff --git a/ruy/block_map.cc b/ruy/block_map.cc
index 75ab3ef..44e5039 100644
--- a/ruy/block_map.cc
+++ b/ruy/block_map.cc
@@ -126,18 +126,16 @@
}
}
-BlockMapTraversalOrder GetTraversalOrder(int rows, int cols, int depth,
- int lhs_scalar_size,
- int rhs_scalar_size,
- int local_data_cache_size,
- int shared_data_cache_size) {
+BlockMapTraversalOrder GetTraversalOrder(
+ int rows, int cols, int depth, int lhs_scalar_size, int rhs_scalar_size,
+ const CpuCacheParams& cpu_cache_params) {
static constexpr bool kAnyFractal =
RUY_OPT(FRACTAL_Z) | RUY_OPT(FRACTAL_U) | RUY_OPT(FRACTAL_HILBERT);
const int working_set_size =
(lhs_scalar_size * rows + rhs_scalar_size * cols) * depth;
- if (kAnyFractal && (working_set_size > local_data_cache_size)) {
+ if (kAnyFractal && (working_set_size > cpu_cache_params.local_cache_size)) {
if (RUY_OPT(FRACTAL_HILBERT) &&
- (working_set_size > shared_data_cache_size)) {
+ (working_set_size > cpu_cache_params.last_level_cache_size)) {
return BlockMapTraversalOrder::kFractalHilbert;
} else if (RUY_OPT(FRACTAL_U)) {
return BlockMapTraversalOrder::kFractalU;
@@ -245,7 +243,7 @@
int GetCacheLocalityScore(int block_size_log2, int rows, int cols, int depth,
int kernel_rows_log2, int kernel_cols_log2,
int lhs_scalar_size, int rhs_scalar_size,
- int local_data_cache_size) {
+ const CpuCacheParams& cpu_cache_params) {
// In the narrow case (e.g. matrix*vector), each byte of the big operand
// matrix (either LHS or RHS) is traversed only once, so any notion of data
// locality is irrelevant. Ignore the 'cache locality score' by forcing it to
@@ -259,7 +257,7 @@
(lhs_scalar_size * block_rows + rhs_scalar_size * block_cols) * depth;
const int total_read_bytes_log2 = ceil_log2(total_read_bytes);
const int nonlocality_log2 =
- total_read_bytes_log2 - floor_log2(local_data_cache_size);
+ total_read_bytes_log2 - floor_log2(cpu_cache_params.local_cache_size);
// The values here have been tuned on ARM Cortex-A55.
// We expect this to have to be tuned differently for other CPUs.
if (nonlocality_log2 < -1) {
@@ -317,8 +315,8 @@
void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
- int tentative_thread_count, int local_data_cache_size,
- int shared_data_cache_size, BlockMap* block_map) {
+ int tentative_thread_count,
+ const CpuCacheParams& cpu_cache_params, BlockMap* block_map) {
profiler::ScopeLabel label("MakeBlockMap");
#ifdef RUY_MAKEBLOCKMAP_DEBUG
@@ -343,9 +341,8 @@
RUY_DCHECK_EQ(rows % kernel_rows, 0);
RUY_DCHECK_EQ(cols % kernel_cols, 0);
- block_map->traversal_order =
- GetTraversalOrder(rows, cols, depth, lhs_scalar_size, rhs_scalar_size,
- local_data_cache_size, shared_data_cache_size);
+ block_map->traversal_order = GetTraversalOrder(
+ rows, cols, depth, lhs_scalar_size, rhs_scalar_size, cpu_cache_params);
int rows_rectangularness_log2 = 0;
int cols_rectangularness_log2 = 0;
@@ -383,7 +380,7 @@
block_size_log2, rows, cols, tentative_thread_count);
const int cache_locality_score = GetCacheLocalityScore(
block_size_log2, rows, cols, depth, kernel_rows_log2, kernel_cols_log2,
- lhs_scalar_size, rhs_scalar_size, local_data_cache_size);
+ lhs_scalar_size, rhs_scalar_size, cpu_cache_params);
const int kernel_amortization_score = GetKernelAmortizationScore(
block_size_log2, rows, cols, kernel_rows_log2, kernel_cols_log2);
const int score =
diff --git a/ruy/block_map.h b/ruy/block_map.h
index 8fdd702..8053916 100644
--- a/ruy/block_map.h
+++ b/ruy/block_map.h
@@ -16,6 +16,7 @@
#ifndef RUY_RUY_BLOCK_MAP_H_
#define RUY_RUY_BLOCK_MAP_H_
+#include "ruy/cpu_cache_params.h"
#include "ruy/side_pair.h"
namespace ruy {
@@ -104,18 +105,16 @@
// Returns the traversal order to be used for the given matrix multiplication
// parameters.
-BlockMapTraversalOrder GetTraversalOrder(int rows, int cols, int depth,
- int lhs_scalar_size,
- int rhs_scalar_size,
- int local_data_cache_size,
- int shared_data_cache_size);
+BlockMapTraversalOrder GetTraversalOrder(
+ int rows, int cols, int depth, int lhs_scalar_size, int rhs_scalar_size,
+ const CpuCacheParams& cpu_cache_params);
// Create a BlockMap suitable for tiling the destination matrix in a
// matrix multiplication with the given parameters.
void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
- int tentative_thread_count, int local_data_cache_size,
- int shared_data_cache_size, BlockMap* block_map);
+ int tentative_thread_count,
+ const CpuCacheParams& cpu_cache_params, BlockMap* block_map);
// Maps an integer index to a block position in the grid.
void GetBlockByIndex(const BlockMap& block_map, int index,
diff --git a/ruy/block_map_test.cc b/ruy/block_map_test.cc
index 68b80b4..8245a5c 100644
--- a/ruy/block_map_test.cc
+++ b/ruy/block_map_test.cc
@@ -21,9 +21,9 @@
#include <limits>
#include <vector>
-#include "ruy/cpu_cache_size.h"
+#include "ruy/cpu_cache_params.h"
#include "ruy/gtest_wrapper.h"
-#include "ruy/path.h"
+#include "ruy/platform.h"
#include "ruy/side_pair.h"
namespace ruy {
@@ -35,12 +35,16 @@
void MakeBlockMapTuningTest(int rows, int cols, int depth, int kernel_rows,
int kernel_cols, int lhs_scalar_size,
int rhs_scalar_size, int tentative_thread_count,
- Path path, int expected_num_blocks_base_log2,
+ int expected_num_blocks_base_log2,
int expected_rectangularness_log2) {
+ // Plausible Cortex-A55 cache sizes.
+ CpuCacheParams cpu_cache_params;
+ cpu_cache_params.local_cache_size = 128 * 1024;
+ cpu_cache_params.last_level_cache_size = 1024 * 1024;
BlockMap block_map;
MakeBlockMap(rows, cols, depth, kernel_rows, kernel_cols, lhs_scalar_size,
- rhs_scalar_size, tentative_thread_count,
- LocalDataCacheSize(path), SharedDataCacheSize(path), &block_map);
+ rhs_scalar_size, tentative_thread_count, cpu_cache_params,
+ &block_map);
EXPECT_EQ(block_map.num_blocks_base_log2, expected_num_blocks_base_log2);
EXPECT_EQ(std::min(block_map.rectangularness_log2[Side::kLhs],
block_map.rectangularness_log2[Side::kRhs]),
@@ -52,35 +56,31 @@
TEST(BlockMapTest, MakeBlockMapTuningTest8bitCubicShapesOneThreadNeonDotprod) {
MakeBlockMapTuningTest(32, 32, 32, 8, 8, 1, 1, /* tentative_thread_count */ 1,
- Path::kNeonDotprod,
/* expected_num_blocks_base_log2 */ 0,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(48, 48, 48, 8, 8, 1, 1, /* tentative_thread_count */ 1,
- Path::kNeonDotprod,
/* expected_num_blocks_base_log2 */ 0,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(64, 64, 64, 8, 8, 1, 1, /* tentative_thread_count */ 1,
- Path::kNeonDotprod,
/* expected_num_blocks_base_log2 */ 0,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(96, 96, 96, 8, 8, 1, 1, /* tentative_thread_count */ 1,
- Path::kNeonDotprod,
/* expected_num_blocks_base_log2 */ 0,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(128, 128, 128, 8, 8, 1, 1,
- /* tentative_thread_count */ 1, Path::kNeonDotprod,
+ /* tentative_thread_count */ 1,
/* expected_num_blocks_base_log2 */ 0,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(192, 192, 192, 8, 8, 1, 1,
- /* tentative_thread_count */ 1, Path::kNeonDotprod,
+ /* tentative_thread_count */ 1,
/* expected_num_blocks_base_log2 */ 0,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(256, 256, 256, 8, 8, 1, 1,
- /* tentative_thread_count */ 1, Path::kNeonDotprod,
+ /* tentative_thread_count */ 1,
/* expected_num_blocks_base_log2 */ 1,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(384, 384, 384, 8, 8, 1, 1,
- /* tentative_thread_count */ 1, Path::kNeonDotprod,
+ /* tentative_thread_count */ 1,
/* expected_num_blocks_base_log2 */ 1,
/* expected_rectangularness_log2 */ 0);
}
@@ -88,57 +88,53 @@
TEST(BlockMapTest,
MakeBlockMapTuningTest8bitCubicShapesFourThreadsNeonDotprod) {
MakeBlockMapTuningTest(32, 32, 32, 8, 8, 1, 1, /* tentative_thread_count */ 4,
- Path::kNeonDotprod,
/* expected_num_blocks_base_log2 */ 1,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(48, 48, 48, 8, 8, 1, 1, /* tentative_thread_count */ 4,
- Path::kNeonDotprod,
/* expected_num_blocks_base_log2 */ 1,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(64, 64, 64, 8, 8, 1, 1, /* tentative_thread_count */ 4,
- Path::kNeonDotprod,
/* expected_num_blocks_base_log2 */ 1,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(96, 96, 96, 8, 8, 1, 1, /* tentative_thread_count */ 4,
- Path::kNeonDotprod,
/* expected_num_blocks_base_log2 */ 1,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(128, 128, 128, 8, 8, 1, 1,
- /* tentative_thread_count */ 4, Path::kNeonDotprod,
+ /* tentative_thread_count */ 4,
/* expected_num_blocks_base_log2 */ 1,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(192, 192, 192, 8, 8, 1, 1,
- /* tentative_thread_count */ 4, Path::kNeonDotprod,
+ /* tentative_thread_count */ 4,
/* expected_num_blocks_base_log2 */ 1,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(256, 256, 256, 8, 8, 1, 1,
- /* tentative_thread_count */ 4, Path::kNeonDotprod,
+ /* tentative_thread_count */ 4,
/* expected_num_blocks_base_log2 */ 2,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(384, 384, 384, 8, 8, 1, 1,
- /* tentative_thread_count */ 4, Path::kNeonDotprod,
+ /* tentative_thread_count */ 4,
/* expected_num_blocks_base_log2 */ 2,
/* expected_rectangularness_log2 */ 0);
}
TEST(BlockMapTest, MakeBlockMapTuningTest32bit) {
MakeBlockMapTuningTest(256, 256, 256, 8, 8, 4, 4,
- /* tentative_thread_count */ 4, Path::kNeonDotprod,
+ /* tentative_thread_count */ 4,
/* expected_num_blocks_base_log2 */ 3,
/* expected_rectangularness_log2 */ 0);
MakeBlockMapTuningTest(4096, 4096, 4096, 8, 8, 4, 4,
- /* tentative_thread_count */ 4, Path::kNeonDotprod,
+ /* tentative_thread_count */ 4,
/* expected_num_blocks_base_log2 */ 7,
/* expected_rectangularness_log2 */ 0);
}
TEST(BlockMapTest, MakeBlockMapTuningTestRectangular) {
MakeBlockMapTuningTest(256, 16, 256, 8, 8, 1, 1,
- /* tentative_thread_count */ 1, Path::kNeonDotprod,
+ /* tentative_thread_count */ 1,
/* expected_num_blocks_base_log2 */ 0,
/* expected_rectangularness_log2 */ 3);
MakeBlockMapTuningTest(24, 2400, 256, 8, 8, 1, 1,
- /* tentative_thread_count */ 1, Path::kNeonDotprod,
+ /* tentative_thread_count */ 1,
/* expected_num_blocks_base_log2 */ 0,
/* expected_rectangularness_log2 */ 6);
}
diff --git a/ruy/cpu_cache_params.h b/ruy/cpu_cache_params.h
new file mode 100644
index 0000000..8c1cdaf
--- /dev/null
+++ b/ruy/cpu_cache_params.h
@@ -0,0 +1,81 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef RUY_RUY_CPU_CACHE_PARAMS_H_
+#define RUY_RUY_CPU_CACHE_PARAMS_H_
+
+namespace ruy {
+
+// Holds some information about a CPU's data caches.
+//
+// Meaning of 'local': a 'local' cache means a cache that is used by only one
+// CPU core, not shared with other cores. It might still be used by multiple
+// 'processors' in case of SMT as in Intel HyperThreading. CPUs often have
+// multiple levels of local cache, e.g. L1 and L2. We typically return the
+// larger one, the assumption being that even the larger one has substantially
+// lower latency than any higher (non-local) cache, however as noted below (*)
+// the implementation may choose to ignore a cache level.
+//
+// Meaning of 'last level': this refers to some higher cache level, typically
+// shared among multiple CPU cores, so we considered using the terminology
+// 'shared' instead of 'last_level'. However that created some confusion of its
+// own, as the meaning of 'shared' varies between CPUs, with some CPUs not
+// having any level of cache shared among all cores. That is why we stick with
+// the 'last_level' terminology, however with the following caveats:
+// 1. As noted below (*) the implementation may choose to ignore a cache
+// level, which could cause the 'last level' cache according to ruy not to be
+// the actual last level.
+// 2. On some systems-on-chip there is a 'last level' cache outside of the
+// last level cache in the CPU complex. We are not concerned with such SoC
+// caches in ruy.
+// 3. We haven't figured out how to amend our terminology to be meaningful
+// on NUMA architectures. NUMA hasn't been part of ruy's scope so far.
+//
+// (*) Note on ignoring certain cache levels:
+// The implementation may choose to ignore a cache if it's suspected not to
+// have compelling performance. This is true about all cache levels, but more
+// likely regarding the 'last level' cache. For example, a L4 cache may be
+// ignored if we believe that it's not the right latency/size compromise for us,
+// so on such a CPU, the L3 cache may be used as the 'last level' cache instead.
+//
+// (**) Note on CPUs with heterogeneous cores:
+// Some CPUs have multiple cores with different local caches. For example, some
+// ARM big.LITTLE CPUs have some CPU cores with L1=32k and L2=128k, and some
+// other CPU cores with L1=64k and L2=256k or even 512k. On such CPUs, the
+// fields in this struct refer to the minimum value over all cores. In other
+// words, we use conservative values that do not risk over-estimating local
+// cache sizes in case of a migration of our threads to smaller cores.
+//
+// An example:
+// On a Qualcomm S855 SoC, there are 8 CPU cores. All cores share a single L3
+// cache, and each core has L1 and L2 data caches:
+// - 4 cores have L1=32k, L2=128k.
+// - 3 cores have L1=64k, L2=256k.
+// - 1 core has L1=64k, L2=512k.
+// On such a system, we should have:
+// - local_cache_size=128k (the smallest L2 size).
+// - last_level_cache_size=size of the shared L3 cache.
+struct CpuCacheParams final {
+ // Minimum value (see (**)), over all cores, of the size in bytes of its local
+ // cache (see "Meaning of 'local'").
+ int local_cache_size = 0;
+ // Minimum value (see (**)), over all cores, of the size in bytes of its last
+ // level cache (see "Meaning of 'last level'").
+ int last_level_cache_size = 0;
+};
+
+} // namespace ruy
+
+#endif // RUY_RUY_CPU_CACHE_PARAMS_H_
diff --git a/ruy/cpu_cache_size.h b/ruy/cpu_cache_size.h
deleted file mode 100644
index 49f761c..0000000
--- a/ruy/cpu_cache_size.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright 2020 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef RUY_RUY_CPU_CACHE_SIZE_H_
-#define RUY_RUY_CPU_CACHE_SIZE_H_
-
-#include "ruy/path.h"
-#include "ruy/platform.h"
-
-namespace ruy {
-
-// LocalDataCacheSize returns a sane default size for each CPU core's local
-// data cache, i.e. the largest data cache that is local to that CPU core, not
-// shared with other cores. That allows coarse tuning of code that aims for
-// most of its memory accesses to hit such a typically fast data cache.
-//
-// SharedDataCacheSize returns a sane default size of the total data cache
-// accessible to each CPU, including any shared cache.
-//
-// For example, if we design tune this code for a ARM Cortex-A55 with a local L1
-// cache of 32k, a local L2 cache of 128k and a shared L3 cache of 1M,
-// LocalDataCacheSize should return 128k and SharedDataCacheSize
-// should return 1M.
-//
-// Ideally these values would be queried at runtime, and we should probably
-// do that on x86, but that is hard to do on ARM.
-#if RUY_PLATFORM_ARM_64
-inline int LocalDataCacheSize() { return 1 << 15; }
-inline int SharedDataCacheSize() { return 1 << 19; }
-#elif RUY_PLATFORM_ARM_32
-inline int LocalDataCacheSize() { return 1 << 14; }
-inline int SharedDataCacheSize() { return 1 << 18; }
-#elif RUY_PLATFORM_X86
-inline int LocalDataCacheSize() { return 1 << 17; }
-inline int SharedDataCacheSize() { return 1 << 21; }
-#else
-inline int LocalDataCacheSize() { return 1 << 14; }
-inline int SharedDataCacheSize() { return 1 << 18; }
-#endif
-// Variants taking a Path argument which acts
-// as a hint telling whether we're targeting more or less recent/powerful CPUs.
-inline int LocalDataCacheSize(Path path) {
-#if RUY_PLATFORM_ARM_64
- if (path == Path::kNeonDotprod) {
- // At the moment, the smallest CPU with dotprod is probably Cortex-A55 with
- // 128k L2 local cache.
- return 1 << 17;
- }
-#else
- (void)path;
-#endif
- return LocalDataCacheSize();
-}
-inline int SharedDataCacheSize(Path path) {
-#if RUY_PLATFORM_ARM_64
- if (path == Path::kNeonDotprod) {
- // At the moment, the smallest CPU with dotprod is probably Cortex-A55 with
- // 1M L3 shared cache.
- return 1 << 20;
- }
-#else
- (void)path;
-#endif
- return SharedDataCacheSize();
-}
-
-} // namespace ruy
-
-#endif // RUY_RUY_CPU_CACHE_SIZE_H_
diff --git a/ruy/cpuinfo.cc b/ruy/cpuinfo.cc
index 793ba7b..147cb17 100644
--- a/ruy/cpuinfo.cc
+++ b/ruy/cpuinfo.cc
@@ -1,15 +1,31 @@
#include "ruy/cpuinfo.h"
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+
+#include "ruy/check_macros.h"
+#include "ruy/cpu_cache_params.h"
#include "ruy/platform.h"
#define RUY_HAVE_CPUINFO (!(RUY_PLATFORM_PPC || RUY_PLATFORM_FUCHSIA))
#if RUY_HAVE_CPUINFO
-
#include <cpuinfo.h>
+#endif
namespace ruy {
+namespace {
+void MakeDummyCacheParams(CpuCacheParams* result) {
+ // Reasonable dummy values
+ result->local_cache_size = 32 * 1024;
+ result->last_level_cache_size = 512 * 1024;
+}
+} // end namespace
+
+#if RUY_HAVE_CPUINFO
+
CpuInfo::~CpuInfo() {
if (init_status_ == InitStatus::kInitialized) {
cpuinfo_deinitialize();
@@ -18,12 +34,67 @@
bool CpuInfo::EnsureInitialized() {
if (init_status_ == InitStatus::kNotYetAttempted) {
- init_status_ =
- cpuinfo_initialize() ? InitStatus::kInitialized : InitStatus::kFailed;
+ init_status_ = Initialize();
+ RUY_DCHECK_NE(init_status_, InitStatus::kNotYetAttempted);
}
return init_status_ == InitStatus::kInitialized;
}
+namespace {
+void QueryCacheParams(CpuCacheParams* cache_params) {
+ const int processors_count = cpuinfo_get_processors_count();
+ RUY_DCHECK_GT(processors_count, 0);
+ int overall_local_cache_size = std::numeric_limits<int>::max();
+ int overall_last_level_cache_size = std::numeric_limits<int>::max();
+ for (int i = 0; i < processors_count; i++) {
+ int local_cache_size = 0;
+ int last_level_cache_size = 0;
+ const cpuinfo_processor* processor = cpuinfo_get_processor(i);
+ // Loop over cache levels. Ignoring L4 for now: it seems that in CPUs that
+ // have L4, we would still prefer to stay in lower-latency L3.
+ for (const cpuinfo_cache* cache :
+ {processor->cache.l1d, processor->cache.l2, processor->cache.l3}) {
+ if (!cache) {
+ continue; // continue, not break, it is possible to have L1+L3 but no
+ // L2.
+ }
+ const bool is_local =
+ cpuinfo_get_processor(cache->processor_start)->core ==
+ cpuinfo_get_processor(cache->processor_start +
+ cache->processor_count - 1)
+ ->core;
+ if (is_local) {
+ local_cache_size = cache->size;
+ }
+ last_level_cache_size = cache->size;
+ }
+ // If no local cache was found, use the last-level cache.
+ if (!local_cache_size) {
+ local_cache_size = last_level_cache_size;
+ }
+ RUY_DCHECK_GT(local_cache_size, 0);
+ RUY_DCHECK_GT(last_level_cache_size, 0);
+ RUY_DCHECK_GE(last_level_cache_size, local_cache_size);
+ overall_local_cache_size =
+ std::min(overall_local_cache_size, local_cache_size);
+ overall_last_level_cache_size =
+ std::min(overall_last_level_cache_size, last_level_cache_size);
+ }
+ cache_params->local_cache_size = overall_local_cache_size;
+ cache_params->last_level_cache_size = overall_last_level_cache_size;
+}
+} // end namespace
+
+CpuInfo::InitStatus CpuInfo::Initialize() {
+ RUY_DCHECK_EQ(init_status_, InitStatus::kNotYetAttempted);
+ if (!cpuinfo_initialize()) {
+ MakeDummyCacheParams(&cache_params_);
+ return InitStatus::kFailed;
+ }
+ QueryCacheParams(&cache_params_);
+ return InitStatus::kInitialized;
+}
+
bool CpuInfo::NeonDotprod() {
return EnsureInitialized() && cpuinfo_has_arm_neon_dot();
}
@@ -44,18 +115,29 @@
return EnsureInitialized() && cpuinfo_has_x86_avx512vnni();
}
-} // namespace ruy
-
#else // not RUY_HAVE_CPUINFO
-namespace ruy {
CpuInfo::~CpuInfo() {}
-bool CpuInfo::EnsureInitialized() { return false; }
+bool CpuInfo::EnsureInitialized() {
+ if (init_status_ == InitStatus::kNotYetAttempted) {
+ MakeDummyCacheParams(&cache_params_);
+ init_status_ = InitStatus::kInitialized;
+ }
+ RUY_DCHECK_EQ(init_status_, InitStatus::kInitialized);
+ return true;
+}
bool CpuInfo::NeonDotprod() { return false; }
bool CpuInfo::Sse42() { return false; }
bool CpuInfo::Avx2() { return false; }
bool CpuInfo::Avx512() { return false; }
bool CpuInfo::AvxVnni() { return false; }
-} // namespace ruy
#endif
+
+const CpuCacheParams& CpuInfo::CacheParams() {
+ EnsureInitialized();
+ // On failure, EnsureInitialized leaves dummy values in cache_params_.
+ return cache_params_;
+}
+
+} // namespace ruy
diff --git a/ruy/cpuinfo.h b/ruy/cpuinfo.h
index 0a3de28..de2cbc7 100644
--- a/ruy/cpuinfo.h
+++ b/ruy/cpuinfo.h
@@ -16,6 +16,8 @@
#ifndef RUY_RUY_CPUINFO_H_
#define RUY_RUY_CPUINFO_H_
+#include "ruy/cpu_cache_params.h"
+
namespace ruy {
// Wraps the functionality that ruy needs from the cpuinfo library.
@@ -33,14 +35,22 @@
bool Avx512();
bool AvxVnni();
+ // Common features
+ const CpuCacheParams& CacheParams();
+
private:
enum class InitStatus {
kNotYetAttempted,
kInitialized,
kFailed,
};
+
InitStatus init_status_ = InitStatus::kNotYetAttempted;
+ CpuCacheParams cache_params_;
+
bool EnsureInitialized();
+ InitStatus Initialize();
+
CpuInfo(const CpuInfo&) = delete;
};
diff --git a/ruy/dispatch.h b/ruy/dispatch.h
index 2f75a9c..144cff6 100644
--- a/ruy/dispatch.h
+++ b/ruy/dispatch.h
@@ -199,9 +199,6 @@
params->path = ThePath;
- params->local_data_cache_size = MulParamsType::local_data_cache_size();
- params->shared_data_cache_size = MulParamsType::shared_data_cache_size();
-
CreatePackedMatrix<LhsScalar, PackedLhsScalar>(
Side::kLhs, ToKernelLayout<LhsKernelLayout>(), params);
CreatePackedMatrix<RhsScalar, PackedRhsScalar>(
diff --git a/ruy/mul_params.h b/ruy/mul_params.h
index eb1abed..bb9546f 100644
--- a/ruy/mul_params.h
+++ b/ruy/mul_params.h
@@ -19,7 +19,6 @@
#include <limits>
#include <type_traits>
-#include "ruy/cpu_cache_size.h"
#include "ruy/matrix.h"
namespace ruy {
@@ -134,15 +133,6 @@
// Used for testing of various kernel layouts.
using StandardCppKernelLhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
using StandardCppKernelRhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
- // Returns (a reasonable estimate of) the local CPU cache size.
- // See ruy::LocalDataCacheSize() which returns some coarse, sane default for
- // each CPU architecture.
- // This may be overridden, either to provide more accurate/runtime values,
- // or to test with other values to let testcases have more coverage.
- static int local_data_cache_size() { return LocalDataCacheSize(); }
- // Same as local_data_cache_size but for the total data cache size accessible
- // to each CPU core. See ruy::SharedDataCacheSize().
- static int shared_data_cache_size() { return SharedDataCacheSize(); }
};
template <typename tAccumScalar, typename tDstScalar>
diff --git a/ruy/test_special_mul_params.cc b/ruy/test_special_mul_params.cc
index 0249150..d76e8a1 100644
--- a/ruy/test_special_mul_params.cc
+++ b/ruy/test_special_mul_params.cc
@@ -41,8 +41,6 @@
struct StandardCppKernelLayoutMulParams : MulParams<AccumScalar, DstScalar> {
using StandardCppKernelLhsLayout = LhsKernelLayout;
using StandardCppKernelRhsLayout = RhsKernelLayout;
- static int local_data_cache_size() { return 1; }
- static int shared_data_cache_size() { return 1; }
};
using LhsScalar = RUY_TEST_LHSSCALAR;
diff --git a/ruy/trmul.cc b/ruy/trmul.cc
index c175f01..94d0e77 100644
--- a/ruy/trmul.cc
+++ b/ruy/trmul.cc
@@ -26,6 +26,8 @@
#include "ruy/block_map.h"
#include "ruy/check_macros.h"
#include "ruy/common.h"
+#include "ruy/cpu_cache_params.h"
+#include "ruy/cpuinfo.h"
#include "ruy/ctx.h"
#include "ruy/mat.h"
#include "ruy/matrix.h"
@@ -70,24 +72,19 @@
const Tuning tuning = tuning_resolver->Resolve();
const int num_blocks = NumBlocks(block_map);
- SidePair<int> block;
- SidePair<int> start;
- SidePair<int> end;
// Each thread starts by initially reserving the block whose id
// is the thread id.
int block_id = thread_id;
while (block_id < num_blocks) {
- // Reserve the next block to handle. In order to hide the latency
- // (typically comparable to an access to the level of data cache that
- // is shared among CPU cores, e.g. 60 cycles on an ARM CPU as of 2019)
- // of this atomic operation, we structure this code so as to avoid
- // immediately depending on the `next_n` result.
+ // Reserve the next block to handle, hiding the latency of this atomic op.
const int next_block_id =
atomic_block_id->fetch_add(1, std::memory_order_relaxed);
// Get coordinates of the current block to handle, in "block space".
+ SidePair<int> block;
GetBlockByIndex(block_map, block_id, &block);
// Get coordinates of the current block to handle, in matrix space.
+ SidePair<int> start, end;
GetBlockMatrixCoords(block_map, block, &start, &end);
// Maybe pack the current LHS/RHS block, if not already packed.
EnsurePacked(block, start, end, tuning);
@@ -244,12 +241,11 @@
LoopStructure GetLoopStructure(int tentative_thread_count, int rows, int cols,
int depth, int lhs_scalar_size,
- int rhs_scalar_size, int local_data_cache_size,
- int shared_data_cache_size) {
+ int rhs_scalar_size,
+ const CpuCacheParams& cpu_cache_params) {
if (tentative_thread_count == 1) {
- const BlockMapTraversalOrder traversal_order =
- GetTraversalOrder(rows, cols, depth, lhs_scalar_size, rhs_scalar_size,
- local_data_cache_size, shared_data_cache_size);
+ const BlockMapTraversalOrder traversal_order = GetTraversalOrder(
+ rows, cols, depth, lhs_scalar_size, rhs_scalar_size, cpu_cache_params);
// If we are in the GEMV case or the block_map would be using linear
// traversal anyway, use the simple loop.
if ((cols == 1) || traversal_order == BlockMapTraversalOrder::kLinear) {
@@ -277,10 +273,10 @@
const int depth = lhs.layout.rows;
const int tentative_thread_count = GetThreadCount(ctx, rows, cols, depth);
+ const auto& cpu_cache_params = ctx->mutable_cpuinfo()->CacheParams();
const auto loop_structure = GetLoopStructure(
tentative_thread_count, rows, cols, depth, lhs.data_type.size,
- rhs.data_type.size, params->local_data_cache_size,
- params->shared_data_cache_size);
+ rhs.data_type.size, cpu_cache_params);
Allocator* allocator = ctx->GetMainAllocator();
// Allocate packed matrices
@@ -319,8 +315,7 @@
MakeBlockMap(packed_lhs.layout.cols, packed_rhs.layout.cols, depth,
packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols,
packed_lhs.data_type.size, packed_rhs.data_type.size,
- tentative_thread_count, params->local_data_cache_size,
- params->shared_data_cache_size, &block_map);
+ tentative_thread_count, cpu_cache_params, &block_map);
// Initialize per-thread state.
const int thread_count = block_map.thread_count;
diff --git a/ruy/trmul_params.h b/ruy/trmul_params.h
index fecef99..877c1f0 100644
--- a/ruy/trmul_params.h
+++ b/ruy/trmul_params.h
@@ -43,11 +43,6 @@
// cache sizes when not runtime-detectable.
Path path;
- // See MulParamsType::local_data_cache_size().
- int local_data_cache_size = 0;
- // See MulParamsType::shared_data_cache_size().
- int shared_data_cache_size = 0;
-
// Function pointers to type-erased entry points for kernels and packers.
SidePair<RunPackFn*> run_pack;
RunKernelFn* run_kernel = nullptr;