Replace the compile-time CPU cache size defaults of cpu_cache_size.h with runtime cache detection: a new CpuCacheParams struct (cpu_cache_params.h) is populated by CpuInfo from the cpuinfo library, and GetTraversalOrder, MakeBlockMap and GetLoopStructure now take a const CpuCacheParams& instead of separate local/shared cache-size ints. The local_data_cache_size / shared_data_cache_size customization points are removed from MulParams and TrMulParams. Also add the missing copts = ruy_copts() to several cc_test targets.

Minor cleanup in the TrMulTask main loop:
Shortened a comment to the length it deserves; in ruy's most performance-critical loop, conciseness matters, and the optimization it describes is not important enough to justify the longer explanation.
Moved local variable definitions to their point of first use, making it clear that their values are not carried over across loop iterations.
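
For illustration, a simplified sketch (see the trmul.cc and block_map_test.cc hunks below for the real code) of the two ways cache parameters now flow in:

    // Inside ruy, TrMul queries the runtime-detected parameters...
    const CpuCacheParams& cpu_cache_params =
        ctx->mutable_cpuinfo()->CacheParams();
    // ...and passes them on to GetLoopStructure() and MakeBlockMap().

    // Tests, and any caller wanting fixed values, construct them directly:
    CpuCacheParams cpu_cache_params;
    cpu_cache_params.local_cache_size = 128 * 1024;        // e.g. Cortex-A55 L2
    cpu_cache_params.last_level_cache_size = 1024 * 1024;  // e.g. shared L3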

PiperOrigin-RevId: 314399108
diff --git a/ruy/BUILD b/ruy/BUILD
index 9bd40dc..6db6a4d 100644
--- a/ruy/BUILD
+++ b/ruy/BUILD
@@ -92,6 +92,7 @@
 cc_test(
     name = "wait_test",
     srcs = ["wait_test.cc"],
+    copts = ruy_copts(),
     linkopts = ruy_linkopts_thread_standard_library(),
     deps = [
         ":gtest_wrapper",
@@ -110,6 +111,7 @@
 cc_test(
     name = "size_util_test",
     srcs = ["size_util_test.cc"],
+    copts = ruy_copts(),
     deps = [
         ":gtest_wrapper",
         ":size_util",
@@ -162,6 +164,7 @@
 cc_test(
     name = "tune_test",
     srcs = ["tune_test.cc"],
+    copts = ruy_copts(),
     deps = [
         ":gtest_wrapper",
         ":tune",
@@ -171,6 +174,7 @@
 cc_test(
     name = "prepacked_cache_test",
     srcs = ["prepacked_cache_test.cc"],
+    copts = ruy_copts(),
     deps = [
         ":context",
         ":context_get_ctx",
@@ -210,6 +214,7 @@
 cc_test(
     name = "allocator_test",
     srcs = ["allocator_test.cc"],
+    copts = ruy_copts(),
     deps = [
         ":allocator",
         ":gtest_wrapper",
@@ -234,6 +239,7 @@
     copts = ruy_copts(),
     deps = [
         ":check_macros",
+        ":cpu_cache_params",
         ":opt_set",
         ":side_pair",
         ":size_util",
@@ -244,11 +250,13 @@
 cc_test(
     name = "block_map_test",
     srcs = ["block_map_test.cc"],
+    copts = ruy_copts(),
     deps = [
         ":block_map",
-        ":cpu_cache_size",
+        ":cpu_cache_params",
         ":gtest_wrapper",
         ":path",
+        ":platform",
         ":side_pair",
     ],
 )
@@ -290,6 +298,12 @@
 )
 
 cc_library(
+    name = "cpu_cache_params",
+    hdrs = ["cpu_cache_params.h"],
+    copts = ruy_copts(),
+)
+
+cc_library(
     name = "cpuinfo",
     srcs = [
         "cpuinfo.cc",
@@ -305,8 +319,11 @@
                     "-Wno-undef",
                 ],
             }),
-    deps = [":platform"] + select({
-        # cpuinfo does not build on ppc.
+    deps = [
+        ":platform",
+        ":check_macros",
+        ":cpu_cache_params",
+    ] + select({
         ":ppc": [],
         ":fuchsia": [],
         "//conditions:default": ["@cpuinfo"],
@@ -325,16 +342,6 @@
 )
 
 cc_library(
-    name = "cpu_cache_size",
-    hdrs = ["cpu_cache_size.h"],
-    copts = ruy_copts(),
-    deps = [
-        ":path",
-        ":platform",
-    ],
-)
-
-cc_library(
     name = "matrix",
     hdrs = ["matrix.h"],
     copts = ruy_copts(),
@@ -345,6 +352,7 @@
 cc_test(
     name = "matrix_test",
     srcs = ["matrix_test.cc"],
+    copts = ruy_copts(),
     deps = [
         ":gtest_wrapper",
         ":matrix",
@@ -357,7 +365,6 @@
     copts = ruy_copts(),
     visibility = ["//visibility:public"],
     deps = [
-        ":cpu_cache_size",
         ":matrix",
     ],
 )
@@ -365,6 +372,7 @@
 cc_test(
     name = "mul_params_test",
     srcs = ["mul_params_test.cc"],
+    copts = ruy_copts(),
     deps = [
         ":gtest_wrapper",
         ":mul_params",
@@ -772,6 +780,7 @@
 cc_test(
     name = "context_test",
     srcs = ["context_test.cc"],
+    copts = ruy_copts(),
     deps = [
         ":context",
         ":gtest_wrapper",
@@ -833,6 +842,7 @@
 cc_test(
     name = "ctx_test",
     srcs = ["ctx_test.cc"],
+    copts = ruy_copts(),
     deps = [
         ":ctx",
         ":gtest_wrapper",
@@ -862,6 +872,8 @@
         ":block_map",
         ":check_macros",
         ":common",
+        ":cpu_cache_params",
+        ":cpuinfo",
         ":ctx",
         ":mat",
         ":matrix",
diff --git a/ruy/block_map.cc b/ruy/block_map.cc
index 75ab3ef..44e5039 100644
--- a/ruy/block_map.cc
+++ b/ruy/block_map.cc
@@ -126,18 +126,16 @@
   }
 }
 
-BlockMapTraversalOrder GetTraversalOrder(int rows, int cols, int depth,
-                                         int lhs_scalar_size,
-                                         int rhs_scalar_size,
-                                         int local_data_cache_size,
-                                         int shared_data_cache_size) {
+BlockMapTraversalOrder GetTraversalOrder(
+    int rows, int cols, int depth, int lhs_scalar_size, int rhs_scalar_size,
+    const CpuCacheParams& cpu_cache_params) {
   static constexpr bool kAnyFractal =
       RUY_OPT(FRACTAL_Z) | RUY_OPT(FRACTAL_U) | RUY_OPT(FRACTAL_HILBERT);
   const int working_set_size =
       (lhs_scalar_size * rows + rhs_scalar_size * cols) * depth;
-  if (kAnyFractal && (working_set_size > local_data_cache_size)) {
+  if (kAnyFractal && (working_set_size > cpu_cache_params.local_cache_size)) {
     if (RUY_OPT(FRACTAL_HILBERT) &&
-        (working_set_size > shared_data_cache_size)) {
+        (working_set_size > cpu_cache_params.last_level_cache_size)) {
       return BlockMapTraversalOrder::kFractalHilbert;
     } else if (RUY_OPT(FRACTAL_U)) {
       return BlockMapTraversalOrder::kFractalU;
@@ -245,7 +243,7 @@
 int GetCacheLocalityScore(int block_size_log2, int rows, int cols, int depth,
                           int kernel_rows_log2, int kernel_cols_log2,
                           int lhs_scalar_size, int rhs_scalar_size,
-                          int local_data_cache_size) {
+                          const CpuCacheParams& cpu_cache_params) {
   // In the narrow case (e.g. matrix*vector), each byte of the big operand
   // matrix (either LHS or RHS) is traversed only once, so any notion of data
   // locality is irrelevant. Ignore the 'cache locality score' by forcing it to
@@ -259,7 +257,7 @@
       (lhs_scalar_size * block_rows + rhs_scalar_size * block_cols) * depth;
   const int total_read_bytes_log2 = ceil_log2(total_read_bytes);
   const int nonlocality_log2 =
-      total_read_bytes_log2 - floor_log2(local_data_cache_size);
+      total_read_bytes_log2 - floor_log2(cpu_cache_params.local_cache_size);
   // The values here have been tuned on ARM Cortex-A55.
   // We expect this to have to be tuned differently for other CPUs.
   if (nonlocality_log2 < -1) {
@@ -317,8 +315,8 @@
 
 void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                   int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
-                  int tentative_thread_count, int local_data_cache_size,
-                  int shared_data_cache_size, BlockMap* block_map) {
+                  int tentative_thread_count,
+                  const CpuCacheParams& cpu_cache_params, BlockMap* block_map) {
   profiler::ScopeLabel label("MakeBlockMap");
 
 #ifdef RUY_MAKEBLOCKMAP_DEBUG
@@ -343,9 +341,8 @@
   RUY_DCHECK_EQ(rows % kernel_rows, 0);
   RUY_DCHECK_EQ(cols % kernel_cols, 0);
 
-  block_map->traversal_order =
-      GetTraversalOrder(rows, cols, depth, lhs_scalar_size, rhs_scalar_size,
-                        local_data_cache_size, shared_data_cache_size);
+  block_map->traversal_order = GetTraversalOrder(
+      rows, cols, depth, lhs_scalar_size, rhs_scalar_size, cpu_cache_params);
 
   int rows_rectangularness_log2 = 0;
   int cols_rectangularness_log2 = 0;
@@ -383,7 +380,7 @@
         block_size_log2, rows, cols, tentative_thread_count);
     const int cache_locality_score = GetCacheLocalityScore(
         block_size_log2, rows, cols, depth, kernel_rows_log2, kernel_cols_log2,
-        lhs_scalar_size, rhs_scalar_size, local_data_cache_size);
+        lhs_scalar_size, rhs_scalar_size, cpu_cache_params);
     const int kernel_amortization_score = GetKernelAmortizationScore(
         block_size_log2, rows, cols, kernel_rows_log2, kernel_cols_log2);
     const int score =
diff --git a/ruy/block_map.h b/ruy/block_map.h
index 8fdd702..8053916 100644
--- a/ruy/block_map.h
+++ b/ruy/block_map.h
@@ -16,6 +16,7 @@
 #ifndef RUY_RUY_BLOCK_MAP_H_
 #define RUY_RUY_BLOCK_MAP_H_
 
+#include "ruy/cpu_cache_params.h"
 #include "ruy/side_pair.h"
 
 namespace ruy {
@@ -104,18 +105,16 @@
 
 // Returns the traversal order to be used for the given matrix multiplication
 // parameters.
-BlockMapTraversalOrder GetTraversalOrder(int rows, int cols, int depth,
-                                         int lhs_scalar_size,
-                                         int rhs_scalar_size,
-                                         int local_data_cache_size,
-                                         int shared_data_cache_size);
+BlockMapTraversalOrder GetTraversalOrder(
+    int rows, int cols, int depth, int lhs_scalar_size, int rhs_scalar_size,
+    const CpuCacheParams& cpu_cache_params);
 
 // Create a BlockMap suitable for tiling the destination matrix in a
 // matrix multiplication with the given parameters.
 void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                   int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
-                  int tentative_thread_count, int local_data_cache_size,
-                  int shared_data_cache_size, BlockMap* block_map);
+                  int tentative_thread_count,
+                  const CpuCacheParams& cpu_cache_params, BlockMap* block_map);
 
 // Maps an integer index to a block position in the grid.
 void GetBlockByIndex(const BlockMap& block_map, int index,
diff --git a/ruy/block_map_test.cc b/ruy/block_map_test.cc
index 68b80b4..8245a5c 100644
--- a/ruy/block_map_test.cc
+++ b/ruy/block_map_test.cc
@@ -21,9 +21,9 @@
 #include <limits>
 #include <vector>
 
-#include "ruy/cpu_cache_size.h"
+#include "ruy/cpu_cache_params.h"
 #include "ruy/gtest_wrapper.h"
-#include "ruy/path.h"
+#include "ruy/platform.h"
 #include "ruy/side_pair.h"
 
 namespace ruy {
@@ -35,12 +35,16 @@
 void MakeBlockMapTuningTest(int rows, int cols, int depth, int kernel_rows,
                             int kernel_cols, int lhs_scalar_size,
                             int rhs_scalar_size, int tentative_thread_count,
-                            Path path, int expected_num_blocks_base_log2,
+                            int expected_num_blocks_base_log2,
                             int expected_rectangularness_log2) {
+  // Plausible Cortex-A55 cache sizes.
+  CpuCacheParams cpu_cache_params;
+  cpu_cache_params.local_cache_size = 128 * 1024;
+  cpu_cache_params.last_level_cache_size = 1024 * 1024;
   BlockMap block_map;
   MakeBlockMap(rows, cols, depth, kernel_rows, kernel_cols, lhs_scalar_size,
-               rhs_scalar_size, tentative_thread_count,
-               LocalDataCacheSize(path), SharedDataCacheSize(path), &block_map);
+               rhs_scalar_size, tentative_thread_count, cpu_cache_params,
+               &block_map);
   EXPECT_EQ(block_map.num_blocks_base_log2, expected_num_blocks_base_log2);
   EXPECT_EQ(std::min(block_map.rectangularness_log2[Side::kLhs],
                      block_map.rectangularness_log2[Side::kRhs]),
@@ -52,35 +56,31 @@
 
 TEST(BlockMapTest, MakeBlockMapTuningTest8bitCubicShapesOneThreadNeonDotprod) {
   MakeBlockMapTuningTest(32, 32, 32, 8, 8, 1, 1, /* tentative_thread_count */ 1,
-                         Path::kNeonDotprod,
                          /* expected_num_blocks_base_log2 */ 0,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(48, 48, 48, 8, 8, 1, 1, /* tentative_thread_count */ 1,
-                         Path::kNeonDotprod,
                          /* expected_num_blocks_base_log2 */ 0,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(64, 64, 64, 8, 8, 1, 1, /* tentative_thread_count */ 1,
-                         Path::kNeonDotprod,
                          /* expected_num_blocks_base_log2 */ 0,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(96, 96, 96, 8, 8, 1, 1, /* tentative_thread_count */ 1,
-                         Path::kNeonDotprod,
                          /* expected_num_blocks_base_log2 */ 0,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(128, 128, 128, 8, 8, 1, 1,
-                         /* tentative_thread_count */ 1, Path::kNeonDotprod,
+                         /* tentative_thread_count */ 1,
                          /* expected_num_blocks_base_log2 */ 0,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(192, 192, 192, 8, 8, 1, 1,
-                         /* tentative_thread_count */ 1, Path::kNeonDotprod,
+                         /* tentative_thread_count */ 1,
                          /* expected_num_blocks_base_log2 */ 0,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(256, 256, 256, 8, 8, 1, 1,
-                         /* tentative_thread_count */ 1, Path::kNeonDotprod,
+                         /* tentative_thread_count */ 1,
                          /* expected_num_blocks_base_log2 */ 1,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(384, 384, 384, 8, 8, 1, 1,
-                         /* tentative_thread_count */ 1, Path::kNeonDotprod,
+                         /* tentative_thread_count */ 1,
                          /* expected_num_blocks_base_log2 */ 1,
                          /* expected_rectangularness_log2 */ 0);
 }
@@ -88,57 +88,53 @@
 TEST(BlockMapTest,
      MakeBlockMapTuningTest8bitCubicShapesFourThreadsNeonDotprod) {
   MakeBlockMapTuningTest(32, 32, 32, 8, 8, 1, 1, /* tentative_thread_count */ 4,
-                         Path::kNeonDotprod,
                          /* expected_num_blocks_base_log2 */ 1,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(48, 48, 48, 8, 8, 1, 1, /* tentative_thread_count */ 4,
-                         Path::kNeonDotprod,
                          /* expected_num_blocks_base_log2 */ 1,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(64, 64, 64, 8, 8, 1, 1, /* tentative_thread_count */ 4,
-                         Path::kNeonDotprod,
                          /* expected_num_blocks_base_log2 */ 1,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(96, 96, 96, 8, 8, 1, 1, /* tentative_thread_count */ 4,
-                         Path::kNeonDotprod,
                          /* expected_num_blocks_base_log2 */ 1,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(128, 128, 128, 8, 8, 1, 1,
-                         /* tentative_thread_count */ 4, Path::kNeonDotprod,
+                         /* tentative_thread_count */ 4,
                          /* expected_num_blocks_base_log2 */ 1,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(192, 192, 192, 8, 8, 1, 1,
-                         /* tentative_thread_count */ 4, Path::kNeonDotprod,
+                         /* tentative_thread_count */ 4,
                          /* expected_num_blocks_base_log2 */ 1,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(256, 256, 256, 8, 8, 1, 1,
-                         /* tentative_thread_count */ 4, Path::kNeonDotprod,
+                         /* tentative_thread_count */ 4,
                          /* expected_num_blocks_base_log2 */ 2,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(384, 384, 384, 8, 8, 1, 1,
-                         /* tentative_thread_count */ 4, Path::kNeonDotprod,
+                         /* tentative_thread_count */ 4,
                          /* expected_num_blocks_base_log2 */ 2,
                          /* expected_rectangularness_log2 */ 0);
 }
 
 TEST(BlockMapTest, MakeBlockMapTuningTest32bit) {
   MakeBlockMapTuningTest(256, 256, 256, 8, 8, 4, 4,
-                         /* tentative_thread_count */ 4, Path::kNeonDotprod,
+                         /* tentative_thread_count */ 4,
                          /* expected_num_blocks_base_log2 */ 3,
                          /* expected_rectangularness_log2 */ 0);
   MakeBlockMapTuningTest(4096, 4096, 4096, 8, 8, 4, 4,
-                         /* tentative_thread_count */ 4, Path::kNeonDotprod,
+                         /* tentative_thread_count */ 4,
                          /* expected_num_blocks_base_log2 */ 7,
                          /* expected_rectangularness_log2 */ 0);
 }
 
 TEST(BlockMapTest, MakeBlockMapTuningTestRectangular) {
   MakeBlockMapTuningTest(256, 16, 256, 8, 8, 1, 1,
-                         /* tentative_thread_count */ 1, Path::kNeonDotprod,
+                         /* tentative_thread_count */ 1,
                          /* expected_num_blocks_base_log2 */ 0,
                          /* expected_rectangularness_log2 */ 3);
   MakeBlockMapTuningTest(24, 2400, 256, 8, 8, 1, 1,
-                         /* tentative_thread_count */ 1, Path::kNeonDotprod,
+                         /* tentative_thread_count */ 1,
                          /* expected_num_blocks_base_log2 */ 0,
                          /* expected_rectangularness_log2 */ 6);
 }
diff --git a/ruy/cpu_cache_params.h b/ruy/cpu_cache_params.h
new file mode 100644
index 0000000..8c1cdaf
--- /dev/null
+++ b/ruy/cpu_cache_params.h
@@ -0,0 +1,81 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef RUY_RUY_CPU_CACHE_PARAMS_H_
+#define RUY_RUY_CPU_CACHE_PARAMS_H_
+
+namespace ruy {
+
+// Holds some information about a CPU's data caches.
+//
+// Meaning of 'local': a 'local' cache means a cache that is used by only one
+// CPU core, not shared with other cores. It might still be used by multiple
+// 'processors' in case of SMT as in Intel HyperThreading. CPUs often have
+// multiple levels of local cache, e.g. L1 and L2. We typically return the
+// larger one, the assumption being that even the larger one has substantially
+// lower latency than any higher (non-local) cache, however as noted below (*)
+// the implementation may choose to ignore a cache level.
+//
+// Meaning of 'last level': this refers to some higher cache level, typically
+// shared among multiple CPU cores, so we considered using the terminology
+// 'shared' instead of 'last_level'. However that created some confusion of its
+// own, as the meaning of 'shared' varies between CPUs, with some CPUs not
+// having any level of cache shared among all cores. That is why we stick with
+// the 'last_level' terminology, however with the following caveats:
+//   1. As noted below (*) the implementation may choose to ignore a cache
+// level, which could cause the 'last level' cache according to ruy not to be
+// the actual last level.
+//   2. On some systems-on-chip there is a 'last level' cache outside of the
+// last level cache in the CPU complex. We are not concerned with such SoC
+// caches in ruy.
+//   3. We haven't figured out how to amend our terminology to be meaningful
+// on NUMA architectures. NUMA hasn't been part of ruy's scope so far.
+//
+// (*) Note on ignoring certain cache levels:
+// The implementation may choose to ignore a cache if it's suspected not to
+// have compelling performance. This is true about all cache levels, but more
+// likely regarding the 'last level' cache. For example, a L4 cache may be
+// ignored if we believe that it's not the right latency/size compromise for us,
+// so on such a CPU, the L3 cache may be used as the 'last level' cache instead.
+//
+// (**) Note on CPUs with heterogeneous cores:
+// Some CPUs have multiple cores with different local caches. For example, some
+// ARM big.LITTLE CPUs have some CPU cores with L1=32k and L2=128k, and some
+// other CPU cores with L1=64k and L2=256k or even 512k. On such CPUs, the
+// fields in this struct refer to the minimum value over all cores. In other
+// words, we use conservative values that do not risk over-estimating local
+// cache sizes in case of a migration of our threads to smaller cores.
+//
+// An example:
+// On a Qualcomm S855 SoC, there are 8 CPU cores. All cores share a single L3
+// cache, and each core has L1 and L2 data caches:
+// - 4 cores have L1=32k, L2=128k.
+// - 3 cores have L1=64k, L2=256k.
+// - 1 core has   L1=64k, L2=512k.
+// On such a system, we should have:
+// - local_cache_size=128k  (the smallest L2 size).
+// - last_level_cache_size=size of the shared L3 cache.
+struct CpuCacheParams final {
+  // Minimum value (see (**)), over all cores, of the size in bytes of its local
+  // cache (see "Meaning of 'local'").
+  int local_cache_size = 0;
+  // Minimum value (see (**)), over all cores, of the size in bytes of its last
+  // level cache (see "Meaning of 'last level'").
+  int last_level_cache_size = 0;
+};
+
+}  // namespace ruy
+
+#endif  // RUY_RUY_CPU_CACHE_PARAMS_H_
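
To make the conservative min-over-cores rule described above concrete, here is a small sketch (a hypothetical helper, not part of this change, using only the struct defined in this header) applied to the S855 example:

    // Hypothetical helper, for illustration only: take the conservative
    // (minimum) value over all cores, as the header comment prescribes.
    #include <algorithm>
    #include <limits>
    #include <vector>

    #include "ruy/cpu_cache_params.h"

    ruy::CpuCacheParams MinOverCores(
        const std::vector<ruy::CpuCacheParams>& per_core) {
      ruy::CpuCacheParams result;
      result.local_cache_size = std::numeric_limits<int>::max();
      result.last_level_cache_size = std::numeric_limits<int>::max();
      for (const ruy::CpuCacheParams& core : per_core) {
        result.local_cache_size =
            std::min(result.local_cache_size, core.local_cache_size);
        result.last_level_cache_size =
            std::min(result.last_level_cache_size, core.last_level_cache_size);
      }
      return result;
    }
    // S855: per-core local (L2) sizes are {128k x4, 256k x3, 512k x1}, so
    // local_cache_size resolves to 128k; every core sees the same shared L3,
    // so last_level_cache_size is the L3 size.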
diff --git a/ruy/cpu_cache_size.h b/ruy/cpu_cache_size.h
deleted file mode 100644
index 49f761c..0000000
--- a/ruy/cpu_cache_size.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright 2020 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef RUY_RUY_CPU_CACHE_SIZE_H_
-#define RUY_RUY_CPU_CACHE_SIZE_H_
-
-#include "ruy/path.h"
-#include "ruy/platform.h"
-
-namespace ruy {
-
-// LocalDataCacheSize returns a sane default size for each CPU core's local
-// data cache, i.e. the largest data cache that is local to that CPU core, not
-// shared with other cores. That allows coarse tuning of code that aims for
-// most of its memory accesses to hit such a typically fast data cache.
-//
-// SharedDataCacheSize returns a sane default size of the total data cache
-// accessible to each CPU, including any shared cache.
-//
-// For example, if we design tune this code for a ARM Cortex-A55 with a local L1
-// cache of 32k, a local L2 cache of 128k and a shared L3 cache of 1M,
-// LocalDataCacheSize should return 128k and SharedDataCacheSize
-// should return 1M.
-//
-// Ideally these values would be queried at runtime, and we should probably
-// do that on x86, but that is hard to do on ARM.
-#if RUY_PLATFORM_ARM_64
-inline int LocalDataCacheSize() { return 1 << 15; }
-inline int SharedDataCacheSize() { return 1 << 19; }
-#elif RUY_PLATFORM_ARM_32
-inline int LocalDataCacheSize() { return 1 << 14; }
-inline int SharedDataCacheSize() { return 1 << 18; }
-#elif RUY_PLATFORM_X86
-inline int LocalDataCacheSize() { return 1 << 17; }
-inline int SharedDataCacheSize() { return 1 << 21; }
-#else
-inline int LocalDataCacheSize() { return 1 << 14; }
-inline int SharedDataCacheSize() { return 1 << 18; }
-#endif
-// Variants taking a Path argument which acts
-// as a hint telling whether we're targeting more or less recent/powerful CPUs.
-inline int LocalDataCacheSize(Path path) {
-#if RUY_PLATFORM_ARM_64
-  if (path == Path::kNeonDotprod) {
-    // At the moment, the smallest CPU with dotprod is probably Cortex-A55 with
-    // 128k L2 local cache.
-    return 1 << 17;
-  }
-#else
-  (void)path;
-#endif
-  return LocalDataCacheSize();
-}
-inline int SharedDataCacheSize(Path path) {
-#if RUY_PLATFORM_ARM_64
-  if (path == Path::kNeonDotprod) {
-    // At the moment, the smallest CPU with dotprod is probably Cortex-A55 with
-    // 1M L3 shared cache.
-    return 1 << 20;
-  }
-#else
-  (void)path;
-#endif
-  return SharedDataCacheSize();
-}
-
-}  // namespace ruy
-
-#endif  // RUY_RUY_CPU_CACHE_SIZE_H_
diff --git a/ruy/cpuinfo.cc b/ruy/cpuinfo.cc
index 793ba7b..147cb17 100644
--- a/ruy/cpuinfo.cc
+++ b/ruy/cpuinfo.cc
@@ -1,15 +1,31 @@
 #include "ruy/cpuinfo.h"
 
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+
+#include "ruy/check_macros.h"
+#include "ruy/cpu_cache_params.h"
 #include "ruy/platform.h"
 
 #define RUY_HAVE_CPUINFO (!(RUY_PLATFORM_PPC || RUY_PLATFORM_FUCHSIA))
 
 #if RUY_HAVE_CPUINFO
-
 #include <cpuinfo.h>
+#endif
 
 namespace ruy {
 
+namespace {
+void MakeDummyCacheParams(CpuCacheParams* result) {
+  // Reasonable dummy values
+  result->local_cache_size = 32 * 1024;
+  result->last_level_cache_size = 512 * 1024;
+}
+}  // end namespace
+
+#if RUY_HAVE_CPUINFO
+
 CpuInfo::~CpuInfo() {
   if (init_status_ == InitStatus::kInitialized) {
     cpuinfo_deinitialize();
@@ -18,12 +34,67 @@
 
 bool CpuInfo::EnsureInitialized() {
   if (init_status_ == InitStatus::kNotYetAttempted) {
-    init_status_ =
-        cpuinfo_initialize() ? InitStatus::kInitialized : InitStatus::kFailed;
+    init_status_ = Initialize();
+    RUY_DCHECK_NE(init_status_, InitStatus::kNotYetAttempted);
   }
   return init_status_ == InitStatus::kInitialized;
 }
 
+namespace {
+void QueryCacheParams(CpuCacheParams* cache_params) {
+  const int processors_count = cpuinfo_get_processors_count();
+  RUY_DCHECK_GT(processors_count, 0);
+  int overall_local_cache_size = std::numeric_limits<int>::max();
+  int overall_last_level_cache_size = std::numeric_limits<int>::max();
+  for (int i = 0; i < processors_count; i++) {
+    int local_cache_size = 0;
+    int last_level_cache_size = 0;
+    const cpuinfo_processor* processor = cpuinfo_get_processor(i);
+    // Loop over cache levels. Ignoring L4 for now: it seems that in CPUs that
+    // have L4, we would still prefer to stay in lower-latency L3.
+    for (const cpuinfo_cache* cache :
+         {processor->cache.l1d, processor->cache.l2, processor->cache.l3}) {
+      if (!cache) {
+        continue;  // continue, not break, it is possible to have L1+L3 but no
+                   // L2.
+      }
+      const bool is_local =
+          cpuinfo_get_processor(cache->processor_start)->core ==
+          cpuinfo_get_processor(cache->processor_start +
+                                cache->processor_count - 1)
+              ->core;
+      if (is_local) {
+        local_cache_size = cache->size;
+      }
+      last_level_cache_size = cache->size;
+    }
+    // If no local cache was found, use the last-level cache.
+    if (!local_cache_size) {
+      local_cache_size = last_level_cache_size;
+    }
+    RUY_DCHECK_GT(local_cache_size, 0);
+    RUY_DCHECK_GT(last_level_cache_size, 0);
+    RUY_DCHECK_GE(last_level_cache_size, local_cache_size);
+    overall_local_cache_size =
+        std::min(overall_local_cache_size, local_cache_size);
+    overall_last_level_cache_size =
+        std::min(overall_last_level_cache_size, last_level_cache_size);
+  }
+  cache_params->local_cache_size = overall_local_cache_size;
+  cache_params->last_level_cache_size = overall_last_level_cache_size;
+}
+}  // end namespace
+
+CpuInfo::InitStatus CpuInfo::Initialize() {
+  RUY_DCHECK_EQ(init_status_, InitStatus::kNotYetAttempted);
+  if (!cpuinfo_initialize()) {
+    MakeDummyCacheParams(&cache_params_);
+    return InitStatus::kFailed;
+  }
+  QueryCacheParams(&cache_params_);
+  return InitStatus::kInitialized;
+}
+
 bool CpuInfo::NeonDotprod() {
   return EnsureInitialized() && cpuinfo_has_arm_neon_dot();
 }
@@ -44,18 +115,29 @@
   return EnsureInitialized() && cpuinfo_has_x86_avx512vnni();
 }
 
-}  // namespace ruy
-
 #else  // not RUY_HAVE_CPUINFO
 
-namespace ruy {
 CpuInfo::~CpuInfo() {}
-bool CpuInfo::EnsureInitialized() { return false; }
+bool CpuInfo::EnsureInitialized() {
+  if (init_status_ == InitStatus::kNotYetAttempted) {
+    MakeDummyCacheParams(&cache_params_);
+    init_status_ = InitStatus::kInitialized;
+  }
+  RUY_DCHECK_EQ(init_status_, InitStatus::kInitialized);
+  return true;
+}
 bool CpuInfo::NeonDotprod() { return false; }
 bool CpuInfo::Sse42() { return false; }
 bool CpuInfo::Avx2() { return false; }
 bool CpuInfo::Avx512() { return false; }
 bool CpuInfo::AvxVnni() { return false; }
-}  // namespace ruy
 
 #endif
+
+const CpuCacheParams& CpuInfo::CacheParams() {
+  EnsureInitialized();
+  // On failure, EnsureInitialized leaves dummy values in cache_params_.
+  return cache_params_;
+}
+
+}  // namespace ruy
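
The net effect is that CacheParams() always returns usable values: detected ones when cpuinfo initializes, and the 32 KiB / 512 KiB dummies otherwise (including on PPC/Fuchsia builds where cpuinfo is not used). A minimal usage sketch, assuming CpuInfo is default-constructible (its constructor is not shown in this diff):

    #include <cstdio>

    #include "ruy/cpuinfo.h"

    void PrintCacheSizes() {
      ruy::CpuInfo cpu_info;
      const ruy::CpuCacheParams& cache = cpu_info.CacheParams();
      // Both sizes are in bytes: detected values on success, the dummy
      // fallback if cpuinfo is unavailable or fails to initialize.
      std::printf("local=%d, last_level=%d\n", cache.local_cache_size,
                  cache.last_level_cache_size);
    }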
diff --git a/ruy/cpuinfo.h b/ruy/cpuinfo.h
index 0a3de28..de2cbc7 100644
--- a/ruy/cpuinfo.h
+++ b/ruy/cpuinfo.h
@@ -16,6 +16,8 @@
 #ifndef RUY_RUY_CPUINFO_H_
 #define RUY_RUY_CPUINFO_H_
 
+#include "ruy/cpu_cache_params.h"
+
 namespace ruy {
 
 // Wraps the functionality that ruy needs from the cpuinfo library.
@@ -33,14 +35,22 @@
   bool Avx512();
   bool AvxVnni();
 
+  // Common features
+  const CpuCacheParams& CacheParams();
+
  private:
   enum class InitStatus {
     kNotYetAttempted,
     kInitialized,
     kFailed,
   };
+
   InitStatus init_status_ = InitStatus::kNotYetAttempted;
+  CpuCacheParams cache_params_;
+
   bool EnsureInitialized();
+  InitStatus Initialize();
+
   CpuInfo(const CpuInfo&) = delete;
 };
 
diff --git a/ruy/dispatch.h b/ruy/dispatch.h
index 2f75a9c..144cff6 100644
--- a/ruy/dispatch.h
+++ b/ruy/dispatch.h
@@ -199,9 +199,6 @@
 
   params->path = ThePath;
 
-  params->local_data_cache_size = MulParamsType::local_data_cache_size();
-  params->shared_data_cache_size = MulParamsType::shared_data_cache_size();
-
   CreatePackedMatrix<LhsScalar, PackedLhsScalar>(
       Side::kLhs, ToKernelLayout<LhsKernelLayout>(), params);
   CreatePackedMatrix<RhsScalar, PackedRhsScalar>(
diff --git a/ruy/mul_params.h b/ruy/mul_params.h
index eb1abed..bb9546f 100644
--- a/ruy/mul_params.h
+++ b/ruy/mul_params.h
@@ -19,7 +19,6 @@
 #include <limits>
 #include <type_traits>
 
-#include "ruy/cpu_cache_size.h"
 #include "ruy/matrix.h"
 
 namespace ruy {
@@ -134,15 +133,6 @@
   // Used for testing of various kernel layouts.
   using StandardCppKernelLhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
   using StandardCppKernelRhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
-  // Returns (a reasonable estimate of) the local CPU cache size.
-  // See ruy::LocalDataCacheSize() which returns some coarse, sane default for
-  // each CPU architecture.
-  // This may be overridden, either to provide more accurate/runtime values,
-  // or to test with other values to let testcases have more coverage.
-  static int local_data_cache_size() { return LocalDataCacheSize(); }
-  // Same as local_data_cache_size but for the total data cache size accessible
-  // to each CPU core. See ruy::SharedDataCacheSize().
-  static int shared_data_cache_size() { return SharedDataCacheSize(); }
 };
 
 template <typename tAccumScalar, typename tDstScalar>
diff --git a/ruy/test_special_mul_params.cc b/ruy/test_special_mul_params.cc
index 0249150..d76e8a1 100644
--- a/ruy/test_special_mul_params.cc
+++ b/ruy/test_special_mul_params.cc
@@ -41,8 +41,6 @@
 struct StandardCppKernelLayoutMulParams : MulParams<AccumScalar, DstScalar> {
   using StandardCppKernelLhsLayout = LhsKernelLayout;
   using StandardCppKernelRhsLayout = RhsKernelLayout;
-  static int local_data_cache_size() { return 1; }
-  static int shared_data_cache_size() { return 1; }
 };
 
 using LhsScalar = RUY_TEST_LHSSCALAR;
diff --git a/ruy/trmul.cc b/ruy/trmul.cc
index c175f01..94d0e77 100644
--- a/ruy/trmul.cc
+++ b/ruy/trmul.cc
@@ -26,6 +26,8 @@
 #include "ruy/block_map.h"
 #include "ruy/check_macros.h"
 #include "ruy/common.h"
+#include "ruy/cpu_cache_params.h"
+#include "ruy/cpuinfo.h"
 #include "ruy/ctx.h"
 #include "ruy/mat.h"
 #include "ruy/matrix.h"
@@ -70,24 +72,19 @@
 
     const Tuning tuning = tuning_resolver->Resolve();
     const int num_blocks = NumBlocks(block_map);
-    SidePair<int> block;
-    SidePair<int> start;
-    SidePair<int> end;
 
     // Each thread starts by initially reserving the block whose id
     // is the thread id.
     int block_id = thread_id;
     while (block_id < num_blocks) {
-      // Reserve the next block to handle. In order to hide the latency
-      // (typically comparable to an access to the level of data cache that
-      // is shared among CPU cores, e.g. 60 cycles on an ARM CPU as of 2019)
-      // of this atomic operation, we structure this code so as to avoid
-      // immediately depending on the `next_n` result.
+      // Reserve the next block to handle, hiding the latency of this atomic op.
       const int next_block_id =
           atomic_block_id->fetch_add(1, std::memory_order_relaxed);
       // Get coordinates of the current block to handle, in "block space".
+      SidePair<int> block;
       GetBlockByIndex(block_map, block_id, &block);
       // Get coordinates of the current block to handle, in matrix space.
+      SidePair<int> start, end;
       GetBlockMatrixCoords(block_map, block, &start, &end);
       // Maybe pack the current LHS/RHS block, if not already packed.
       EnsurePacked(block, start, end, tuning);
@@ -244,12 +241,11 @@
 
 LoopStructure GetLoopStructure(int tentative_thread_count, int rows, int cols,
                                int depth, int lhs_scalar_size,
-                               int rhs_scalar_size, int local_data_cache_size,
-                               int shared_data_cache_size) {
+                               int rhs_scalar_size,
+                               const CpuCacheParams& cpu_cache_params) {
   if (tentative_thread_count == 1) {
-    const BlockMapTraversalOrder traversal_order =
-        GetTraversalOrder(rows, cols, depth, lhs_scalar_size, rhs_scalar_size,
-                          local_data_cache_size, shared_data_cache_size);
+    const BlockMapTraversalOrder traversal_order = GetTraversalOrder(
+        rows, cols, depth, lhs_scalar_size, rhs_scalar_size, cpu_cache_params);
     // If we are in the GEMV case or the block_map would be using linear
     // traversal anyway, use the simple loop.
     if ((cols == 1) || traversal_order == BlockMapTraversalOrder::kLinear) {
@@ -277,10 +273,10 @@
   const int depth = lhs.layout.rows;
 
   const int tentative_thread_count = GetThreadCount(ctx, rows, cols, depth);
+  const auto& cpu_cache_params = ctx->mutable_cpuinfo()->CacheParams();
   const auto loop_structure = GetLoopStructure(
       tentative_thread_count, rows, cols, depth, lhs.data_type.size,
-      rhs.data_type.size, params->local_data_cache_size,
-      params->shared_data_cache_size);
+      rhs.data_type.size, cpu_cache_params);
   Allocator* allocator = ctx->GetMainAllocator();
 
   // Allocate packed matrices
@@ -319,8 +315,7 @@
   MakeBlockMap(packed_lhs.layout.cols, packed_rhs.layout.cols, depth,
                packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols,
                packed_lhs.data_type.size, packed_rhs.data_type.size,
-               tentative_thread_count, params->local_data_cache_size,
-               params->shared_data_cache_size, &block_map);
+               tentative_thread_count, cpu_cache_params, &block_map);
 
   // Initialize per-thread state.
   const int thread_count = block_map.thread_count;
diff --git a/ruy/trmul_params.h b/ruy/trmul_params.h
index fecef99..877c1f0 100644
--- a/ruy/trmul_params.h
+++ b/ruy/trmul_params.h
@@ -43,11 +43,6 @@
   // cache sizes when not runtime-detectable.
   Path path;
 
-  // See MulParamsType::local_data_cache_size().
-  int local_data_cache_size = 0;
-  // See MulParamsType::shared_data_cache_size().
-  int shared_data_cache_size = 0;
-
   // Function pointers to type-erased entry points for kernels and packers.
   SidePair<RunPackFn*> run_pack;
   RunKernelFn* run_kernel = nullptr;