Added support for configuring a lower bound on per-thread cache size

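The new floor is exposed through the malloc extension interface as the
"tcmalloc.min_per_thread_cache_bytes" numeric property. A minimal usage
sketch (the helper name and the 1 MiB value are illustrative only):

    #include <gperftools/malloc_extension.h>

    // Keep at least 1 MiB of cache per thread, even when the per-thread
    // share computed from tcmalloc.max_total_thread_cache_bytes would be
    // smaller.
    void RaiseThreadCacheFloor() {
      MallocExtension::instance()->SetNumericProperty(
          "tcmalloc.min_per_thread_cache_bytes", 1 << 20);

      // The current value can be read back through the same property.
      size_t value = 0;
      MallocExtension::instance()->GetNumericProperty(
          "tcmalloc.min_per_thread_cache_bytes", &value);
    }
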
[alkondratenko@gmail.com: removed spurious new line at thread_cache.h]
Signed-off-by: Aliaksey Kandratsenka <alkondratenko@gmail.com>
diff --git a/.gitignore b/.gitignore
index 87e16a4..b3f7603 100644
--- a/.gitignore
+++ b/.gitignore
@@ -155,6 +155,8 @@
 /test-driver
 /thread_dealloc_unittest
 /thread_dealloc_unittest.exe
+/min_per_thread_cache_size_test
+/min_per_thread_cache_size_test.exe
 /unique_path_unittest
 /unique_path_unittest.exe
 /unwind_bench
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7e969ee..f24d3d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -687,6 +687,10 @@
           src/tests/testutil.cc)
   target_link_libraries(thread_dealloc_unittest tcmalloc_minimal)
   add_test(thread_dealloc_unittest thread_dealloc_unittest)
+
+  add_executable(min_per_thread_cache_size_test src/tests/min_per_thread_cache_size_test.cc)
+  target_link_libraries(min_per_thread_cache_size_test tcmalloc_minimal gtest)
+  add_test(min_per_thread_cache_size_test min_per_thread_cache_size_test)
 endif()
 
 ### ------- tcmalloc_minimal_debug (thread-caching malloc with debugallocation)
diff --git a/Makefile.am b/Makefile.am
index 35ff4e1..4fa6349 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -467,6 +467,12 @@
 thread_dealloc_unittest_LDFLAGS = $(TCMALLOC_FLAGS) $(AM_LDFLAGS)
 thread_dealloc_unittest_LDADD = libtcmalloc_minimal.la
 
+TESTS += min_per_thread_cache_size_test
+min_per_thread_cache_size_test_SOURCES = src/tests/min_per_thread_cache_size_test.cc
+min_per_thread_cache_size_test_LDFLAGS = $(TCMALLOC_FLAGS) $(AM_LDFLAGS)
+min_per_thread_cache_size_test_CPPFLAGS = $(gtest_CPPFLAGS)
+min_per_thread_cache_size_test_LDADD = libtcmalloc_minimal.la libgtest.la
+
 ### Documentation
 dist_doc_DATA += docs/tcmalloc.html \
                  docs/overview.gif \
diff --git a/docs/tcmalloc.html b/docs/tcmalloc.html
index 33b8cc5..2096560 100644
--- a/docs/tcmalloc.html
+++ b/docs/tcmalloc.html
@@ -403,7 +403,7 @@
 </table>
 
 
-<ul> 
+<ul>
   <li> TCMalloc is much more consistently scalable than PTMalloc2 - for
        all thread counts &gt;1 it achieves ~7-9 million ops/sec for small
        allocations, falling to ~2 million ops/sec for larger
@@ -517,7 +517,7 @@
 
 <tr valign=top>
   <td><code>TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES</code></td>
-  <td>default: 16777216</td>
+  <td>default: 33554432</td>
   <td>
     Bound on the total amount of bytes allocated to thread caches.  This
     bound is not strict, so it is possible for the cache to go over this
@@ -746,6 +746,16 @@
   </td>
 </tr>
 
+<tr valign=top>
+  <td><code>tcmalloc.min_per_thread_cache_bytes</code></td>
+  <td>
+    A lower bound on the number of bytes TCMalloc dedicates to small objects
+    per thread. Note that this property only takes effect when the per-thread
+    cache size computed from tcmalloc.max_total_thread_cache_bytes would
+    otherwise be smaller than tcmalloc.min_per_thread_cache_bytes.
+  </td>
+</tr>
+
 </table>
 
 <h2><A NAME="caveats">Caveats</A></h2>
diff --git a/src/gperftools/malloc_extension.h b/src/gperftools/malloc_extension.h
index cf55939..3f3f0b0 100644
--- a/src/gperftools/malloc_extension.h
+++ b/src/gperftools/malloc_extension.h
@@ -175,7 +175,14 @@
   // --------
   // "tcmalloc.max_total_thread_cache_bytes"
   //      Upper limit on total number of bytes stored across all
-  //      per-thread caches.  Default: 16MB.
+  //      per-thread caches.  Default: 32MB.
+  //
+  // "tcmalloc.min_per_thread_cache_bytes"
+  //      Lower limit on total number of bytes stored per-thread cache.
+  //      Default: 512kB.
+  //      Note that this property only shows effect if per-thread cache
+  //      calculated using tcmalloc.max_total_thread_cache_bytes ended up being
+  //      less than tcmalloc.min_per_thread_cache_bytes.
   //
   // "tcmalloc.current_total_thread_cache_bytes"
   //      Number of bytes used across all thread caches.
diff --git a/src/tcmalloc.cc b/src/tcmalloc.cc
index 98bae87..c51a351 100644
--- a/src/tcmalloc.cc
+++ b/src/tcmalloc.cc
@@ -833,6 +833,11 @@
       return true;
     }
 
+    if (strcmp(name, "tcmalloc.min_per_thread_cache_bytes") == 0) {
+      *value = ThreadCache::min_per_thread_cache_size();
+      return true;
+    }
+
     if (strcmp(name, "tcmalloc.current_total_thread_cache_bytes") == 0) {
       TCMallocStats stats;
       ExtractStats(&stats, NULL, NULL, NULL);
@@ -876,6 +881,11 @@
       return true;
     }
 
+    if (strcmp(name, "tcmalloc.min_per_thread_cache_bytes") == 0) {
+      ThreadCache::set_min_per_thread_cache_size(value);
+      return true;
+    }
+
     if (strcmp(name, "tcmalloc.aggressive_memory_decommit") == 0) {
       SpinLockHolder l(Static::pageheap_lock());
       Static::pageheap()->SetAggressiveDecommit(value != 0);
diff --git a/src/tests/min_per_thread_cache_size_test.cc b/src/tests/min_per_thread_cache_size_test.cc
new file mode 100644
index 0000000..a0c6d24
--- /dev/null
+++ b/src/tests/min_per_thread_cache_size_test.cc
@@ -0,0 +1,130 @@
+/* -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+ * Copyright (c) 2024, gperftools Contributors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
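+// Checks that tcmalloc.min_per_thread_cache_bytes acts as a floor on each
+// thread's cache size when the overall thread cache budget is too small
+// to go around.
+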
+#include "config_for_unittests.h"
+
+#include <algorithm>
+#include <condition_variable>
+#include <mutex>
+#include <new>
+#include <thread>
+#include <vector>
+
+#include <gperftools/malloc_extension.h>
+#include <gperftools/malloc_extension_c.h>
+
+#include "base/logging.h"
+#include "gtest/gtest.h"
+
+// Number of allocations per thread.
+static const int kAllocationsPerThread = 10000;
+
+// Number of threads to create.
+static const int kNumThreads = 50;
+
+// Per thread cache size to set.
+static const size_t kPerThreadCacheSize = 64 << 10;
+
+// Number of passes to run.
+static const int kNumPasses = 10;
+
+// Get current total thread-cache size.
+static size_t CurrentThreadCacheSize() {
+  size_t result = 0;
+  EXPECT_TRUE(MallocExtension::instance()->GetNumericProperty(
+                "tcmalloc.current_total_thread_cache_bytes",
+                &result));
+  return result;
+}
+
+// Maximum cache size seen so far.
+static size_t max_cache_size;
+
+// Mutex and condition variable to synchronize threads.
+std::mutex filler_mtx;
+std::condition_variable filler_cv;
+int current_thread = 0;
+
+// A thread that cycles through allocating lots of objects of varying
+// size, in an attempt to fill up its thread cache.
+void Filler(int thread_id, int num_threads) {
+  std::unique_lock<std::mutex> filler_lock(filler_mtx);
+  for (int i = 0; i < kNumPasses; i++) {
+    // Wait for the current thread to be the one that should run.
+    filler_cv.wait(filler_lock, [thread_id] { return thread_id == current_thread; });
+
+    // Fill the cache by allocating and deallocating objects of varying sizes.
+    int size = 0;
+    for (int j = 0; j < kAllocationsPerThread; j++) {
+      void* p = ::operator new(size);
+      ::operator delete(p);
+      size += 64;
+      if (size > (32 << 10)) size = 0;
+    }
+
+    // Update the maximum cache size seen so far.
+    const size_t cache_size = CurrentThreadCacheSize();
+    max_cache_size = std::max(max_cache_size, cache_size);
+
+    // Move to the next thread.
+    current_thread = (current_thread + 1) % num_threads;
+    filler_cv.notify_all();
+  }
+}
+
+TEST(MinPerThreadCacheSizeTest, Basics) {
+  // Start all threads.
+  std::vector<std::thread> threads;
+  threads.reserve(kNumThreads);
+
+  // Set the lower bound on the per-thread cache size.
+  CHECK(MallocExtension::instance()->SetNumericProperty(
+        "tcmalloc.min_per_thread_cache_bytes", kPerThreadCacheSize));
+
+  // Set the max total thread cache size to 0 to ensure that the
+  // per-thread cache size is clamped to the lower bound.
+  CHECK(MallocExtension::instance()->SetNumericProperty(
+        "tcmalloc.max_total_thread_cache_bytes", 0));
+
+  for (int i = 0; i < kNumThreads; i++) {
+    threads.emplace_back(Filler, i, kNumThreads);
+  }
+
+  // Wait for all threads to finish.
+  for (auto& t : threads) { t.join(); }
+
+  // Check that the maximum cache size does not exceed the limit set.
+  ASSERT_LT(max_cache_size, kPerThreadCacheSize * kNumThreads);
+}
diff --git a/src/thread_cache.cc b/src/thread_cache.cc
index ba40e61..3ac1019 100644
--- a/src/thread_cache.cc
+++ b/src/thread_cache.cc
@@ -67,6 +67,8 @@
 static bool phinited = false;
 
 volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
+
+std::atomic<size_t> ThreadCache::min_per_thread_cache_size_ = kMinThreadCacheSize;
 size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
 ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
 PageHeapAllocator<ThreadCache> threadcache_allocator;
@@ -84,10 +86,11 @@
   if (max_size_ == 0) {
     // There isn't enough memory to go around.  Just give the minimum to
     // this thread.
-    SetMaxSize(kMinThreadCacheSize);
+    size_t min_size = min_per_thread_cache_size_.load(std::memory_order_relaxed);
+    SetMaxSize(min_size);
 
     // Take unclaimed_cache_space_ negative.
-    unclaimed_cache_space_ -= kMinThreadCacheSize;
+    unclaimed_cache_space_ -= min_size;
     ASSERT(unclaimed_cache_space_ < 0);
   }
 
@@ -267,7 +270,8 @@
       next_memory_steal_ = thread_heaps_;
     }
     if (next_memory_steal_ == this ||
-        next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
+        next_memory_steal_->max_size_
+          <= min_per_thread_cache_size_.load(std::memory_order_relaxed)) {
       continue;
     }
     next_memory_steal_->SetMaxSize(next_memory_steal_->max_size_ - kStealAmount);
@@ -352,8 +356,9 @@
   int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
   size_t space = overall_thread_cache_size_ / n;
 
+  size_t min_size = min_per_thread_cache_size_.load(std::memory_order_relaxed);
   // Limit to allowed range
-  if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
+  if (space < min_size) space = min_size;
   if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;
 
   double ratio = space / max<double>(1, per_thread_cache_size_);
@@ -383,7 +388,10 @@
 
 void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
   // Clip the value to a reasonable range
-  if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
+  size_t min_size = min_per_thread_cache_size_.load(std::memory_order_relaxed);
+  if (new_size < min_size) {
+    new_size = min_size;
+  }
   if (new_size > (1<<30)) new_size = (1<<30);     // Limit to 1GB
   overall_thread_cache_size_ = new_size;
 
diff --git a/src/thread_cache.h b/src/thread_cache.h
index 5231e1a..7454688 100644
--- a/src/thread_cache.h
+++ b/src/thread_cache.h
@@ -35,6 +35,7 @@
 #define TCMALLOC_THREAD_CACHE_H_
 
 #include <config.h>
+#include <atomic>
 #include <stddef.h>                     // for size_t, NULL
 #include <stdint.h>                     // for uint32_t, uint64_t
 #include <sys/types.h>                  // for ssize_t
@@ -114,6 +115,15 @@
     return overall_thread_cache_size_;
   }
 
+  // Sets the lower bound on per-thread cache size to new_size.
+  static void set_min_per_thread_cache_size(size_t new_size) {
+    min_per_thread_cache_size_.store(new_size, std::memory_order_relaxed);
+  }
+
+  static size_t min_per_thread_cache_size() {
+    return min_per_thread_cache_size_.load(std::memory_order_relaxed);
+  }
+
   static int thread_heap_count() {
     return thread_heap_count_;
   }
@@ -263,6 +273,9 @@
   // thread_heaps_.  Protected by Static::pageheap_lock.
   static ThreadCache* next_memory_steal_;
 
+  // Lower bound on per-thread cache size.  Default: 512KB.
+  static std::atomic<size_t> min_per_thread_cache_size_;
+
   // Overall thread cache size.  Protected by Static::pageheap_lock.
   static size_t overall_thread_cache_size_;