[PartitionAlloc] Add feature to use fewer memory regions.

PartitionAlloc creates a lot of memory regions in its current
configuration, up to 10s of thousands, due to the way it manipulates
page permissions. This is problematic on Linux-based systems, where
there is a per-process limit of memory regions (VMAs), which is a bit
under 2^16 by default, and we sometimes see crashes likely due to
reaching this limit.

This commit adds a feature to use fewer regions in
PartitionAlloc. Locally, on Linux, loading google maps, we see:

Before:
$ wc -l /proc/${pid}/maps
4497 /proc/582913/maps

After:
$ wc -l /proc/${pid}/maps
1900 /proc/551934/maps

That is, the *total* number of regions is <1/2 with the feature
enabled. For more details about the mechanism, see the comment in
PartitionBucket::SlotSpanCommittedSize().

Bug: 385400561
Change-Id: I22b0f07860e92c43a5414742d9a3b5565410ef3f
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6157842
Reviewed-by: Takashi Sakamoto <tasak@google.com>
Reviewed-by: Sergei Glazunov <glazunov@google.com>
Commit-Queue: Benoit Lize <lizeb@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1404104}
NOKEYCHECK=True
GitOrigin-RevId: 53690081eb7c8ce64dd74e1f690918f27b7e45ff
diff --git a/src/partition_alloc/partition_alloc_unittest.cc b/src/partition_alloc/partition_alloc_unittest.cc
index 8db3654..c9e8f11 100644
--- a/src/partition_alloc/partition_alloc_unittest.cc
+++ b/src/partition_alloc/partition_alloc_unittest.cc
@@ -93,6 +93,12 @@
 #include <unistd.h>
 #endif
 
+#if PA_BUILDFLAG(IS_LINUX) || PA_BUILDFLAG(IS_ANDROID) || \
+    PA_BUILDFLAG(IS_CHROMEOS)
+#include "partition_alloc/partition_alloc_base/debug/proc_maps_linux.h"
+#endif  // PA_BUILDFLAG(IS_LINUX) || PA_BUILDFLAG(IS_ANDROID) ||
+        // PA_BUILDFLAG(IS_CHROMEOS)
+
 // In the MTE world, the upper bits of a pointer can be decorated with a tag,
 // thus allowing many versions of the same pointer to exist. These macros take
 // that into account when comparing.
@@ -385,6 +391,7 @@
     // Requires explicit `FreeFlag` to activate, no effect otherwise.
     opts.zapping_by_free_flags = PartitionOptions::kEnabled;
     opts.eventually_zero_freed_memory = PartitionOptions::kEnabled;
+    opts.fewer_memory_regions = PartitionOptions::kDisabled;
     opts.scheduler_loop_quarantine = PartitionOptions::kEnabled;
     opts.scheduler_loop_quarantine_branch_capacity_in_bytes =
         std::numeric_limits<size_t>::max();
@@ -3860,6 +3867,139 @@
   allocator.root()->GetSchedulerLoopQuarantineBranchForTesting().Purge();
 }
 
+#if PA_BUILDFLAG(IS_LINUX) || PA_BUILDFLAG(IS_ANDROID) || \
+    PA_BUILDFLAG(IS_CHROMEOS)
+
+TEST_P(PartitionAllocTest, InaccessibleRegionAfterSlotSpans) {
+  auto* root = allocator.root();
+  ASSERT_FALSE(root->settings.fewer_memory_regions);
+
+  // Look for an allocation size that matches a bucket which doesn't fill its
+  // last PartitionPage.  Scan through allocation sizes rather than buckets, as
+  // depending on the bucket distribution, some buckets may not be active.
+  PartitionBucket* incomplete_bucket = nullptr;
+  // Only regular buckets, give up if we can't find one (and GTEST_SKIP()
+  // below).
+  for (size_t alloc_size = 0;
+       alloc_size < MaxRegularSlotSpanSize() - ExtraAllocSize(allocator);
+       alloc_size++) {
+    size_t index = SizeToIndex(alloc_size + ExtraAllocSize(allocator));
+    auto& bucket = root->buckets[index];
+    if (bucket.get_bytes_per_span() != bucket.get_pages_per_slot_span()
+                                           << PartitionPageShift()) {
+      incomplete_bucket = &bucket;
+      break;
+    }
+  }
+
+  // Didn't find any bucket that doesn't fill the last PartitionPage. Unlikely,
+  // but so be it.
+  if (!incomplete_bucket) {
+    GTEST_SKIP();
+  }
+
+  // Allocate memory, get the end of the slot span, and check that there is an
+  // inaccessible region starting there.
+  void* ptr =
+      root->Alloc(incomplete_bucket->slot_size - ExtraAllocSize(allocator), "");
+  ASSERT_TRUE(ptr);
+  uintptr_t start = SlotSpanMetadata<MetadataKind::kReadOnly>::ToSlotSpanStart(
+      SlotSpanMetadata<MetadataKind::kReadOnly>::FromAddr(
+          reinterpret_cast<uintptr_t>(ptr)));
+  uintptr_t end = start + incomplete_bucket->get_bytes_per_span();
+
+  std::string proc_maps;
+  std::vector<base::debug::MappedMemoryRegion> regions;
+  ASSERT_TRUE(base::debug::ReadProcMaps(&proc_maps));
+  ASSERT_TRUE(base::debug::ParseProcMaps(proc_maps, &regions));
+  bool found = false;
+  for (const auto& region : regions) {
+    if (region.start == end) {
+      found = true;
+      EXPECT_EQ(region.permissions, base::debug::MappedMemoryRegion::PRIVATE);
+      break;
+    }
+  }
+  EXPECT_TRUE(found);
+
+  root->Free(ptr);
+}
+
+TEST_P(PartitionAllocTest, FewerMemoryRegions) {
+  auto* root = allocator.root();
+  ASSERT_FALSE(root->settings.fewer_memory_regions);
+  root->settings.fewer_memory_regions = true;
+
+  // Look for an allocation size that matches a bucket which doesn't fill its
+  // last PartitionPage.  Scan through allocation sizes rather than buckets, as
+  // depending on the bucket distribution, some buckets may not be active.
+  PartitionBucket* incomplete_bucket = nullptr;
+  // Only regular buckets, give up if we can't find one (and GTEST_SKIP()
+  // below).
+  for (size_t alloc_size = 0;
+       alloc_size < MaxRegularSlotSpanSize() - ExtraAllocSize(allocator);
+       alloc_size++) {
+    size_t index = SizeToIndex(alloc_size + ExtraAllocSize(allocator));
+    auto& bucket = root->buckets[index];
+    if (bucket.get_bytes_per_span() != bucket.get_pages_per_slot_span()
+                                           << PartitionPageShift()) {
+      incomplete_bucket = &bucket;
+      break;
+    }
+  }
+
+  // Didn't find any bucket that doesn't fill the last PartitionPage. Unlikely,
+  // but so be it.
+  if (!incomplete_bucket) {
+    GTEST_SKIP();
+  }
+
+  // Allocate memory, get the end of the slot span, and check that there is an
+  // inaccessible region starting there.
+  void* ptr =
+      root->Alloc(incomplete_bucket->slot_size - ExtraAllocSize(allocator), "");
+  ASSERT_TRUE(ptr);
+  uintptr_t start = SlotSpanMetadata<MetadataKind::kReadOnly>::ToSlotSpanStart(
+      SlotSpanMetadata<MetadataKind::kReadOnly>::FromAddr(
+          reinterpret_cast<uintptr_t>(ptr)));
+  uintptr_t end = start + incomplete_bucket->get_bytes_per_span();
+
+  std::string proc_maps;
+  std::vector<base::debug::MappedMemoryRegion> regions;
+  ASSERT_TRUE(base::debug::ReadProcMaps(&proc_maps));
+  ASSERT_TRUE(base::debug::ParseProcMaps(proc_maps, &regions));
+
+  // Cannot find a region that starts exactly at the end of the incomplete slot
+  // span.
+  bool found = false;
+  for (const auto& region : regions) {
+    if (region.start == end) {
+      found = true;
+      break;
+    }
+  }
+  EXPECT_FALSE(found);
+
+  // The enclosing region has RW permissions.
+  found = false;
+  for (const auto& region : regions) {
+    if (region.start <= end && region.end >= end) {
+      EXPECT_EQ(region.permissions,
+                base::debug::MappedMemoryRegion::READ |
+                    base::debug::MappedMemoryRegion::WRITE |
+                    base::debug::MappedMemoryRegion::PRIVATE);
+      found = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(found);
+
+  root->Free(ptr);
+}
+
+#endif  // PA_BUILDFLAG(IS_LINUX) || PA_BUILDFLAG(IS_ANDROID) ||
+        // PA_BUILDFLAG(IS_CHROMEOS)
+
 TEST_P(PartitionAllocTest, ZeroFreedMemory) {
   auto* root = allocator.root();
   ASSERT_TRUE(root->settings.eventually_zero_freed_memory);
diff --git a/src/partition_alloc/partition_bucket.cc b/src/partition_alloc/partition_bucket.cc
index 11e5f72..db2c5da 100644
--- a/src/partition_alloc/partition_bucket.cc
+++ b/src/partition_alloc/partition_bucket.cc
@@ -708,7 +708,7 @@
     PA_DEBUG_DATA_ON_STACK("spancmt", slot_span_committed_size);
 
     root->RecommitSystemPagesForData(
-        slot_span_start, slot_span_committed_size,
+        slot_span_start, SlotSpanCommittedSize(root),
         PageAccessibilityDisposition::kRequireUpdate,
         slot_size <= kMaxMemoryTaggingSize);
   }
@@ -1590,4 +1590,63 @@
   InitializeSlotSpan(slot_span, root);
 }
 
+size_t PartitionBucket::SlotSpanCommittedSize(PartitionRoot* root) const {
+  // With lazy commit, we certainly don't want to commit more than
+  // necessary. This is not reached, but keep the CHECK() as documentation.
+  PA_CHECK(!kUseLazyCommit);
+
+  // Memory is reserved in units of PartitionPage, but a given slot span may be
+  // smaller than the reserved area. For instance (assuming 4k pages), for a
+  // bucket where the slot span size is 40kiB, we reserve 4 PartitionPage = 16 *
+  // 4 = 48kiB, but only ever commit 40kiB out of it.
+  //
+  // This means that the address space then looks like, assuming that the
+  // PartitionPage next to it is committed:
+  //   [SlotSpan range, 40kiB]                       rw-p
+  //   [Unused area in the last PartitionPage, 8kiB] ---p
+  //   [Next PartitionPages, size unknown ]          rw-p
+  //
+  // So we have a "hole" of inaccessible memory, and 3 memory regions. If
+  // instead we commit the full PartitionPages, we get (due to the kernel
+  // merging neighboring regions with uniform permissions):
+  //
+  //   [SlotSpan range, 40kiB + Unused area, 8kiB + next PartitionPages] rw-p
+  //
+  // So 1 memory region rather than 3. This matters, because on Linux kernels,
+  // there is a maximum number of VMAs per process, with the default limit a bit
+  // less than 2^16, and Chromium sometimes hits the limit (see
+  // /proc/sys/vm/max_map_count for the current limit), largely because of
+  // PartitionAlloc contributing thousands of regions. Locally, on a Linux
+  // system, this reduces the number of PartitionAlloc regions by up to ~4x.
+  //
+  // Why is it safe?
+  // The extra memory is not used by anything, so committing it doesn't make a
+  // difference. It makes it accessible though.
+  //
+  // How much does it cost?
+  // Almost nothing. On Linux, "committing" memory merely changes its
+  // permissions, it doesn't cost any memory until the pages are touched, which
+  // they are not. However, mprotect()-ed areas that are writable count towards
+  // the RLIMIT_DATA resource limit, which is used by the sandbox. So, while
+  // this change costs 0 physical memory (and actually saves some, by reducing
+  // the size of the VMA red-black tree in the kernel), it might increase
+  // slightly the cases where we bump into the sandbox memory limit.
+  //
+  // Is it safe to do while running?
+  // Since this is decided through root settings, the value changes at runtime,
+  // so we may decommit memory that was never committed. This is safe on Linux,
+  // since decommitting is just changing permissions back to PROT_NONE, which
+  // the tail end would already have.
+  //
+  // Can we do better?
+  // For simplicity, we do not "fix" the regions that were committed before the
+  // settings are changed (after feature list initialization). This means that
+  // we end up with more regions than we could. The intent is to run a field
+  // experiment, then change the default value, at which point we get the full
+  // impact, so this is only temporary.
+  return root->settings.fewer_memory_regions
+             ? (get_pages_per_slot_span() << PartitionPageShift())
+             : get_bytes_per_span();
+}
+
 }  // namespace partition_alloc::internal
diff --git a/src/partition_alloc/partition_bucket.h b/src/partition_alloc/partition_bucket.h
index 637a6ce..30f55bc 100644
--- a/src/partition_alloc/partition_bucket.h
+++ b/src/partition_alloc/partition_bucket.h
@@ -1,7 +1,6 @@
 // Copyright 2018 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
-
 #ifndef PARTITION_ALLOC_PARTITION_BUCKET_H_
 #define PARTITION_ALLOC_PARTITION_BUCKET_H_
 
@@ -171,6 +170,8 @@
       SlotSpanMetadata<MetadataKind::kReadOnly>* slot_span,
       PartitionRoot* root);
 
+  size_t SlotSpanCommittedSize(PartitionRoot* root) const;
+
  private:
   // Sets `this->can_store_raw_size`.
   void InitCanStoreRawSize(bool use_small_single_slot_spans);
diff --git a/src/partition_alloc/partition_page.cc b/src/partition_alloc/partition_page.cc
index 799eeed..9186149 100644
--- a/src/partition_alloc/partition_page.cc
+++ b/src/partition_alloc/partition_page.cc
@@ -248,7 +248,7 @@
   size_t dirty_size =
       base::bits::AlignUp(GetProvisionedSize(), SystemPageSize());
   size_t size_to_decommit =
-      kUseLazyCommit ? dirty_size : bucket->get_bytes_per_span();
+      kUseLazyCommit ? dirty_size : bucket->SlotSpanCommittedSize(root);
 
   PA_DCHECK(root->empty_slot_spans_dirty_bytes >= dirty_size);
   root->empty_slot_spans_dirty_bytes -= dirty_size;
diff --git a/src/partition_alloc/partition_root.cc b/src/partition_alloc/partition_root.cc
index 96f0cb6..a025fd2 100644
--- a/src/partition_alloc/partition_root.cc
+++ b/src/partition_alloc/partition_root.cc
@@ -1154,6 +1154,8 @@
         opts.zapping_by_free_flags == PartitionOptions::kEnabled;
     settings.eventually_zero_freed_memory =
         opts.eventually_zero_freed_memory == PartitionOptions::kEnabled;
+    settings.fewer_memory_regions =
+        opts.fewer_memory_regions == PartitionOptions::kEnabled;
 
     settings.scheduler_loop_quarantine =
         opts.scheduler_loop_quarantine == PartitionOptions::kEnabled;
diff --git a/src/partition_alloc/partition_root.h b/src/partition_alloc/partition_root.h
index 7667c45..da09b35 100644
--- a/src/partition_alloc/partition_root.h
+++ b/src/partition_alloc/partition_root.h
@@ -182,6 +182,7 @@
   // compression ratio of freed memory inside partially allocated pages (due to
   // fragmentation).
   EnableToggle eventually_zero_freed_memory = kDisabled;
+  EnableToggle fewer_memory_regions = kDisabled;
 
   struct {
     EnableToggle enabled = kDisabled;
@@ -264,6 +265,7 @@
     bool zapping_by_free_flags = false;
     bool eventually_zero_freed_memory = false;
     bool scheduler_loop_quarantine = false;
+    bool fewer_memory_regions = false;
 #if PA_BUILDFLAG(HAS_MEMORY_TAGGING)
     bool memory_tagging_enabled_ = false;
     bool use_random_memory_tagging_ = false;
diff --git a/src/partition_alloc/shim/allocator_shim.h b/src/partition_alloc/shim/allocator_shim.h
index 02d7b0a..c65be20 100644
--- a/src/partition_alloc/shim/allocator_shim.h
+++ b/src/partition_alloc/shim/allocator_shim.h
@@ -150,6 +150,9 @@
                                                  bool>;
 using EventuallyZeroFreedMemory = partition_alloc::internal::base::
     StrongAlias<class EventuallyZeroFreedMemoryTag, bool>;
+using FewerMemoryRegions =
+    partition_alloc::internal::base::StrongAlias<class FewerMemoryRegionsTag,
+                                                 bool>;
 using UsePoolOffsetFreelists = partition_alloc::internal::base::
     StrongAlias<class UsePoolOffsetFreelistsTag, bool>;
 
@@ -170,6 +173,7 @@
     size_t scheduler_loop_quarantine_branch_capacity_in_bytes,
     ZappingByFreeFlags zapping_by_free_flags,
     EventuallyZeroFreedMemory eventually_zero_freed_memory,
+    FewerMemoryRegions fewer_memory_regions,
     UsePoolOffsetFreelists use_pool_offset_freelists,
     UseSmallSingleSlotSpans use_small_single_slot_spans);
 
diff --git a/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.cc b/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.cc
index f3ad2e0..60aae20 100644
--- a/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.cc
+++ b/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.cc
@@ -26,6 +26,7 @@
 #include "partition_alloc/partition_root.h"
 #include "partition_alloc/partition_stats.h"
 #include "partition_alloc/shim/allocator_dispatch.h"
+#include "partition_alloc/shim/allocator_shim.h"
 #include "partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc_internal.h"
 #include "partition_alloc/shim/allocator_shim_internals.h"
 
@@ -619,6 +620,7 @@
     size_t scheduler_loop_quarantine_branch_capacity_in_bytes,
     ZappingByFreeFlags zapping_by_free_flags,
     EventuallyZeroFreedMemory eventually_zero_freed_memory,
+    FewerMemoryRegions fewer_memory_regions,
     UsePoolOffsetFreelists use_pool_offset_freelists,
     UseSmallSingleSlotSpans use_small_single_slot_spans) {
   // Calling Get() is actually important, even if the return value isn't
@@ -651,6 +653,9 @@
             eventually_zero_freed_memory
                 ? partition_alloc::PartitionOptions::kEnabled
                 : partition_alloc::PartitionOptions::kDisabled;
+        opts.fewer_memory_regions =
+            fewer_memory_regions ? partition_alloc::PartitionOptions::kEnabled
+                                 : partition_alloc::PartitionOptions::kDisabled;
         opts.scheduler_loop_quarantine =
             scheduler_loop_quarantine
                 ? partition_alloc::PartitionOptions::kEnabled
diff --git a/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.h b/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.h
index 75e2ad8..7d460c6 100644
--- a/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.h
+++ b/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.h
@@ -182,15 +182,16 @@
   size_t scheduler_loop_quarantine_capacity_in_bytes = 0;
   auto zapping_by_free_flags = ZappingByFreeFlags(false);
   auto eventually_zero_freed_memory = EventuallyZeroFreedMemory(false);
+  auto fewer_memory_regions = FewerMemoryRegions(false);
   auto use_pool_offset_freelists = UsePoolOffsetFreelists(true);
   auto use_small_single_slot_spans = UseSmallSingleSlotSpans(true);
 
-  ConfigurePartitions(enable_brp, brp_extra_extras_size, enable_memory_tagging,
-                      memory_tagging_reporting_mode, distribution,
-                      scheduler_loop_quarantine,
-                      scheduler_loop_quarantine_capacity_in_bytes,
-                      zapping_by_free_flags, eventually_zero_freed_memory,
-                      use_pool_offset_freelists, use_small_single_slot_spans);
+  ConfigurePartitions(
+      enable_brp, brp_extra_extras_size, enable_memory_tagging,
+      memory_tagging_reporting_mode, distribution, scheduler_loop_quarantine,
+      scheduler_loop_quarantine_capacity_in_bytes, zapping_by_free_flags,
+      eventually_zero_freed_memory, fewer_memory_regions,
+      use_pool_offset_freelists, use_small_single_slot_spans);
 }
 #endif  // PA_BUILDFLAG(USE_PARTITION_ALLOC_AS_MALLOC)