[PartitionAlloc] Add feature to use fewer memory regions.
PartitionAlloc creates a lot of memory regions in its current
configuration, up to tens of thousands, due to the way it manipulates
page permissions. This is problematic on Linux-based systems, where
there is a per-process limit on the number of memory regions (VMAs),
a bit under 2^16 by default, and we sometimes see crashes that are
likely due to reaching this limit.
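The limit can be inspected on a given Linux system with, for example:
$ sysctl vm.max_map_count
vm.max_map_count = 65530
(65530 is the usual kernel default; the exact value varies per system.)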
This commit adds a feature to use fewer regions in
PartitionAlloc. Locally, on Linux, when loading Google Maps, we see:
Before:
$ wc -l /proc/${pid}/maps
4497 /proc/582913/maps
After:
$ wc -l /proc/${pid}/maps
1900 /proc/551934/maps
That is, the *total* number of regions is less than half with the
feature enabled. For more details about the mechanism, see the comment in
PartitionBucket::SlotSpanCommittedSize().
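As a rough sketch (not verbatim from this change's callers), an embedder
would opt in via the new PartitionOptions knob added below:
  partition_alloc::PartitionOptions opts;
  opts.fewer_memory_regions = partition_alloc::PartitionOptions::kEnabled;
  // ... then initialize the PartitionRoot with `opts`.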
Bug: 385400561
Change-Id: I22b0f07860e92c43a5414742d9a3b5565410ef3f
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6157842
Reviewed-by: Takashi Sakamoto <tasak@google.com>
Reviewed-by: Sergei Glazunov <glazunov@google.com>
Commit-Queue: Benoit Lize <lizeb@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1404104}
NOKEYCHECK=True
GitOrigin-RevId: 53690081eb7c8ce64dd74e1f690918f27b7e45ff
diff --git a/src/partition_alloc/partition_alloc_unittest.cc b/src/partition_alloc/partition_alloc_unittest.cc
index 8db3654..c9e8f11 100644
--- a/src/partition_alloc/partition_alloc_unittest.cc
+++ b/src/partition_alloc/partition_alloc_unittest.cc
@@ -93,6 +93,12 @@
#include <unistd.h>
#endif
+#if PA_BUILDFLAG(IS_LINUX) || PA_BUILDFLAG(IS_ANDROID) || \
+ PA_BUILDFLAG(IS_CHROMEOS)
+#include "partition_alloc/partition_alloc_base/debug/proc_maps_linux.h"
+#endif // PA_BUILDFLAG(IS_LINUX) || PA_BUILDFLAG(IS_ANDROID) ||
+ // PA_BUILDFLAG(IS_CHROMEOS)
+
// In the MTE world, the upper bits of a pointer can be decorated with a tag,
// thus allowing many versions of the same pointer to exist. These macros take
// that into account when comparing.
@@ -385,6 +391,7 @@
// Requires explicit `FreeFlag` to activate, no effect otherwise.
opts.zapping_by_free_flags = PartitionOptions::kEnabled;
opts.eventually_zero_freed_memory = PartitionOptions::kEnabled;
+ opts.fewer_memory_regions = PartitionOptions::kDisabled;
opts.scheduler_loop_quarantine = PartitionOptions::kEnabled;
opts.scheduler_loop_quarantine_branch_capacity_in_bytes =
std::numeric_limits<size_t>::max();
@@ -3860,6 +3867,139 @@
allocator.root()->GetSchedulerLoopQuarantineBranchForTesting().Purge();
}
+#if PA_BUILDFLAG(IS_LINUX) || PA_BUILDFLAG(IS_ANDROID) || \
+ PA_BUILDFLAG(IS_CHROMEOS)
+
+TEST_P(PartitionAllocTest, InaccessibleRegionAfterSlotSpans) {
+ auto* root = allocator.root();
+ ASSERT_FALSE(root->settings.fewer_memory_regions);
+
+ // Look for an allocation size that matches a bucket which doesn't fill its
+ // last PartitionPage. Scan through allocation sizes rather than buckets, as
+ // depending on the bucket distribution, some buckets may not be active.
+ PartitionBucket* incomplete_bucket = nullptr;
+  // Only regular buckets; give up if we can't find one (and GTEST_SKIP()
+ // below).
+ for (size_t alloc_size = 0;
+ alloc_size < MaxRegularSlotSpanSize() - ExtraAllocSize(allocator);
+ alloc_size++) {
+ size_t index = SizeToIndex(alloc_size + ExtraAllocSize(allocator));
+ auto& bucket = root->buckets[index];
+ if (bucket.get_bytes_per_span() != bucket.get_pages_per_slot_span()
+ << PartitionPageShift()) {
+ incomplete_bucket = &bucket;
+ break;
+ }
+ }
+
+  // Didn't find any bucket that doesn't fill the last PartitionPage. Unlikely,
+ // but so be it.
+ if (!incomplete_bucket) {
+ GTEST_SKIP();
+ }
+
+ // Allocate memory, get the end of the slot span, and check that there is an
+ // inaccessible region starting there.
+ void* ptr =
+ root->Alloc(incomplete_bucket->slot_size - ExtraAllocSize(allocator), "");
+ ASSERT_TRUE(ptr);
+ uintptr_t start = SlotSpanMetadata<MetadataKind::kReadOnly>::ToSlotSpanStart(
+ SlotSpanMetadata<MetadataKind::kReadOnly>::FromAddr(
+ reinterpret_cast<uintptr_t>(ptr)));
+ uintptr_t end = start + incomplete_bucket->get_bytes_per_span();
+
+ std::string proc_maps;
+ std::vector<base::debug::MappedMemoryRegion> regions;
+ ASSERT_TRUE(base::debug::ReadProcMaps(&proc_maps));
+  ASSERT_TRUE(base::debug::ParseProcMaps(proc_maps, &regions));
+ bool found = false;
+ for (const auto& region : regions) {
+ if (region.start == end) {
+ found = true;
+ EXPECT_EQ(region.permissions, base::debug::MappedMemoryRegion::PRIVATE);
+ break;
+ }
+ }
+ EXPECT_TRUE(found);
+
+ root->Free(ptr);
+}
+
+TEST_P(PartitionAllocTest, FewerMemoryRegions) {
+ auto* root = allocator.root();
+ ASSERT_FALSE(root->settings.fewer_memory_regions);
+ root->settings.fewer_memory_regions = true;
+
+ // Look for an allocation size that matches a bucket which doesn't fill its
+ // last PartitionPage. Scan through allocation sizes rather than buckets, as
+ // depending on the bucket distribution, some buckets may not be active.
+ PartitionBucket* incomplete_bucket = nullptr;
+  // Only regular buckets; give up if we can't find one (and GTEST_SKIP()
+ // below).
+ for (size_t alloc_size = 0;
+ alloc_size < MaxRegularSlotSpanSize() - ExtraAllocSize(allocator);
+ alloc_size++) {
+ size_t index = SizeToIndex(alloc_size + ExtraAllocSize(allocator));
+ auto& bucket = root->buckets[index];
+ if (bucket.get_bytes_per_span() != bucket.get_pages_per_slot_span()
+ << PartitionPageShift()) {
+ incomplete_bucket = &bucket;
+ break;
+ }
+ }
+
+  // Didn't find any bucket that doesn't fill the last PartitionPage. Unlikely,
+ // but so be it.
+ if (!incomplete_bucket) {
+ GTEST_SKIP();
+ }
+
+  // Allocate memory, get the end of the slot span, and check that no separate
+  // region starts there, since the full PartitionPage span is committed.
+ void* ptr =
+ root->Alloc(incomplete_bucket->slot_size - ExtraAllocSize(allocator), "");
+ ASSERT_TRUE(ptr);
+ uintptr_t start = SlotSpanMetadata<MetadataKind::kReadOnly>::ToSlotSpanStart(
+ SlotSpanMetadata<MetadataKind::kReadOnly>::FromAddr(
+ reinterpret_cast<uintptr_t>(ptr)));
+ uintptr_t end = start + incomplete_bucket->get_bytes_per_span();
+
+ std::string proc_maps;
+ std::vector<base::debug::MappedMemoryRegion> regions;
+ ASSERT_TRUE(base::debug::ReadProcMaps(&proc_maps));
+  ASSERT_TRUE(base::debug::ParseProcMaps(proc_maps, &regions));
+
+  // We should not find a region that starts exactly at the end of the
+  // incomplete slot span.
+ bool found = false;
+ for (const auto& region : regions) {
+ if (region.start == end) {
+ found = true;
+ break;
+ }
+ }
+ EXPECT_FALSE(found);
+
+ // The enclosing region has RW permissions.
+ found = false;
+ for (const auto& region : regions) {
+ if (region.start <= end && region.end >= end) {
+ EXPECT_EQ(region.permissions,
+ base::debug::MappedMemoryRegion::READ |
+ base::debug::MappedMemoryRegion::WRITE |
+ base::debug::MappedMemoryRegion::PRIVATE);
+ found = true;
+ break;
+ }
+ }
+ EXPECT_TRUE(found);
+
+ root->Free(ptr);
+}
+
+#endif // PA_BUILDFLAG(IS_LINUX) || PA_BUILDFLAG(IS_ANDROID) ||
+ // PA_BUILDFLAG(IS_CHROMEOS)
+
TEST_P(PartitionAllocTest, ZeroFreedMemory) {
auto* root = allocator.root();
ASSERT_TRUE(root->settings.eventually_zero_freed_memory);
diff --git a/src/partition_alloc/partition_bucket.cc b/src/partition_alloc/partition_bucket.cc
index 11e5f72..db2c5da 100644
--- a/src/partition_alloc/partition_bucket.cc
+++ b/src/partition_alloc/partition_bucket.cc
@@ -708,7 +708,7 @@
PA_DEBUG_DATA_ON_STACK("spancmt", slot_span_committed_size);
root->RecommitSystemPagesForData(
- slot_span_start, slot_span_committed_size,
+ slot_span_start, SlotSpanCommittedSize(root),
PageAccessibilityDisposition::kRequireUpdate,
slot_size <= kMaxMemoryTaggingSize);
}
@@ -1590,4 +1590,63 @@
InitializeSlotSpan(slot_span, root);
}
+size_t PartitionBucket::SlotSpanCommittedSize(PartitionRoot* root) const {
+  // With lazy commit, we certainly don't want to commit more than necessary.
+  // This path is not reached in that case; keep the CHECK() as documentation.
+ PA_CHECK(!kUseLazyCommit);
+
+ // Memory is reserved in units of PartitionPage, but a given slot span may be
+ // smaller than the reserved area. For instance (assuming 4k pages), for a
+  // bucket where the slot span size is 40kiB, we reserve 3 PartitionPages =
+  // 16 * 3 = 48kiB, but only ever commit 40kiB out of it.
+ //
+ // This means that the address space then looks like, assuming that the
+ // PartitionPage next to it is committed:
+ // [SlotSpan range, 40kiB] rw-p
+ // [Unused area in the last PartitionPage, 8kiB] ---p
+ // [Next PartitionPages, size unknown ] rw-p
+ //
+ // So we have a "hole" of inaccessible memory, and 3 memory regions. If
+ // instead we commit the full PartitionPages, we get (due to the kernel
+ // merging neighboring regions with uniform permissions):
+ //
+ // [SlotSpan range, 40kiB + Unused area, 8kiB + next PartitionPages] rw-p
+ //
+  // So 1 memory region rather than 3. This matters, because on Linux kernels,
+ // there is a maximum number of VMAs per process, with the default limit a bit
+ // less than 2^16, and Chromium sometimes hits the limit (see
+ // /proc/sys/vm/max_map_count for the current limit), largely because of
+ // PartitionAlloc contributing thousands of regions. Locally, on a Linux
+ // system, this reduces the number of PartitionAlloc regions by up to ~4x.
+ //
+ // Why is it safe?
+ // The extra memory is not used by anything, so committing it doesn't make a
+ // difference. It makes it accessible though.
+ //
+ // How much does it cost?
+ // Almost nothing. On Linux, "committing" memory merely changes its
+  // permissions; it doesn't cost any memory until the pages are touched, which
+ // they are not. However, mprotect()-ed areas that are writable count towards
+ // the RLIMIT_DATA resource limit, which is used by the sandbox. So, while
+ // this change costs 0 physical memory (and actually saves some, by reducing
+  // the size of the VMA red-black tree in the kernel), it might slightly
+  // increase the cases where we bump into the sandbox memory limit.
+ //
+ // Is it safe to do while running?
+  // Since this is decided through root settings, the value can change at
+  // runtime, so we may decommit memory that was never committed. This is safe
+  // on Linux, since decommitting is just changing permissions back to
+  // PROT_NONE, which the tail end would already have.
+ //
+ // Can we do better?
+ // For simplicity, we do not "fix" the regions that were committed before the
+ // settings are changed (after feature list initialization). This means that
+  // we end up with more regions than we could have. The intent is to run a
+  // field experiment, then change the default value, at which point we get the
+  // full impact, so this is only temporary.
+ return root->settings.fewer_memory_regions
+ ? (get_pages_per_slot_span() << PartitionPageShift())
+ : get_bytes_per_span();
+}
+
} // namespace partition_alloc::internal
diff --git a/src/partition_alloc/partition_bucket.h b/src/partition_alloc/partition_bucket.h
index 637a6ce..30f55bc 100644
--- a/src/partition_alloc/partition_bucket.h
+++ b/src/partition_alloc/partition_bucket.h
@@ -1,7 +1,6 @@
// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-
#ifndef PARTITION_ALLOC_PARTITION_BUCKET_H_
#define PARTITION_ALLOC_PARTITION_BUCKET_H_
@@ -171,6 +170,8 @@
SlotSpanMetadata<MetadataKind::kReadOnly>* slot_span,
PartitionRoot* root);
+ size_t SlotSpanCommittedSize(PartitionRoot* root) const;
+
private:
// Sets `this->can_store_raw_size`.
void InitCanStoreRawSize(bool use_small_single_slot_spans);
diff --git a/src/partition_alloc/partition_page.cc b/src/partition_alloc/partition_page.cc
index 799eeed..9186149 100644
--- a/src/partition_alloc/partition_page.cc
+++ b/src/partition_alloc/partition_page.cc
@@ -248,7 +248,7 @@
size_t dirty_size =
base::bits::AlignUp(GetProvisionedSize(), SystemPageSize());
size_t size_to_decommit =
- kUseLazyCommit ? dirty_size : bucket->get_bytes_per_span();
+ kUseLazyCommit ? dirty_size : bucket->SlotSpanCommittedSize(root);
PA_DCHECK(root->empty_slot_spans_dirty_bytes >= dirty_size);
root->empty_slot_spans_dirty_bytes -= dirty_size;
diff --git a/src/partition_alloc/partition_root.cc b/src/partition_alloc/partition_root.cc
index 96f0cb6..a025fd2 100644
--- a/src/partition_alloc/partition_root.cc
+++ b/src/partition_alloc/partition_root.cc
@@ -1154,6 +1154,8 @@
opts.zapping_by_free_flags == PartitionOptions::kEnabled;
settings.eventually_zero_freed_memory =
opts.eventually_zero_freed_memory == PartitionOptions::kEnabled;
+ settings.fewer_memory_regions =
+ opts.fewer_memory_regions == PartitionOptions::kEnabled;
settings.scheduler_loop_quarantine =
opts.scheduler_loop_quarantine == PartitionOptions::kEnabled;
diff --git a/src/partition_alloc/partition_root.h b/src/partition_alloc/partition_root.h
index 7667c45..da09b35 100644
--- a/src/partition_alloc/partition_root.h
+++ b/src/partition_alloc/partition_root.h
@@ -182,6 +182,7 @@
// compression ratio of freed memory inside partially allocated pages (due to
// fragmentation).
EnableToggle eventually_zero_freed_memory = kDisabled;
+ EnableToggle fewer_memory_regions = kDisabled;
struct {
EnableToggle enabled = kDisabled;
@@ -264,6 +265,7 @@
bool zapping_by_free_flags = false;
bool eventually_zero_freed_memory = false;
bool scheduler_loop_quarantine = false;
+ bool fewer_memory_regions = false;
#if PA_BUILDFLAG(HAS_MEMORY_TAGGING)
bool memory_tagging_enabled_ = false;
bool use_random_memory_tagging_ = false;
diff --git a/src/partition_alloc/shim/allocator_shim.h b/src/partition_alloc/shim/allocator_shim.h
index 02d7b0a..c65be20 100644
--- a/src/partition_alloc/shim/allocator_shim.h
+++ b/src/partition_alloc/shim/allocator_shim.h
@@ -150,6 +150,9 @@
bool>;
using EventuallyZeroFreedMemory = partition_alloc::internal::base::
StrongAlias<class EventuallyZeroFreedMemoryTag, bool>;
+using FewerMemoryRegions =
+ partition_alloc::internal::base::StrongAlias<class FewerMemoryRegionsTag,
+ bool>;
using UsePoolOffsetFreelists = partition_alloc::internal::base::
StrongAlias<class UsePoolOffsetFreelistsTag, bool>;
@@ -170,6 +173,7 @@
size_t scheduler_loop_quarantine_branch_capacity_in_bytes,
ZappingByFreeFlags zapping_by_free_flags,
EventuallyZeroFreedMemory eventually_zero_freed_memory,
+ FewerMemoryRegions fewer_memory_regions,
UsePoolOffsetFreelists use_pool_offset_freelists,
UseSmallSingleSlotSpans use_small_single_slot_spans);
diff --git a/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.cc b/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.cc
index f3ad2e0..60aae20 100644
--- a/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.cc
+++ b/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.cc
@@ -26,6 +26,7 @@
#include "partition_alloc/partition_root.h"
#include "partition_alloc/partition_stats.h"
#include "partition_alloc/shim/allocator_dispatch.h"
+#include "partition_alloc/shim/allocator_shim.h"
#include "partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc_internal.h"
#include "partition_alloc/shim/allocator_shim_internals.h"
@@ -619,6 +620,7 @@
size_t scheduler_loop_quarantine_branch_capacity_in_bytes,
ZappingByFreeFlags zapping_by_free_flags,
EventuallyZeroFreedMemory eventually_zero_freed_memory,
+ FewerMemoryRegions fewer_memory_regions,
UsePoolOffsetFreelists use_pool_offset_freelists,
UseSmallSingleSlotSpans use_small_single_slot_spans) {
// Calling Get() is actually important, even if the return value isn't
@@ -651,6 +653,9 @@
eventually_zero_freed_memory
? partition_alloc::PartitionOptions::kEnabled
: partition_alloc::PartitionOptions::kDisabled;
+ opts.fewer_memory_regions =
+ fewer_memory_regions ? partition_alloc::PartitionOptions::kEnabled
+ : partition_alloc::PartitionOptions::kDisabled;
opts.scheduler_loop_quarantine =
scheduler_loop_quarantine
? partition_alloc::PartitionOptions::kEnabled
diff --git a/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.h b/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.h
index 75e2ad8..7d460c6 100644
--- a/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.h
+++ b/src/partition_alloc/shim/allocator_shim_default_dispatch_to_partition_alloc.h
@@ -182,15 +182,16 @@
size_t scheduler_loop_quarantine_capacity_in_bytes = 0;
auto zapping_by_free_flags = ZappingByFreeFlags(false);
auto eventually_zero_freed_memory = EventuallyZeroFreedMemory(false);
+ auto fewer_memory_regions = FewerMemoryRegions(false);
auto use_pool_offset_freelists = UsePoolOffsetFreelists(true);
auto use_small_single_slot_spans = UseSmallSingleSlotSpans(true);
- ConfigurePartitions(enable_brp, brp_extra_extras_size, enable_memory_tagging,
- memory_tagging_reporting_mode, distribution,
- scheduler_loop_quarantine,
- scheduler_loop_quarantine_capacity_in_bytes,
- zapping_by_free_flags, eventually_zero_freed_memory,
- use_pool_offset_freelists, use_small_single_slot_spans);
+ ConfigurePartitions(
+ enable_brp, brp_extra_extras_size, enable_memory_tagging,
+ memory_tagging_reporting_mode, distribution, scheduler_loop_quarantine,
+ scheduler_loop_quarantine_capacity_in_bytes, zapping_by_free_flags,
+ eventually_zero_freed_memory, fewer_memory_regions,
+ use_pool_offset_freelists, use_small_single_slot_spans);
}
#endif // PA_BUILDFLAG(USE_PARTITION_ALLOC_AS_MALLOC)