// Copyright 2021 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Connects to a running Chrome process, and outputs statistics about its thread
// caches.
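//
// Example invocation (a sketch; the exact binary name depends on how this
// target is built):
//   <tool> <PID>             Scans the process memory to find the registry.
//   <tool> <PID> <address>   Uses a known ThreadCacheRegistry address.
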
#include <fcntl.h>
#include <signal.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <algorithm>
#include <cstring>
#include <ios>
#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "base/allocator/partition_allocator/partition_root.h"
#include "base/allocator/partition_allocator/thread_cache.h"
#include "base/check_op.h"
#include "base/debug/proc_maps_linux.h"
#include "base/files/file.h"
#include "base/files/file_enumerator.h"
#include "base/files/scoped_file.h"
#include "base/logging.h"
#include "base/posix/eintr_wrapper.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/stringprintf.h"
#include "base/thread_annotations.h"
#include "build/build_config.h"
#include "third_party/abseil-cpp/absl/types/optional.h"
namespace {
// SIGSTOPs a process on construction, and SIGCONTs it on destruction.
class ScopedSigStopper {
public:
explicit ScopedSigStopper(pid_t pid) : pid_(pid) { kill(pid_, SIGSTOP); }
~ScopedSigStopper() { kill(pid_, SIGCONT); }
private:
const pid_t pid_;
};
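// Opens /proc/<pid>/mem for reading. Requires ptrace-level access to the
// target process (see the CHECK message below about Yama's ptrace_scope).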
base::ScopedFD OpenProcMem(pid_t pid) {
std::string path = base::StringPrintf("/proc/%d/mem", pid);
int fd = open(path.c_str(), O_RDONLY);
CHECK_NE(fd, -1)
<< "Do you have 0 set in /proc/sys/kernel/yama/ptrace_scope?";
return base::ScopedFD(fd);
}
// Reads |size| bytes of memory from a remote process at |address| into
// |buffer|. Returns true on success.
bool ReadMemory(int fd, unsigned long address, size_t size, char* buffer) {
if (HANDLE_EINTR(pread(fd, buffer, size, address)) ==
static_cast<ssize_t>(size)) {
return true;
}
return false;
}
// Scans the process memory to look for the thread cache registry address. This
// does not need symbols.
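// It works because the registry address is stored in the binary's .data
// section between two known constants (kNeedle1 and kNeedle2), so scanning the
// writable, file-backed mappings for that three-word pattern locates it.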
uintptr_t FindThreadCacheRegistry(pid_t pid, int mem_fd) {
std::vector<base::debug::MappedMemoryRegion> regions;
{
// Ensures that the mappings are not going to change.
ScopedSigStopper stop{pid};
// There are subtleties when trying to read this file, which we blissfully
// ignore here. See //base/debug/proc_maps_linux.h for details. We don't use
// its reading helper, since we are not reading our own maps and everything is
// already extremely racy; at worst we have to retry.
LOG(INFO) << "Opening /proc/PID/maps";
std::string path = base::StringPrintf("/proc/%d/maps", pid);
auto file = base::File(base::FilePath(path),
base::File::FLAG_OPEN | base::File::FLAG_READ);
CHECK(file.IsValid());
std::vector<char> data(1e7);
int bytes_read =
file.ReadAtCurrentPos(&data[0], static_cast<int>(data.size()) - 1);
CHECK_GT(bytes_read, 0) << "Cannot read " << path;
data[bytes_read] = '\0';
std::string proc_maps(&data[0]);
LOG(INFO) << "Parsing the maps";
CHECK(base::debug::ParseProcMaps(proc_maps, &regions));
LOG(INFO) << "Found " << regions.size() << " regions";
}
for (auto& region : regions) {
using base::debug::MappedMemoryRegion;
// The array is in .data, meaning that it's mapped from the executable, and
// has rw-p permissions. For Chrome, .data is quite small, hence the size
// limit.
uint8_t expected_permissions = MappedMemoryRegion::Permission::READ |
MappedMemoryRegion::Permission::WRITE |
MappedMemoryRegion::Permission::PRIVATE;
size_t region_size = region.end - region.start;
if (region.permissions != expected_permissions || region_size > 1e7 ||
region.path.empty()) {
continue;
}
LOG(INFO) << "Found a candidate region between " << std::hex << region.start
<< " and " << region.end << std::dec
<< " (size = " << region.end - region.start
<< ") path = " << region.path;
// Scan the region, looking for the needles.
uintptr_t needle_array_candidate[3];
for (uintptr_t address = region.start;
address < region.end - sizeof(needle_array_candidate);
address += sizeof(uintptr_t)) {
bool ok = ReadMemory(mem_fd, reinterpret_cast<unsigned long>(address),
sizeof(needle_array_candidate),
reinterpret_cast<char*>(needle_array_candidate));
if (!ok) {
LOG(WARNING) << "Failed to read";
continue;
}
if (needle_array_candidate[0] == base::internal::tools::kNeedle1 &&
needle_array_candidate[2] == base::internal::tools::kNeedle2) {
LOG(INFO) << "Got it! Address = 0x" << std::hex
<< needle_array_candidate[1];
return needle_array_candidate[1];
}
}
}
LOG(ERROR) << "Failed to find the address";
return 0;
}
// Allows accessing an object copied from remote memory "as if" it were
// local. Of course, dereferencing any pointer from within it will at best
// fault.
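//
// A minimal usage sketch (mirroring how it is used below):
//   auto registry =
//       RawBuffer<ThreadCacheRegistry>::ReadFromMemFd(mem_fd, registry_addr);
//   if (registry.has_value())
//     ThreadCache* head = registry->get()->list_head_;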
template <typename T>
class RawBuffer {
public:
RawBuffer() = default;
const T* get() const { return reinterpret_cast<const T*>(buffer_); }
char* get_buffer() { return buffer_; }
static absl::optional<RawBuffer<T>> ReadFromMemFd(int mem_fd,
uintptr_t address) {
RawBuffer<T> buf;
bool ok = ReadMemory(mem_fd, reinterpret_cast<unsigned long>(address),
sizeof(T), buf.get_buffer());
if (!ok)
return absl::nullopt;
return {buf};
}
private:
alignas(T) char buffer_[sizeof(T)];
};
// Lists all thread names for a given PID, parsed from
// /proc/<pid>/task/<tid>/stat.
std::map<base::PlatformThreadId, std::string> ThreadNames(pid_t pid) {
std::map<base::PlatformThreadId, std::string> result;
base::FilePath root_path =
base::FilePath(base::StringPrintf("/proc/%d/task", pid));
base::FileEnumerator enumerator{root_path, false,
base::FileEnumerator::DIRECTORIES};
for (base::FilePath path = enumerator.Next(); !path.empty();
path = enumerator.Next()) {
auto stat_path = path.Append("stat");
base::File stat_file{stat_path,
base::File::FLAG_OPEN | base::File::FLAG_READ};
if (!stat_file.IsValid()) {
LOG(WARNING) << "Invalid file: " << stat_path.value();
continue;
}
char buffer[4096 + 1];
int bytes_read = stat_file.ReadAtCurrentPos(buffer, 4096);
if (bytes_read <= 0)
continue;
buffer[bytes_read] = '\0';
int tid, ppid, pgrp;
char name[256];
char state;
sscanf(buffer, "%d %255s %c %d %d", &tid, name, &state, &ppid, &pgrp);
result[base::PlatformThreadId(tid)] = std::string(name);
}
return result;
}
} // namespace

namespace base {
namespace internal {
namespace tools {
class ThreadCacheInspector {
public:
// Distinct from ThreadCache::Bucket because |count| is uint8_t.
struct BucketStats {
int count = 0;
int per_thread_limit = 0;
size_t size = 0;
};
ThreadCacheInspector(uintptr_t registry_addr, int mem_fd, pid_t pid);
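// Stops the process, reads the ThreadCacheRegistry and every ThreadCache in
// its linked list, then lets the process run again. Returns true on success.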
bool GetAllThreadCaches();
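// Returns the total memory cached by all inspected thread caches, in bytes.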
size_t CachedMemory() const;
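// Returns the address, in the inspected process, of the PartitionRoot used by
// the first thread cache.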
uintptr_t GetRootAddress();
const std::vector<RawBuffer<ThreadCache>>& thread_caches() const {
return thread_caches_;
}
static bool should_purge(const RawBuffer<ThreadCache>& tcache) {
return tcache.get()->should_purge_;
}
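// Sums cached-object counts per bucket across all thread caches, and records
// each bucket's slot size and per-thread count limit.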
std::vector<BucketStats> AccumulateThreadCacheBuckets();
std::uint8_t largest_active_bucket_index() {
return registry_.get()->largest_active_bucket_index_;
}
private:
uintptr_t registry_addr_;
int mem_fd_;
pid_t pid_;
RawBuffer<ThreadCacheRegistry> registry_;
std::vector<RawBuffer<ThreadCache>> thread_caches_;
};
class PartitionRootInspector {
public:
struct BucketStats {
size_t slot_size = 0;
size_t allocated_slots = 0;
size_t freelist_size = 0;
size_t active_slot_spans = 0;
};
PartitionRootInspector(uintptr_t root_addr, int mem_fd, pid_t pid)
: root_addr_(root_addr), mem_fd_(mem_fd), pid_(pid) {}
// Returns true on success, false if the remote data looked inconsistent (which
// can happen, since the process is stopped at an arbitrary point).
bool GatherStatistics();
const std::vector<BucketStats>& bucket_stats() const { return bucket_stats_; }
const PartitionRoot<ThreadSafe>* root() { return root_.get(); }
private:
void Update();
uintptr_t root_addr_;
int mem_fd_;
pid_t pid_;
RawBuffer<PartitionRoot<ThreadSafe>> root_;
std::vector<BucketStats> bucket_stats_;
};
ThreadCacheInspector::ThreadCacheInspector(uintptr_t registry_addr,
int mem_fd,
pid_t pid)
: registry_addr_(registry_addr), mem_fd_(mem_fd), pid_(pid) {}
// NO_THREAD_SAFETY_ANALYSIS: Well, reading a running process' memory is not
// really thread-safe.
bool ThreadCacheInspector::GetAllThreadCaches() NO_THREAD_SAFETY_ANALYSIS {
thread_caches_.clear();
// This is going to take a while; make sure that the metadata don't change.
ScopedSigStopper stopper{pid_};
auto registry =
RawBuffer<ThreadCacheRegistry>::ReadFromMemFd(mem_fd_, registry_addr_);
if (!registry.has_value())
return false;
registry_ = *registry;
ThreadCache* head = registry_.get()->list_head_;
while (head) {
auto tcache = RawBuffer<ThreadCache>::ReadFromMemFd(
mem_fd_, reinterpret_cast<uintptr_t>(head));
if (!tcache.has_value()) {
LOG(WARNING) << "Failed to read a ThreadCache";
return false;
}
thread_caches_.push_back(tcache.value());
head = tcache->get()->next_;
}
return true;
}
size_t ThreadCacheInspector::CachedMemory() const {
size_t total_memory = 0;
for (auto& tcache : thread_caches_) {
size_t cached_memory = tcache.get()->CachedMemory();
total_memory += cached_memory;
}
return total_memory;
}
uintptr_t ThreadCacheInspector::GetRootAddress() {
CHECK(!thread_caches_.empty());
return reinterpret_cast<uintptr_t>(thread_caches_[0].get()->root_);
}
std::vector<ThreadCacheInspector::BucketStats>
ThreadCacheInspector::AccumulateThreadCacheBuckets() {
std::vector<BucketStats> result(ThreadCache::kBucketCount);
for (auto& tcache : thread_caches_) {
for (int i = 0; i < ThreadCache::kBucketCount; i++) {
result[i].count += tcache.get()->buckets_[i].count;
result[i].per_thread_limit = tcache.get()->buckets_[i].limit;
}
}
BucketIndexLookup lookup{};
for (int i = 0; i < ThreadCache::kBucketCount; i++) {
result[i].size = lookup.bucket_sizes()[i];
}
return result;
}
void PartitionRootInspector::Update() {
auto root =
RawBuffer<PartitionRoot<ThreadSafe>>::ReadFromMemFd(mem_fd_, root_addr_);
if (root.has_value())
root_ = *root;
}
bool PartitionRootInspector::GatherStatistics() {
// This is going to take a while; make sure that the metadata don't change.
ScopedSigStopper stopper{pid_};
Update();
bucket_stats_.clear();
for (auto& bucket : root_.get()->buckets) {
BucketStats stats;
stats.slot_size = bucket.slot_size;
// Only look at the small buckets.
if (bucket.slot_size > 1024)
return true;
absl::optional<RawBuffer<SlotSpanMetadata<ThreadSafe>>> metadata;
for (auto* active_slot_span = bucket.active_slot_spans_head;
active_slot_span; active_slot_span = metadata->get()->next_slot_span) {
metadata = RawBuffer<SlotSpanMetadata<ThreadSafe>>::ReadFromMemFd(
mem_fd_, reinterpret_cast<uintptr_t>(active_slot_span));
if (!metadata.has_value())
return false;
int16_t allocated_slots = metadata->get()->num_allocated_slots;
// Negative number for a full slot span.
if (allocated_slots < 0)
allocated_slots = -allocated_slots;
stats.allocated_slots += allocated_slots;
// Use the sign-corrected |allocated_slots| from above; the raw value is
// negative for a full slot span.
size_t allocated_unprovisioned =
allocated_slots + metadata->get()->num_unprovisioned_slots;
// Inconsistent data. This can happen since we stopped the process at an
// arbitrary point.
if (allocated_unprovisioned > bucket.get_slots_per_span())
return false;
size_t freelist_size =
bucket.get_slots_per_span() - allocated_unprovisioned;
stats.freelist_size += freelist_size;
stats.active_slot_spans++;
}
bucket_stats_.push_back(stats);
}
// We should have found at least one bucket too large, and returned earlier.
return false;
}
void DisplayPerThreadData(
ThreadCacheInspector& inspector,
std::map<base::PlatformThreadId, std::string>& tid_to_name) {
std::cout << "Found " << inspector.thread_caches().size()
<< " caches, total cached memory = "
<< inspector.CachedMemory() / 1024 << "kiB"
<< "\n";
std::cout << "Per thread:\n"
<< "Thread Name Size\tPurge\n"
<< std::string(80, '-') << "\n";
base::ThreadCacheStats all_threads_stats = {0};
for (const auto& tcache : inspector.thread_caches()) {
base::ThreadCacheStats stats = {0};
// No alloc stats: they reach into tcache->root_, which is not valid.
tcache.get()->AccumulateStats(&stats);
tcache.get()->AccumulateStats(&all_threads_stats);
uint64_t count = stats.alloc_count;
uint64_t hit_rate = (100 * stats.alloc_hits) / count;
uint64_t too_large = (100 * stats.alloc_miss_too_large) / count;
uint64_t empty = (100 * stats.alloc_miss_empty) / count;
std::string thread_name = tid_to_name[tcache.get()->thread_id()];
std::string padding(20 - thread_name.size(), ' ');
std::cout << thread_name << padding << tcache.get()->CachedMemory() / 1024
<< "kiB\t" << (inspector.should_purge(tcache) ? 'X' : ' ')
<< "\tHit Rate = " << hit_rate << "%"
<< "\tToo Large = " << too_large << "%"
<< "\tEmpty = " << empty << "%"
<< "\t Count = " << count / 1000 << "k"
<< "\n";
}
uint64_t count = all_threads_stats.alloc_count;
uint64_t hit_rate = (100 * all_threads_stats.alloc_hits) / count;
uint64_t too_large = (100 * all_threads_stats.alloc_miss_too_large) / count;
uint64_t empty = (100 * all_threads_stats.alloc_miss_empty) / count;
std::cout << "\nALL THREADS: "
<< all_threads_stats.bucket_total_memory / 1024 << "kiB"
<< "\t\tHit Rate = " << hit_rate << "%"
<< "\tToo Large = " << too_large << "%"
<< "\tEmpty = " << empty << "%"
<< "\t Count = " << count / 1000 << "k"
<< "\n";
}
void DisplayPerBucketData(ThreadCacheInspector& inspector) {
std::cout << "Per-bucket stats (All Threads):"
<< "\nBucket Size\tPer-thread Limit\tCount\tTotal Memory\n"
<< std::string(80, '-') << "\n";
size_t total_memory = 0;
auto bucket_stats = inspector.AccumulateThreadCacheBuckets();
for (size_t index = 0; index < bucket_stats.size(); index++) {
const auto& bucket = bucket_stats[index];
size_t bucket_memory = bucket.size * bucket.count;
std::cout << bucket.size << "\t\t" << bucket.per_thread_limit << "\t\t\t"
<< bucket.count << "\t" << bucket_memory / 1024 << "kiB";
if (inspector.largest_active_bucket_index() == index)
std::cout << " <---- Limit";
std::cout << "\n";
total_memory += bucket_memory;
}
std::cout << "\nALL THREADS TOTAL: " << total_memory / 1024 << "kiB\n";
}
void DisplayRootData(PartitionRootInspector& root_inspector) {
std::cout << "Per-bucket size / allocated slots / free slots / slot span "
"count:\n";
for (size_t i = 0; i < root_inspector.bucket_stats().size(); i++) {
const auto& bucket_stats = root_inspector.bucket_stats()[i];
std::string line = base::StringPrintf(
"|% 5d % 6d % 6d % 4d|", static_cast<int>(bucket_stats.slot_size),
static_cast<int>(bucket_stats.allocated_slots),
static_cast<int>(bucket_stats.freelist_size),
static_cast<int>(bucket_stats.active_slot_spans));
std::cout << line;
if (i % 4 == 3)
std::cout << "\n";
else
std::cout << "\t";
}
auto* root = root_inspector.root();
uint64_t syscall_count = root->syscall_count.load(std::memory_order_relaxed);
uint64_t total_duration_ms =
root->syscall_total_time_ns.load(std::memory_order_relaxed) / 1e6;
std::cout << "\n\nSyscall count = " << syscall_count
<< "\tTotal duration = " << total_duration_ms << "ms\n"
<< "Max committed size = "
<< root->max_size_of_committed_pages.load(
std::memory_order_relaxed) /
1024
<< "kiB";
}
} // namespace tools
} // namespace internal
} // namespace base

int main(int argc, char** argv) {
if (argc < 2) {
LOG(ERROR) << "Usage:" << argv[0] << " <PID> "
<< "[address. 0 to scan the process memory]";
return 1;
}
int pid = atoi(argv[1]);
uintptr_t registry_address = 0;
auto mem_fd = OpenProcMem(pid);
if (argc == 3) {
uint64_t address;
CHECK(base::StringToUint64(argv[2], &address));
registry_address = static_cast<uintptr_t>(address);
}
if (!registry_address) {
// Scan the memory, as advertised in the usage message above.
registry_address = FindThreadCacheRegistry(pid, mem_fd.get());
}
CHECK(registry_address);
LOG(INFO) << "Getting the thread cache registry";
base::internal::tools::ThreadCacheInspector thread_cache_inspector{
registry_address, mem_fd.get(), pid};
std::map<base::PlatformThreadId, std::string> tid_to_name;
while (true) {
bool ok = thread_cache_inspector.GetAllThreadCaches();
if (!ok)
continue;
base::internal::tools::PartitionRootInspector root_inspector{
thread_cache_inspector.GetRootAddress(), mem_fd.get(), pid};
bool has_bucket_stats = root_inspector.GatherStatistics();
for (const auto& tcache : thread_cache_inspector.thread_caches()) {
// Note: this is not robust when TIDs are reused, but here this is fine,
// as at worst we would display wrong data, and TID reuse is very unlikely
// in normal scenarios.
if (tid_to_name.find(tcache.get()->thread_id()) == tid_to_name.end()) {
tid_to_name = ThreadNames(pid);
break;
}
}
constexpr const char* kClearScreen = "\033[2J\033[1;1H";
std::cout << kClearScreen;
DisplayPerThreadData(thread_cache_inspector, tid_to_name);
std::cout << "\n\n";
DisplayPerBucketData(thread_cache_inspector);
if (has_bucket_stats) {
std::cout << "\n\n";
DisplayRootData(root_inspector);
}
std::cout << std::endl;
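// Refresh roughly five times per second.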
usleep(200000);
}
}