Revert "[heap profiler] Make use of thread_local instead of base::TLS"

This reverts commit 684f41852a1ac8403b6c2447db684a3ee4002619.

Reason for revert: It broke the ios-simulator-cronet builder: 
https://ci.chromium.org/p/chromium/builders/luci.chromium.try/ios-simulator-cronet

The ios-simulator-cronet compiler says:

../../base/sampling_heap_profiler/poisson_allocation_sampler.cc:101:1: error: thread-local storage is not supported for the current target
thread_local bool g_internal_reentry_guard;
^
../../base/sampling_heap_profiler/poisson_allocation_sampler.cc:104:1: error: thread-local storage is not supported for the current target
thread_local intptr_t g_accumulated_bytes_tls;
^
../../base/sampling_heap_profiler/poisson_allocation_sampler.cc:113:1: error: thread-local storage is not supported for the current target
thread_local bool g_sampling_interval_initialized_tls;
^
3 errors generated.

This blocks CLs that trigger that builder, including 
https://chromium-review.googlesource.com/c/chromium/src/+/1475156
https://chromium-review.googlesource.com/c/chromium/src/+/1430089


Original change's description:
> [heap profiler] Make use of thread_local instead of base::TLS
> 
> The C++ thread_local is slightly faster while making the code clear.
> Besides that calls to TlsGetValue on Windows may alter the result of
> GetLastError, thus changing behavior of the underlying code.
> 
> BUG=920440
> 
> Change-Id: Ic89632f4a54f35d58b93cdecfffc68fc1a94dac1
> Reviewed-on: https://chromium-review.googlesource.com/c/1461681
> Reviewed-by: Vlad Tsyrklevich <vtsyrklevich@chromium.org>
> Reviewed-by: Erik Chen <erikchen@chromium.org>
> Commit-Queue: Alexei Filippov <alph@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#632819}

TBR=alph@chromium.org,erikchen@chromium.org,vtsyrklevich@chromium.org

# Not skipping CQ checks because original CL landed > 1 day ago.

Bug: 920440
Change-Id: Iad4158b406c1b28d14382a8a711a0304c094389c
Reviewed-on: https://chromium-review.googlesource.com/c/1477076
Reviewed-by: Daniel Bratell <bratell@opera.com>
Commit-Queue: Daniel Bratell <bratell@opera.com>
Cr-Commit-Position: refs/heads/master@{#633092}
diff --git a/base/sampling_heap_profiler/poisson_allocation_sampler.cc b/base/sampling_heap_profiler/poisson_allocation_sampler.cc
index 3522e328..1f393456 100644
--- a/base/sampling_heap_profiler/poisson_allocation_sampler.cc
+++ b/base/sampling_heap_profiler/poisson_allocation_sampler.cc
@@ -19,49 +19,87 @@
 #include "base/sampling_heap_profiler/lock_free_address_hash_set.h"
 #include "build/build_config.h"
 
-#if defined(OS_MACOSX) || defined(OS_ANDROID)
+#if defined(OS_POSIX)
 #include <pthread.h>
 #endif
 
+#if defined(OS_WIN)
+#include <windows.h>
+#endif
+
 namespace base {
 
 using allocator::AllocatorDispatch;
 
 namespace {
 
-#if defined(OS_MACOSX) || defined(OS_ANDROID)
+// PoissonAllocationSampler cannot use ThreadLocalStorage, as during thread
+// exiting when TLS storage is already released, there might be a call to
+// |free| which would trigger the profiler hook and would make it access TLS.
+// It instead uses OS primitives directly. As it only stores POD types it
+// does not need thread exit callbacks.
+#if defined(OS_WIN)
 
-// The macOS implementation of libmalloc sometimes calls malloc recursively,
+using TLSKey = DWORD;
+
+void TLSInit(TLSKey* key) {
+  *key = ::TlsAlloc();
+  CHECK_NE(TLS_OUT_OF_INDEXES, *key);
+}
+
+uintptr_t TLSGetValue(const TLSKey& key) {
+  return reinterpret_cast<uintptr_t>(::TlsGetValue(key));
+}
+
+void TLSSetValue(const TLSKey& key, uintptr_t value) {
+  ::TlsSetValue(key, reinterpret_cast<LPVOID>(value));
+}
+
+#else  // defined(OS_WIN)
+
+using TLSKey = pthread_key_t;
+
+void TLSInit(TLSKey* key) {
+  int result = pthread_key_create(key, nullptr);
+  CHECK_EQ(0, result);
+}
+
+uintptr_t TLSGetValue(const TLSKey& key) {
+  return reinterpret_cast<uintptr_t>(pthread_getspecific(key));
+}
+
+void TLSSetValue(const TLSKey& key, uintptr_t value) {
+  pthread_setspecific(key, reinterpret_cast<void*>(value));
+}
+
+#endif
+
+// On MacOS the implementation of libmalloc sometimes calls malloc recursively,
 // delegating allocations between zones. That causes our hooks being called
 // twice. The scoped guard allows us to detect that.
-//
-// Besides that the implementations of thread_local on macOS and Android
-// seem to allocate memory lazily on the first access to thread_local variables.
-// Make use of pthread TLS instead of C++ thread_local there.
+#if defined(OS_MACOSX)
+
 class ReentryGuard {
  public:
-  ReentryGuard() : allowed_(!pthread_getspecific(entered_key_)) {
-    pthread_setspecific(entered_key_, reinterpret_cast<void*>(true));
+  ReentryGuard() : allowed_(!TLSGetValue(entered_key_)) {
+    TLSSetValue(entered_key_, true);
   }
 
   ~ReentryGuard() {
     if (LIKELY(allowed_))
-      pthread_setspecific(entered_key_, nullptr);
+      TLSSetValue(entered_key_, false);
   }
 
   operator bool() { return allowed_; }
 
-  static void Init() {
-    int error = pthread_key_create(&entered_key_, nullptr);
-    CHECK(!error);
-  }
+  static void Init() { TLSInit(&entered_key_); }
 
  private:
   bool allowed_;
-  static pthread_key_t entered_key_;
+  static TLSKey entered_key_;
 };
 
-pthread_key_t ReentryGuard::entered_key_;
+TLSKey ReentryGuard::entered_key_;
 
 #else
 
@@ -73,44 +111,21 @@
 
 #endif
 
+TLSKey g_internal_reentry_guard;
+
 const size_t kDefaultSamplingIntervalBytes = 128 * 1024;
 
-// Notes on TLS usage:
-//
-// * There's no safe way to use TLS in malloc() as both C++ thread_local and
-//   pthread do not pose any guarantees on whether they allocate or not.
-// * We think that we can safely use thread_local w/o re-entrancy guard because
-//   the compiler will use "tls static access model" for static builds of
-//   Chrome [https://www.uclibc.org/docs/tls.pdf].
-//   But there's no guarantee that this will stay true, and in practice
-//   it seems to have problems on macOS/Android. These platforms do allocate
-//   on the very first access to a thread_local on each thread.
-// * Directly using/warming-up platform TLS seems to work on all platforms,
-//   but is also not guaranteed to stay true. We make use of it for reentrancy
-//   guards on macOS/Android.
-// * We cannot use Windows Tls[GS]etValue API as it modifies the result of
-//   GetLastError.
-//
-// Android thread_local seems to be using __emutls_get_address from libgcc:
-// https://github.com/gcc-mirror/gcc/blob/master/libgcc/emutls.c
-// macOS version is based on _tlv_get_addr from dyld:
-// https://opensource.apple.com/source/dyld/dyld-635.2/src/threadLocalHelpers.s.auto.html
-
-// The guard protects against reentering on platforms other the macOS and
-// Android.
-thread_local bool g_internal_reentry_guard;
-
 // Accumulated bytes towards sample thread local key.
-thread_local intptr_t g_accumulated_bytes_tls;
+TLSKey g_accumulated_bytes_tls;
 
-// A boolean used to distinguish first allocation on a thread:
-//   false - first allocation on the thread;
-//   true  - otherwise.
+// A boolean used to distinguish first allocation on a thread.
+//   false - first allocation on the thread.
+//   true  - otherwise
 // Since g_accumulated_bytes_tls is initialized with zero the very first
 // allocation on a thread would always trigger the sample, thus skewing the
 // profile towards such allocations. To mitigate that we use the flag to
 // ensure the first allocation is properly accounted.
-thread_local bool g_sampling_interval_initialized_tls;
+TLSKey g_sampling_interval_initialized_tls;
 
 // Controls if sample intervals should not be randomized. Used for testing.
 bool g_deterministic;
@@ -299,18 +314,18 @@
 }  // namespace
 
 PoissonAllocationSampler::ScopedMuteThreadSamples::ScopedMuteThreadSamples() {
-  DCHECK(!g_internal_reentry_guard);
-  g_internal_reentry_guard = true;
+  DCHECK(!TLSGetValue(g_internal_reentry_guard));
+  TLSSetValue(g_internal_reentry_guard, true);
 }
 
 PoissonAllocationSampler::ScopedMuteThreadSamples::~ScopedMuteThreadSamples() {
-  DCHECK(g_internal_reentry_guard);
-  g_internal_reentry_guard = false;
+  DCHECK(TLSGetValue(g_internal_reentry_guard));
+  TLSSetValue(g_internal_reentry_guard, false);
 }
 
 // static
 bool PoissonAllocationSampler::ScopedMuteThreadSamples::IsMuted() {
-  return g_internal_reentry_guard;
+  return TLSGetValue(g_internal_reentry_guard);
 }
 
 PoissonAllocationSampler* PoissonAllocationSampler::instance_;
@@ -328,6 +343,9 @@
 void PoissonAllocationSampler::Init() {
   static bool init_once = []() {
     ReentryGuard::Init();
+    TLSInit(&g_internal_reentry_guard);
+    TLSInit(&g_accumulated_bytes_tls);
+    TLSInit(&g_sampling_interval_initialized_tls);
     return true;
   }();
   ignore_result(init_once);
@@ -409,11 +427,11 @@
                                            const char* context) {
   if (UNLIKELY(!g_running.load(std::memory_order_relaxed)))
     return;
-  g_accumulated_bytes_tls += size;
-  intptr_t accumulated_bytes = g_accumulated_bytes_tls;
+  intptr_t accumulated_bytes = TLSGetValue(g_accumulated_bytes_tls) + size;
   if (LIKELY(accumulated_bytes < 0))
-    return;
-  instance_->DoRecordAlloc(accumulated_bytes, size, address, type, context);
+    TLSSetValue(g_accumulated_bytes_tls, accumulated_bytes);
+  else
+    instance_->DoRecordAlloc(accumulated_bytes, size, address, type, context);
 }
 
 void PoissonAllocationSampler::DoRecordAlloc(intptr_t accumulated_bytes,
@@ -434,10 +452,10 @@
     ++samples;
   } while (accumulated_bytes >= 0);
 
-  g_accumulated_bytes_tls = accumulated_bytes;
+  TLSSetValue(g_accumulated_bytes_tls, accumulated_bytes);
 
-  if (UNLIKELY(!g_sampling_interval_initialized_tls)) {
-    g_sampling_interval_initialized_tls = true;
+  if (UNLIKELY(!TLSGetValue(g_sampling_interval_initialized_tls))) {
+    TLSSetValue(g_sampling_interval_initialized_tls, true);
     // This is the very first allocation on the thread. It always produces an
     // extra sample because g_accumulated_bytes_tls is initialized with zero
     // due to TLS semantics.
@@ -446,7 +464,7 @@
       return;
   }
 
-  if (UNLIKELY(ScopedMuteThreadSamples::IsMuted()))
+  if (UNLIKELY(TLSGetValue(g_internal_reentry_guard)))
     return;
 
   ScopedMuteThreadSamples no_reentrancy_scope;
@@ -473,7 +491,7 @@
 }
 
 void PoissonAllocationSampler::DoRecordFree(void* address) {
-  if (UNLIKELY(ScopedMuteThreadSamples::IsMuted()))
+  if (UNLIKELY(TLSGetValue(g_internal_reentry_guard)))
     return;
   ScopedMuteThreadSamples no_reentrancy_scope;
   AutoLock lock(mutex_);
diff --git a/base/sampling_heap_profiler/sampling_heap_profiler_unittest.cc b/base/sampling_heap_profiler/sampling_heap_profiler_unittest.cc
index 8e08e01..110d8256 100644
--- a/base/sampling_heap_profiler/sampling_heap_profiler_unittest.cc
+++ b/base/sampling_heap_profiler/sampling_heap_profiler_unittest.cc
@@ -9,7 +9,6 @@
 
 #include "base/allocator/allocator_shim.h"
 #include "base/debug/alias.h"
-#include "base/rand_util.h"
 #include "base/threading/simple_thread.h"
 #include "build/build_config.h"
 #include "testing/gtest/include/gtest/gtest.h"
@@ -201,40 +200,4 @@
   });
 }
 
-// Platform TLS: alloc+free[ns]: 22.184  alloc[ns]: 8.910  free[ns]: 13.274
-// thread_local: alloc+free[ns]: 18.353  alloc[ns]: 5.021  free[ns]: 13.331
-TEST_F(SamplingHeapProfilerTest, MANUAL_SamplerMicroBenchmark) {
-  // With the sampling interval of 100KB it happens to record ~ every 450th
-  // allocation in the browser process. We model this pattern here.
-  constexpr size_t sampling_interval = 100000;
-  constexpr size_t allocation_size = sampling_interval / 450;
-  SamplesCollector collector(0);
-  auto* sampler = PoissonAllocationSampler::Get();
-  sampler->SetSamplingInterval(sampling_interval);
-  sampler->AddSamplesObserver(&collector);
-  int kNumAllocations = 50000000;
-
-  base::TimeTicks t0 = base::TimeTicks::Now();
-  for (int i = 1; i <= kNumAllocations; ++i) {
-    sampler->RecordAlloc(
-        reinterpret_cast<void*>(static_cast<intptr_t>(i)), allocation_size,
-        PoissonAllocationSampler::AllocatorType::kMalloc, nullptr);
-  }
-  base::TimeTicks t1 = base::TimeTicks::Now();
-  for (int i = 1; i <= kNumAllocations; ++i)
-    sampler->RecordFree(reinterpret_cast<void*>(static_cast<intptr_t>(i)));
-  base::TimeTicks t2 = base::TimeTicks::Now();
-
-  printf(
-      "alloc+free[ns]: %.3f   alloc[ns]: %.3f   free[ns]: %.3f   "
-      "alloc+free[mln/s]: %.1f   total[ms]: %.1f\n",
-      (t2 - t0).InNanoseconds() * 1. / kNumAllocations,
-      (t1 - t0).InNanoseconds() * 1. / kNumAllocations,
-      (t2 - t1).InNanoseconds() * 1. / kNumAllocations,
-      kNumAllocations / (t2 - t0).InMicrosecondsF(),
-      (t2 - t0).InMillisecondsF());
-
-  sampler->RemoveSamplesObserver(&collector);
-}
-
 }  // namespace base