Add Setup/Teardown option on Benchmark. (#1269)

* Add Setup/Teardown option on Benchmark.

Motivations:
- feature parity with our internal library. (which has ~718 callers)
- more flexible than coordinating setup/teardown inside the benchmark routine.

* change Setup/Teardown callback type to raw function pointers

* add test file to cmake file

* move b.Teardown() up

* add const to param of Setup/Teardown callbacks

* fix  comment and add doc to user_guide

* fix typo

* fix doc, fix test and add bindings to python/benchmark.cc

* fix binding again

* remove explicit C cast - that was wrong

* change policy to reference_internal

* try removing the bindinds ...

* clean up

* add more tests with repetitions and fixtures

* more comments

* init setup/teardown callbacks to NULL

* s/nullptr/NULL

* removed unused var

* change assertion on fixture_interaction::fixture_setup

* move NULL init to .cc file
diff --git a/docs/user_guide.md b/docs/user_guide.md
index e08ed5b..d1a4e1e 100644
--- a/docs/user_guide.md
+++ b/docs/user_guide.md
@@ -18,6 +18,8 @@
 
 [Runtime and Reporting Considerations](#runtime-and-reporting-considerations)
 
+[Setup/Teardown](#setup-teardown)
+
 [Passing Arguments](#passing-arguments)
 
 [Custom Benchmark Name](#custom-benchmark-name)
@@ -238,6 +240,38 @@
 As well as the per-benchmark entries, a preamble in the report will include
 information about the machine on which the benchmarks are run.
 
+<a name="setup-teardown" />
+
+## Setup/Teardown
+
+Global setup/teardown specific to each benchmark can be done by
+passing a callback to Setup/Teardown:
+
+The setup/teardown callbacks will be invoked once for each benchmark.
+If the benchmark is multi-threaded (will run in k threads), they will be invoked exactly once before
+each run with k threads.
+If the benchmark uses different size groups of threads, the above will be true for each size group.
+
+Eg.,
+
+```c++
+static void DoSetup(const benchmark::State& state) {
+}
+
+static void DoTeardown(const benchmark::State& state) {
+}
+
+static void BM_func(benchmark::State& state) {...}
+
+BENCHMARK(BM_func)->Arg(1)->Arg(3)->Threads(16)->Threads(32)->Setup(DoSetup)->Teardown(DoTeardown);
+
+```
+
+In this example, `DoSetup` and `DoTeardown` will be invoked 4 times each,
+specifically, once for each of this family:
+ - BM_func_Arg_1_Threads_16, BM_func_Arg_1_Threads_32
+ - BM_func_Arg_3_Threads_16, BM_func_Arg_3_Threads_32
+
 <a name="passing-arguments" />
 
 ## Passing Arguments
diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h
index 31f6173..4fdb545 100644
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@@ -969,6 +969,23 @@
     return Ranges(ranges);
   }
 
+  // Have "setup" and/or "teardown" invoked once for every benchmark run.
+  // If the benchmark is multi-threaded (will run in k threads concurrently),
+  // the setup callback will be invoked exactly once (not k times) before
+  // each run with k threads. Time allowing (e.g. for a short benchmark), there
+  // may be multiple such runs per benchmark, each run with its own
+  // "setup"/"teardown".
+  //
+  // If the benchmark uses different size groups of threads (e.g. via
+  // ThreadRange), the above will be true for each size group.
+  //
+  // The callback will be passed a State object, which includes the number
+  // of threads, thread-index, benchmark arguments, etc.
+  //
+  // The callback must not be NULL or self-deleting.
+  Benchmark* Setup(void (*setup)(const benchmark::State&));
+  Benchmark* Teardown(void (*teardown)(const benchmark::State&));
+
   // Pass this benchmark object to *func, which can customize
   // the benchmark by calling various methods like Arg, Args,
   // Threads, etc.
@@ -1100,6 +1117,10 @@
   std::vector<Statistics> statistics_;
   std::vector<int> thread_counts_;
 
+  typedef void (*callback_function)(const benchmark::State&);
+  callback_function setup_;
+  callback_function teardown_;
+
   Benchmark& operator=(Benchmark const&);
 };
 
diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc
index 89da519..4de36e3 100644
--- a/src/benchmark_api_internal.cc
+++ b/src/benchmark_api_internal.cc
@@ -78,6 +78,9 @@
   if (!benchmark_.thread_counts_.empty()) {
     name_.threads = StrFormat("threads:%d", threads_);
   }
+
+  setup_ = benchmark_.setup_;
+  teardown_ = benchmark_.teardown_;
 }
 
 State BenchmarkInstance::Run(
@@ -90,5 +93,20 @@
   return st;
 }
 
+void BenchmarkInstance::Setup() const {
+  if (setup_) {
+    State st(/*iters*/ 1, args_, /*thread_id*/ 0, threads_, nullptr, nullptr,
+             nullptr);
+    setup_(st);
+  }
+}
+
+void BenchmarkInstance::Teardown() const {
+  if (teardown_) {
+    State st(/*iters*/ 1, args_, /*thread_id*/ 0, threads_, nullptr, nullptr,
+             nullptr);
+    teardown_(st);
+  }
+}
 }  // namespace internal
 }  // namespace benchmark
diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h
index 592dd46..94c2b29 100644
--- a/src/benchmark_api_internal.h
+++ b/src/benchmark_api_internal.h
@@ -38,6 +38,8 @@
   double min_time() const { return min_time_; }
   IterationCount iterations() const { return iterations_; }
   int threads() const { return threads_; }
+  void Setup() const;
+  void Teardown() const;
 
   State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
             internal::ThreadManager* manager,
@@ -62,6 +64,10 @@
   double min_time_;
   IterationCount iterations_;
   int threads_;  // Number of concurrent threads to us
+
+  typedef void (*callback_function)(const benchmark::State&);
+  callback_function setup_ = nullptr;
+  callback_function teardown_ = nullptr;
 };
 
 bool FindBenchmarksInternal(const std::string& re,
diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc
index 0b148e2..b3f85dc 100644
--- a/src/benchmark_register.cc
+++ b/src/benchmark_register.cc
@@ -211,7 +211,9 @@
       use_real_time_(false),
       use_manual_time_(false),
       complexity_(oNone),
-      complexity_lambda_(nullptr) {
+      complexity_lambda_(nullptr),
+      setup_(nullptr),
+      teardown_(nullptr) {
   ComputeStatistics("mean", StatisticsMean);
   ComputeStatistics("median", StatisticsMedian);
   ComputeStatistics("stddev", StatisticsStdDev);
@@ -321,6 +323,18 @@
   return this;
 }
 
+Benchmark* Benchmark::Setup(void (*setup)(const benchmark::State&)) {
+  BM_CHECK(setup != nullptr);
+  setup_ = setup;
+  return this;
+}
+
+Benchmark* Benchmark::Teardown(void (*teardown)(const benchmark::State&)) {
+  BM_CHECK(teardown != nullptr);
+  teardown_ = teardown;
+  return this;
+}
+
 Benchmark* Benchmark::RangeMultiplier(int multiplier) {
   BM_CHECK(multiplier > 1);
   range_multiplier_ = multiplier;
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
index fd23d46..eac807b 100644
--- a/src/benchmark_runner.cc
+++ b/src/benchmark_runner.cc
@@ -279,7 +279,9 @@
   // is *only* calculated for the *first* repetition, and other repetitions
   // simply use that precomputed iteration count.
   for (;;) {
+    b.Setup();
     i = DoNIterations();
+    b.Teardown();
 
     // Do we consider the results to be significant?
     // If we are doing repetitions, and the first repetition was already done,
@@ -316,10 +318,12 @@
     memory_manager->Start();
     std::unique_ptr<internal::ThreadManager> manager;
     manager.reset(new internal::ThreadManager(1));
+    b.Setup();
     RunInThread(&b, memory_iterations, 0, manager.get(),
                 perf_counters_measurement_ptr);
     manager->WaitForAllThreads();
     manager.reset();
+    b.Teardown();
 
     BENCHMARK_DISABLE_DEPRECATED_WARNING
     memory_manager->Stop(memory_result);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 0f7b265..162af53 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -59,6 +59,9 @@
 compile_benchmark_test(spec_arg_test)
 add_test(NAME spec_arg COMMAND spec_arg_test --benchmark_filter=BM_NotChosen)
 
+compile_benchmark_test(benchmark_setup_teardown_test)
+add_test(NAME benchmark_setup_teardown COMMAND benchmark_setup_teardown_test)
+
 compile_benchmark_test(filter_test)
 macro(add_filter_test name filter expect)
   add_test(NAME ${name} COMMAND filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect})
diff --git a/test/benchmark_setup_teardown_test.cc b/test/benchmark_setup_teardown_test.cc
new file mode 100644
index 0000000..f18c570
--- /dev/null
+++ b/test/benchmark_setup_teardown_test.cc
@@ -0,0 +1,157 @@
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+// Test that Setup() and Teardown() are called exactly once
+// for each benchmark run (single-threaded).
+namespace single {
+static int setup_call = 0;
+static int teardown_call = 0;
+}  // namespace single
+static void DoSetup1(const benchmark::State& state) {
+  ++single::setup_call;
+
+  // Setup/Teardown should never be called with any thread_idx != 0.
+  assert(state.thread_index() == 0);
+}
+
+static void DoTeardown1(const benchmark::State& state) {
+  ++single::teardown_call;
+  assert(state.thread_index() == 0);
+}
+
+static void BM_with_setup(benchmark::State& state) {
+  for (auto s : state) {
+  }
+}
+BENCHMARK(BM_with_setup)
+    ->Arg(1)
+    ->Arg(3)
+    ->Arg(5)
+    ->Arg(7)
+    ->Iterations(100)
+    ->Setup(DoSetup1)
+    ->Teardown(DoTeardown1);
+
+// Test that Setup() and Teardown() are called once for each group of threads.
+namespace concurrent {
+static std::atomic<int> setup_call(0);
+static std::atomic<int> teardown_call(0);
+static std::atomic<int> func_call(0);
+}  // namespace concurrent
+
+static void DoSetup2(const benchmark::State& state) {
+  concurrent::setup_call.fetch_add(1, std::memory_order_acquire);
+  assert(state.thread_index() == 0);
+}
+
+static void DoTeardown2(const benchmark::State& state) {
+  concurrent::teardown_call.fetch_add(1, std::memory_order_acquire);
+  assert(state.thread_index() == 0);
+}
+
+static void BM_concurrent(benchmark::State& state) {
+  for (auto s : state) {
+  }
+  concurrent::func_call.fetch_add(1, std::memory_order_acquire);
+}
+
+BENCHMARK(BM_concurrent)
+    ->Setup(DoSetup2)
+    ->Teardown(DoTeardown2)
+    ->Iterations(100)
+    ->Threads(5)
+    ->Threads(10)
+    ->Threads(15);
+
+// Testing interaction with Fixture::Setup/Teardown
+namespace fixture_interaction {
+int setup = 0;
+int fixture_setup = 0;
+}  // namespace fixture_interaction
+
+#define FIXTURE_BECHMARK_NAME MyFixture
+
+class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
+ public:
+  void SetUp(const ::benchmark::State& state) BENCHMARK_OVERRIDE {
+    fixture_interaction::fixture_setup++;
+  }
+
+  ~FIXTURE_BECHMARK_NAME() {}
+};
+
+BENCHMARK_F(FIXTURE_BECHMARK_NAME, BM_WithFixture)(benchmark::State& st) {
+  for (auto _ : st) {
+  }
+}
+
+static void DoSetupWithFixture(const benchmark::State& state) {
+  fixture_interaction::setup++;
+}
+
+BENCHMARK_REGISTER_F(FIXTURE_BECHMARK_NAME, BM_WithFixture)
+    ->Arg(1)
+    ->Arg(3)
+    ->Arg(5)
+    ->Arg(7)
+    ->Setup(DoSetupWithFixture)
+    ->Repetitions(1)
+    ->Iterations(100);
+
+// Testing repetitions.
+namespace repetitions {
+int setup = 0;
+}
+
+static void DoSetupWithRepetitions(const benchmark::State& state) {
+  repetitions::setup++;
+}
+static void BM_WithRep(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+
+BENCHMARK(BM_WithRep)
+    ->Arg(1)
+    ->Arg(3)
+    ->Arg(5)
+    ->Arg(7)
+    ->Setup(DoSetupWithRepetitions)
+    ->Iterations(100)
+    ->Repetitions(4);
+
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+
+  size_t ret = benchmark::RunSpecifiedBenchmarks(".");
+  assert(ret > 0);
+
+  // Setup/Teardown is called once for each arg group (1,3,5,7).
+  assert(single::setup_call == 4);
+  assert(single::teardown_call == 4);
+
+  // 3 groups of threads calling this function (5,10,15).
+  assert(concurrent::setup_call.load(std::memory_order_relaxed) == 3);
+  assert(concurrent::teardown_call.load(std::memory_order_relaxed) == 3);
+  assert((5 + 10 + 15) ==
+         concurrent::func_call.load(std::memory_order_relaxed));
+
+  // Setup is called 4 times, once for each arg group (1,3,5,7)
+  assert(fixture_interaction::setup == 4);
+  // Fixture::SetUp is called every time the bm routine is run.
+  // The exact number is nondeterministic, so we just assert that
+  // it's more than setup.
+  assert(fixture_interaction::fixture_setup > fixture_interaction::setup);
+
+  // Setup is called once for each repetition * num_arg = 4 * 4 = 16.
+  assert(repetitions::setup == 16);
+
+  return 0;
+}