[MicroBenchmarks] Add benchmarks for vector memory check generation.

This patch adds an initial set of benchmarks for memory runtime check
generation.

It comes with a function that takes 4 pointers and accesses them in a
vectorizable inner loop, but it requires runtime checks. The outer loop
should increase the contribution of the runtime checks to the overall
runtime.

The function is then used to benchmark multiple different scenarios
where all pointers are completely disjoint or overlap in different ways.

The goal of the initial benchmarks is to measure the impact of D119078.

The current benchmarks focus on cases that can be supported by the initial
version of D119078, but should be extended as more cases can be handled.

Reviewed By: Meinersbur

Differential Revision: https://reviews.llvm.org/D121008
diff --git a/MicroBenchmarks/LoopVectorization/CMakeLists.txt b/MicroBenchmarks/LoopVectorization/CMakeLists.txt
index 99e94d8..a19a7c0 100644
--- a/MicroBenchmarks/LoopVectorization/CMakeLists.txt
+++ b/MicroBenchmarks/LoopVectorization/CMakeLists.txt
@@ -11,6 +11,8 @@
 
 llvm_test_executable(LoopVectorizationBenchmarks
   main.cpp
-  MathFunctions.cpp)
+  MathFunctions.cpp
+  RuntimeChecks.cpp
+)
 
 target_link_libraries(LoopVectorizationBenchmarks benchmark)
diff --git a/MicroBenchmarks/LoopVectorization/RuntimeChecks.cpp b/MicroBenchmarks/LoopVectorization/RuntimeChecks.cpp
new file mode 100644
index 0000000..51ec2a1
--- /dev/null
+++ b/MicroBenchmarks/LoopVectorization/RuntimeChecks.cpp
@@ -0,0 +1,128 @@
+#include <limits>
+#include <memory>
+#include <random>
+#include <stdint.h>
+
+#include "benchmark/benchmark.h"
+
+// Default-seeded Mersenne Twister: every run produces the same pseudo-random
+// sequence, so benchmark input data is deterministic across invocations.
+static std::mt19937 rng;
+
+// Fill the first N elements of A with random values spanning the full
+// representable range of Ty. The distribution is parameterized on int64_t so
+// that the limits of an unsigned 32-bit Ty still fit; each draw is narrowed
+// back to Ty with an explicit cast.
+template <typename Ty>
+static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
+  std::uniform_int_distribution<int64_t> distrib(
+      std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
+  for (unsigned i = 0; i < N; i++)
+    A[i] = static_cast<Ty>(distrib(rng));
+}
+
+// Kernel under test: the inner loop computes A[J] = B[J] + C[J] + D[J] and is
+// vectorizable, but because the four pointers may alias, the vectorizer has
+// to guard the vector loop with runtime memory checks. The outer loop
+// re-enters the inner loop 1000 times with shifted pointers so the checks
+// execute repeatedly; their cost dominates for small trip counts TC.
+//
+// A advances by Step elements per outer iteration; B, C and D advance by one.
+static void vecWithRuntimeChecks4Pointers(uint32_t *A, uint32_t *B, uint32_t *C,
+                                          uint32_t *D, unsigned TC,
+                                          unsigned Step) {
+  // Keep the outer loop rolled so exactly 1000 check-guarded inner-loop
+  // entries occur, independent of the optimizer's unrolling heuristics.
+#pragma clang loop unroll(disable)
+  for (unsigned I = 0; I < 1000; I++) {
+    // Make sure the inner loop cannot be optimized out.
+    benchmark::ClobberMemory();
+
+#pragma clang loop interleave_count(1)
+#pragma clang loop unroll(disable)
+    for (unsigned J = 0; J < TC; ++J) {
+      A[J] = B[J] + C[J] + D[J];
+    }
+
+    // Advancing the pointers every iteration makes sure the runtime checks
+    // cannot be hoisted out of the outer loop.
+    A += Step;
+    B++;
+    C++;
+    D++;
+  }
+}
+
+/// Invoke \p f through an optnone trampoline so the compiler cannot
+/// specialize or constant-fold \p f based on the concrete argument values at
+/// the call site — in particular it cannot prove (or disprove) at compile
+/// time that the pointer arguments alias, which would defeat the runtime
+/// checks being benchmarked.
+template <typename F, typename... Args>
+__attribute__((optnone)) static void callThroughOptnone(F &&f, Args &&...args) {
+  f(std::forward<Args>(args)...);
+}
+
+// Benchmark for when runtime checks are passing: the four pointers address
+// disjoint TC-element slices of a single allocation, in increasing address
+// order (offsets 0, TC, 2*TC, 3*TC). The 1000 extra elements in Size absorb
+// the per-iteration pointer advances inside the kernel, keeping all accesses
+// in bounds.
+void benchVecWithRuntimeChecks4PointersAllDisjointIncreasing(
+    benchmark::State &state) {
+  unsigned TC = state.range(0);
+  unsigned Size = 4 * TC + 1000;
+  std::unique_ptr<uint32_t[]> A(new uint32_t[Size]);
+
+  init_data(A, Size);
+  for (auto _ : state) {
+    callThroughOptnone(vecWithRuntimeChecks4Pointers, &A[0], &A[TC], &A[2 * TC],
+                       &A[3 * TC], TC, 1);
+    benchmark::DoNotOptimize(A);
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(benchVecWithRuntimeChecks4PointersAllDisjointIncreasing)
+    ->Arg(32)
+    ->Arg(1000);
+
+// Same disjoint-slice layout as the Increasing variant, but the slices are
+// handed to the kernel in decreasing address order — the store pointer is the
+// highest (offsets 3*TC, 2*TC, TC, 0). Runtime checks still pass.
+void benchVecWithRuntimeChecks4PointersAllDisjointDecreasing(
+    benchmark::State &state) {
+  unsigned TC = state.range(0);
+  unsigned Size = 4 * TC + 1000;
+  std::unique_ptr<uint32_t[]> A(new uint32_t[Size]);
+  init_data(A, Size);
+
+  for (auto _ : state) {
+    callThroughOptnone(vecWithRuntimeChecks4Pointers, &A[3 * TC], &A[2 * TC],
+                       &A[1 * TC], &A[0], TC, 1);
+    benchmark::DoNotOptimize(A);
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(benchVecWithRuntimeChecks4PointersAllDisjointDecreasing)
+    ->Arg(32)
+    ->Arg(1000);
+
+// The store pointer and the last load pointer are identical (&A[0] for both):
+// every inner iteration reads and writes the same element, i.e. a dependence
+// distance of zero. NOTE(review): positioned among the check-passing
+// scenarios — confirm the vectorizer treats this exact-alias case as safe.
+void benchVecWithRuntimeChecks4PointersDEqualsA(benchmark::State &state) {
+  unsigned TC = state.range(0);
+  unsigned Size = 4 * TC + 1000;
+  std::unique_ptr<uint32_t[]> A(new uint32_t[Size]);
+  init_data(A, Size);
+
+  for (auto _ : state) {
+    callThroughOptnone(vecWithRuntimeChecks4Pointers, &A[0], &A[TC], &A[2 * TC],
+                       &A[0], TC, 1);
+    benchmark::DoNotOptimize(A);
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(benchVecWithRuntimeChecks4PointersDEqualsA)->Arg(32)->Arg(1000);
+
+// Benchmark for when runtime checks are failing: the store pointer starts two
+// elements after the last load pointer (&A[2] vs &A[0]), so for TC > 2 the
+// written range overlaps the range read through D and the overlap checks
+// reject the vectorized loop.
+void benchVecWithRuntimeChecks4PointersDBeforeA(benchmark::State &state) {
+  unsigned TC = state.range(0);
+  unsigned Size = 4 * TC + 1000;
+  std::unique_ptr<uint32_t[]> A(new uint32_t[Size]);
+  init_data(A, Size);
+
+  for (auto _ : state) {
+    callThroughOptnone(vecWithRuntimeChecks4Pointers, &A[2], &A[2 * TC],
+                       &A[3 * TC], &A[0], TC, 1);
+    benchmark::DoNotOptimize(A);
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(benchVecWithRuntimeChecks4PointersDBeforeA)->Arg(32)->Arg(1000);
+
+// Failing-check variant with the opposite ordering to DBeforeA: the last load
+// pointer starts two elements after the store pointer (&A[2] vs &A[0]), again
+// overlapping the written range for TC > 2.
+void benchVecWithRuntimeChecks4PointersDAfterA(benchmark::State &state) {
+  unsigned TC = state.range(0);
+  unsigned Size = 4 * TC + 1000;
+  std::unique_ptr<uint32_t[]> A(new uint32_t[Size]);
+  init_data(A, Size);
+
+  for (auto _ : state) {
+    callThroughOptnone(vecWithRuntimeChecks4Pointers, &A[0], &A[2 * TC],
+                       &A[3 * TC], &A[2], TC, 1);
+    benchmark::DoNotOptimize(A);
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(benchVecWithRuntimeChecks4PointersDAfterA)->Arg(32)->Arg(1000);