/*
* Copyright (c) 2012 The Native Client Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include <inttypes.h>
#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "native_client/src/include/build_config.h"
#include "native_client/src/include/nacl_assert.h"
#include "native_client/src/include/nacl_macros.h"
#include "native_client/tests/performance/perf_test_compat_osx.h"
#include "native_client/tests/performance/perf_test_runner.h"
double TimeIterations(PerfTest *test, int iterations) {
struct timespec start_time;
struct timespec end_time;
ASSERT_EQ(clock_gettime(CLOCK_MONOTONIC, &start_time), 0);
for (int i = 0; i < iterations; i++) {
test->run();
}
ASSERT_EQ(clock_gettime(CLOCK_MONOTONIC, &end_time), 0);
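  // Convert the timespec difference to seconds; the nanosecond field
  // supplies the fractional part.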
double total_time =
(end_time.tv_sec - start_time.tv_sec
+ (double) (end_time.tv_nsec - start_time.tv_nsec) / 1e9);
// Output the raw data.
printf(" %.3f usec (%g sec) per iteration: %g sec for %i iterations\n",
total_time / iterations * 1e6,
total_time / iterations,
total_time, iterations);
return total_time;
}
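
// Determines how many iterations each timing sample should run so that
// |sample_count| samples take roughly |target_time| seconds in total.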
int CalibrateIterationCount(PerfTest *test, double target_time,
int sample_count) {
int calibration_iterations = 100;
double calibration_time;
for (;;) {
calibration_time = TimeIterations(test, calibration_iterations);
// If the test completed too quickly to get an accurate
// measurement, try a larger number of iterations.
if (calibration_time >= 1e-5)
break;
ASSERT_LE(calibration_iterations, INT_MAX / 10);
calibration_iterations *= 10;
}
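  // Scale the measured per-iteration time up to the iteration count that
  // should consume target_time when spread across sample_count samples.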
double iterations_d =
(target_time / (calibration_time / calibration_iterations)
/ sample_count);
// Sanity checks for very fast or very slow tests.
ASSERT_LE(iterations_d, INT_MAX);
int iterations = iterations_d;
if (iterations < 1)
iterations = 1;
return iterations;
}
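
// Takes |sample_count| timing samples of |test| and reports the mean and
// standard deviation of the per-iteration time, in seconds.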
void TimePerfTest(PerfTest *test, double *mean, double *stddev) {
// 'target_time' is the amount of time we aim to run this perf test
// for in total.
double target_time = 0.5; // seconds
// 'sample_count' is the number of separate timings we take in order
// to measure the variability of the results.
int sample_count = 5;
int iterations = CalibrateIterationCount(test, target_time, sample_count);
double sum = 0;
double sum_of_squares = 0;
for (int i = 0; i < sample_count; i++) {
double time = TimeIterations(test, iterations) / iterations;
sum += time;
sum_of_squares += time * time;
}
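  // Derive the mean and standard deviation from the running sums, using
  // Var(X) = E[X^2] - (E[X])^2.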
*mean = sum / sample_count;
*stddev = sqrt(sum_of_squares / sample_count - *mean * *mean);
}
void PerfTestRealTime(const char *description_string, const char *test_name,
PerfTest *test, double *result_mean) {
double mean;
double stddev;
printf("Measuring real time:\n");
TimePerfTest(test, &mean, &stddev);
printf(" mean: %.6f usec\n", mean * 1e6);
printf(" stddev: %.6f usec\n", stddev * 1e6);
printf(" relative stddev: %.2f%%\n", stddev / mean * 100);
// Output the result in a format that Buildbot will recognise in the
// logs and record, using the Chromium perf testing infrastructure.
printf("RESULT %s: %s= {%.6f, %.6f} us\n",
test_name, description_string, mean * 1e6, stddev * 1e6);
*result_mean = mean;
}
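
// Cycle-count measurements are only available on x86, where the rdtsc
// instruction gives direct access to the CPU's timestamp counter.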
#if defined(__i386__) || defined(__x86_64__)
static INLINE uint64_t ReadTimestampCounter() {
uint32_t edx; // Top 32 bits of timestamp
uint32_t eax; // Bottom 32 bits of timestamp
// NaCl's x86 validators don't allow rdtscp, so we can't check
// whether the thread has been moved to a different core.
__asm__ volatile("rdtsc" : "=d"(edx), "=a"(eax));
return (((uint64_t) edx) << 32) | eax;
}
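
// qsort() comparator for uint64_t values, used to sort the cycle counts.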
static int CompareUint64(const void *val1, const void *val2) {
uint64_t i1 = *(uint64_t *) val1;
uint64_t i2 = *(uint64_t *) val2;
if (i1 == i2)
return 0;
return i1 < i2 ? -1 : 1;
}
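
// Measures the cost of a single run of |test| in CPU cycles and reports the
// median and interquartile range, which are less sensitive to outliers than
// the mean and standard deviation.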
void PerfTestCycleCount(const char *description_string, const char *test_name,
PerfTest *test, uint64_t *result_cycles) {
printf("Measuring clock cycles:\n");
uint64_t times[101];
for (size_t i = 0; i < NACL_ARRAY_SIZE(times); i++) {
uint64_t start_time = ReadTimestampCounter();
test->run();
uint64_t end_time = ReadTimestampCounter();
times[i] = end_time - start_time;
}
// We expect the first run to be slower because caches won't be
// warm. We print the first and slowest runs so that we can verify
// this.
printf(" first runs (cycles): ");
for (size_t i = 0; i < 10; i++)
printf(" %" PRId64, times[i]);
printf(" ...\n");
qsort(times, NACL_ARRAY_SIZE(times), sizeof(times[0]), CompareUint64);
printf(" slowest runs (cycles): ...");
for (size_t i = NACL_ARRAY_SIZE(times) - 10; i < NACL_ARRAY_SIZE(times); i++)
printf(" %" PRId64, times[i]);
printf("\n");
int count = NACL_ARRAY_SIZE(times) - 1;
uint64_t q1 = times[count * 1 / 4]; // First quartile
uint64_t q2 = times[count * 1 / 2]; // Median
uint64_t q3 = times[count * 3 / 4]; // Third quartile
printf(" min: %" PRId64 " cycles\n", times[0]);
printf(" q1: %" PRId64 " cycles\n", q1);
printf(" median: %" PRId64 " cycles\n", q2);
printf(" q3: %" PRId64 " cycles\n", q3);
printf(" max: %" PRId64 " cycles\n", times[count]);
// The "{...}" RESULT syntax usually means standard deviation but
// here we report the interquartile range.
printf("RESULT %s_CycleCount: %s= {%" PRId64 ", %" PRId64 "} count\n",
test_name, description_string, q2, q3 - q1);
*result_cycles = q2;
}
#endif
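
// Runs |test| under each available timing method and then frees it.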
void RunPerfTest(const char *description_string, const char *test_name,
PerfTest *test) {
printf("\n%s:\n", test_name);
double mean_time;
PerfTestRealTime(description_string, test_name, test, &mean_time);
#if defined(__i386__) || defined(__x86_64__)
uint64_t cycles;
PerfTestCycleCount(description_string, test_name, test, &cycles);
// The apparent clock speed can be used to sanity-check the results,
// e.g. to see whether the CPU is in power-saving mode.
printf("Apparent clock speed: %.0f MHz\n", cycles / mean_time / 1e6);
#endif
delete test;
}
int main(int argc, char **argv) {
const char *description_string = argc >= 2 ? argv[1] : "time";
// Turn off stdout buffering to aid debugging.
setvbuf(stdout, NULL, _IONBF, 0);
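  // RUN_TEST forward-declares the test's factory function, Make<class_name>()
  // (defined alongside the test itself), and runs the test it creates.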
#define RUN_TEST(class_name) \
extern PerfTest *Make##class_name(); \
RunPerfTest(description_string, #class_name, Make##class_name());
RUN_TEST(TestNull);
#if defined(__native_client__)
RUN_TEST(TestNaClSyscall);
#endif
#if NACL_LINUX || NACL_OSX
RUN_TEST(TestHostSyscall);
#endif
RUN_TEST(TestSetjmpLongjmp);
RUN_TEST(TestClockGetTime);
#if !NACL_OSX
RUN_TEST(TestTlsVariable);
#endif
RUN_TEST(TestMmapAnonymous);
RUN_TEST(TestAtomicIncrement);
RUN_TEST(TestUncontendedMutexLock);
RUN_TEST(TestCondvarSignalNoOp);
RUN_TEST(TestThreadCreateAndJoin);
RUN_TEST(TestThreadWakeup);
#if defined(__native_client__)
// Test untrusted fault handling. This should come last because, on
// Windows, registering a fault handler has a performance impact on
// thread creation and exit. This is because when the Windows debug
// exception handler is attached to sel_ldr as a debugger, Windows
// suspends the whole sel_ldr process every time a thread is created
// or exits.
RUN_TEST(TestCatchingFault);
// Measure that overhead by running MakeTestThreadCreateAndJoin again.
RunPerfTest(description_string,
"TestThreadCreateAndJoinAfterSettingFaultHandler",
MakeTestThreadCreateAndJoin());
#endif
#undef RUN_TEST
return 0;
}