#include "SpeedTest.h" | |
#include "Random.h" | |
#include <stdio.h> // for printf | |
#include <memory.h> // for memset | |
#include <math.h> // for sqrt | |
#include <algorithm> // for sort | |
//----------------------------------------------------------------------------- | |
// We view our timing values as a series of random variables V that has been | |
// contaminated with occasional outliers due to cache misses, thread | |
// preemption, etcetera. To filter out the outliers, we search for the largest | |
// subset of V such that all its values are within three standard deviations | |
// of the mean. | |
double CalcMean ( std::vector<double> & v ) | |
{ | |
double mean = 0; | |
for(int i = 0; i < (int)v.size(); i++) | |
{ | |
mean += v[i]; | |
} | |
mean /= double(v.size()); | |
return mean; | |
} | |
double CalcMean ( std::vector<double> & v, int a, int b ) | |
{ | |
double mean = 0; | |
for(int i = a; i <= b; i++) | |
{ | |
mean += v[i]; | |
} | |
mean /= (b-a+1); | |
return mean; | |
} | |
double CalcStdv ( std::vector<double> & v, int a, int b ) | |
{ | |
double mean = CalcMean(v,a,b); | |
double stdv = 0; | |
for(int i = a; i <= b; i++) | |
{ | |
double x = v[i] - mean; | |
stdv += x*x; | |
} | |
stdv = sqrt(stdv / (b-a+1)); | |
return stdv; | |
} | |
// Return true if the largest value in v[0,len) is more than three | |
// standard deviations from the mean | |
bool ContainsOutlier ( std::vector<double> & v, size_t len ) | |
{ | |
double mean = 0; | |
for(size_t i = 0; i < len; i++) | |
{ | |
mean += v[i]; | |
} | |
mean /= double(len); | |
double stdv = 0; | |
for(size_t i = 0; i < len; i++) | |
{ | |
double x = v[i] - mean; | |
stdv += x*x; | |
} | |
stdv = sqrt(stdv / double(len)); | |
double cutoff = mean + stdv*3; | |
return v[len-1] > cutoff; | |
} | |
// Do a binary search to find the largest subset of v that does not contain | |
// outliers. | |
void FilterOutliers ( std::vector<double> & v ) | |
{ | |
std::sort(v.begin(),v.end()); | |
size_t len = 0; | |
for(size_t x = 0x40000000; x; x = x >> 1 ) | |
{ | |
if((len | x) >= v.size()) continue; | |
if(!ContainsOutlier(v,len | x)) | |
{ | |
len |= x; | |
} | |
} | |
v.resize(len); | |
} | |
// Iteratively tighten the set to find a subset that does not contain | |
// outliers. I'm not positive this works correctly in all cases. | |
void FilterOutliers2 ( std::vector<double> & v ) | |
{ | |
std::sort(v.begin(),v.end()); | |
int a = 0; | |
int b = (int)(v.size() - 1); | |
for(int i = 0; i < 10; i++) | |
{ | |
//printf("%d %d\n",a,b); | |
double mean = CalcMean(v,a,b); | |
double stdv = CalcStdv(v,a,b); | |
double cutA = mean - stdv*3; | |
double cutB = mean + stdv*3; | |
while((a < b) && (v[a] < cutA)) a++; | |
while((b > a) && (v[b] > cutB)) b--; | |
} | |
std::vector<double> v2; | |
v2.insert(v2.begin(),v.begin()+a,v.begin()+b+1); | |
v.swap(v2); | |
} | |
//----------------------------------------------------------------------------- | |
// We really want the rdtsc() calls to bracket the function call as tightly | |
// as possible, but that's hard to do portably. We'll try and get as close as | |
// possible by marking the function as NEVER_INLINE (to keep the optimizer from | |
// moving it) and marking the timing variables as "volatile register". | |
NEVER_INLINE int64_t timehash ( pfHash hash, const void * key, int len, int seed ) | |
{ | |
volatile register int64_t begin,end; | |
uint32_t temp[16]; | |
begin = rdtsc(); | |
hash(key,len,seed,temp); | |
end = rdtsc(); | |
return end-begin; | |
} | |
//----------------------------------------------------------------------------- | |
double SpeedTest ( pfHash hash, uint32_t seed, const int trials, const int blocksize, const int align ) | |
{ | |
Rand r(seed); | |
uint8_t * buf = new uint8_t[blocksize + 512]; | |
uint64_t t1 = reinterpret_cast<uint64_t>(buf); | |
t1 = (t1 + 255) & BIG_CONSTANT(0xFFFFFFFFFFFFFF00); | |
t1 += align; | |
uint8_t * block = reinterpret_cast<uint8_t*>(t1); | |
r.rand_p(block,blocksize); | |
//---------- | |
std::vector<double> times; | |
times.reserve(trials); | |
for(int itrial = 0; itrial < trials; itrial++) | |
{ | |
r.rand_p(block,blocksize); | |
double t = (double)timehash(hash,block,blocksize,itrial); | |
if(t > 0) times.push_back(t); | |
} | |
//---------- | |
std::sort(times.begin(),times.end()); | |
FilterOutliers(times); | |
delete [] buf; | |
return CalcMean(times); | |
} | |
//----------------------------------------------------------------------------- | |
// 256k blocks seem to give the best results. | |
void BulkSpeedTest ( pfHash hash, uint32_t seed ) | |
{ | |
const int trials = 2999; | |
const int blocksize = 256 * 1024; | |
printf("Bulk speed test - %d-byte keys\n",blocksize); | |
for(int align = 0; align < 8; align++) | |
{ | |
double cycles = SpeedTest(hash,seed,trials,blocksize,align); | |
double bestbpc = double(blocksize)/cycles; | |
double bestbps = (bestbpc * 3000000000.0 / 1048576.0); | |
printf("Alignment %2d - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n",align,bestbpc,bestbps); | |
} | |
} | |
//----------------------------------------------------------------------------- | |
void TinySpeedTest ( pfHash hash, int hashsize, int keysize, uint32_t seed, bool verbose, double & /*outCycles*/ ) | |
{ | |
const int trials = 999999; | |
if(verbose) printf("Small key speed test - %4d-byte keys - ",keysize); | |
double cycles = SpeedTest(hash,seed,trials,keysize,0); | |
printf("%8.2f cycles/hash\n",cycles); | |
} | |
//----------------------------------------------------------------------------- |