tests/sse/benchmark_sse.h - external/github.com/kripken/emscripten - Git at Google

 #pragma once

 #include <stdio.h>
 #include <math.h>
 #include <time.h>
 #include <inttypes.h>
 #include <stdlib.h>
 #include <assert.h>

 #ifdef __EMSCRIPTEN__
 #include <emscripten/emscripten.h>
 #endif

 #if defined(__unix__) && !defined(__EMSCRIPTEN__) // Native build without Emscripten.
 #include <time.h>
 #include <errno.h>
 #include <string.h>
 #endif

 #ifdef __APPLE__
 #define aligned_alloc(align, size) malloc((size))
 #endif

 #ifdef WIN32
 #include <Windows.h>
 #define aligned_alloc(align, size) _aligned_malloc((size), (align))
 #endif

 // Scalar horizonal max across four lanes.
 float hmax(__m128 m)
 {
 	float f[4];
 	_mm_storeu_ps(f, m);
 	return fmax(fmax(f[0], f[1]), fmax(f[2], f[3]));
 }

 #include "../tick.h"

 const int N = 8*1024*1024;

 tick_t scalarTotalTicks = 0;
 tick_t simdTotalTicks = 0;
 tick_t scalarTicks = 0;
 const char *chartName = "";
 #define SETCHART(x) chartName = (x);

 #define START() \
 	do { \
 		tick_t start = tick();

 bool comma=false;
 #define END(result, name) \
 		tick_t end = tick(); \
 		tick_t ticks = end - start; \
 		scalarTotalTicks += scalarTicks; \
 		simdTotalTicks += ticks; \
 		double nsecs = (double)ticks * 1000.0 * 1000.0 * 1000.0 / ticks_per_sec() / N; \
 		printf("%s{ \"chart\": \"%s\", \"category\": \"%s\", \"scalar\": %f, \"simd\": %f }\n", comma?",":"", chartName, name, scalarTime, nsecs); \
 		comma = true; \
 		printf("%s", (result) != 0 ? "Error!" : ""); \
 	} while(0)

 #define ENDSCALAR(result, name) \
 		tick_t end = tick(); \
 		scalarTicks = end - start; \
 		scalarTime = (double)scalarTicks * 1000.0 * 1000.0 * 1000.0 / ticks_per_sec() / N; \
 		printf("%s", (result) != 0 ? "Error!" : ""); \
 	} while(0)

 void Print(__m128 m)
 {
 	float val[4];
 	_mm_storeu_ps(val, m);
 	fprintf(stderr, "[%g, %g, %g, %g]\n", val[3], val[2], val[1], val[0]);
 }

 bool always_true() { return time(NULL) != 0; } // This function always returns true, but the compiler should not know this.

 #ifdef _MSC_VER
 #define NOINLINE __declspec(noinline)
 #define INLINE __forceinline
 #else
 #define NOINLINE __attribute__((noinline))
 #define INLINE __inline__
 #endif

 // Slightly awkward way to allocate so that compiler will definitely not see this memory area as compile-time optimizable:
 int NOINLINE *alloc_int_buffer() { return always_true() ? (int*)aligned_alloc(16, (N+16)*sizeof(int)) : 0; }
 float NOINLINE *alloc_float_buffer() { return always_true() ? (float*)aligned_alloc(16, (N+16)*sizeof(float)) : 0; }
 double NOINLINE *alloc_double_buffer() { return always_true() ? (double*)aligned_alloc(16, (N+16)*sizeof(double)) : 0; }

 template<typename T>
 T checksum_dst(T *dst)
 {
 	if (always_true()) return 0.f;
 	else
 	{
 		T s = 0.f; for(int i = 0; i < N; ++i) s += dst[i];
 		return s;
 	}
 }

 uint32_t fcastu(float f) { return *(uint32_t*)&f; }
 uint64_t dcastu(double f) { return *(uint64_t*)&f; }
 float ucastf(uint32_t t) { return *(float*)&t; }
 double ucastd(uint64_t t) { return *(double*)&t; }

 #define LOAD_STORE_F(msg, load_instr, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride) \
 	START(); \
 		for(int i = 0; i < N; i += num_elems_stride) \
 			store_instr((store_ptr_type)dst_flt+store_offset+i, load_instr(src_flt+load_offset+i)); \
 	END(checksum_dst(dst_flt), msg);

 #define LOAD_STORE_D(msg, load_instr, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride) \
 	START(); \
 		for(int i = 0; i < N; i += num_elems_stride) \
 			store_instr((store_ptr_type)dst_dbl+store_offset+i, load_instr(src_dbl+load_offset+i)); \
 	END(checksum_dst(dst_dbl), msg);

 #define LOAD_STORE_I(msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride) \
 	START(); \
 		for(int i = 0; i < N; i += num_elems_stride) \
 			store_instr((__m128i*)(dst_int+store_offset+i), load_instr((__m128i*)(src_int+load_offset+i))); \
 	END(checksum_dst(dst_int), msg);

 // load M64*, store M128
 #define LOAD_STORE_M64(msg, reg, load_instr, load_ptr_type, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride) \
 	START(); \
 		for(int i = 0; i < N; i += num_elems_stride) \
 			store_instr((store_ptr_type)dst_flt+store_offset+i, load_instr(reg, (load_ptr_type)(src_flt+load_offset+i))); \
 	END(checksum_dst(dst_flt), msg);

 #define LOAD_STORE_64_F(msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride) \
 	START(); \
 		for(int i = 0; i < N; i += num_elems_stride) \
 			store_instr((__m64*)(dst_flt+store_offset+i), load_instr(src_flt+load_offset+i)); \
 	END(checksum_dst(dst_flt), msg);

 #define LOAD_STORE_64_D(msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride) \
 	START(); \
 		for(int i = 0; i < N; i += num_elems_stride) \
 			store_instr((__m64*)(dst_dbl+store_offset+i), load_instr(src_dbl+load_offset+i)); \
 	END(checksum_dst(dst_dbl), msg);

 #define SET_STORE_F(msg, set_instr) \
 	START(); \
 		for(int i = 0; i < N; i += 4) \
 			_mm_store_ps(dst_flt+i, set_instr); \
 	END(checksum_dst(dst_flt), msg);

 #define SET_STORE_D(msg, set_instr) \
 	START(); \
 		for(int i = 0; i < N; i += 4) \
 			_mm_store_pd(dst_dbl+i, set_instr); \
 	END(checksum_dst(dst_dbl), msg);

 #define UNARYOP_F_F(msg, instr, op0) \
 	START(); \
 		__m128 o = op0; \
 		for(int i = 0; i < N; i += 4) \
 			o = instr(o); \
 		_mm_store_ps(dst_flt, o); \
 	END(checksum_dst(dst_flt), msg);

 #define UNARYOP_I_I(msg, instr, op0) \
 	START(); \
 		__m128 o = op0; \
 		for(int i = 0; i < N; i += 4) \
 			o = instr(o); \
 		_mm_store_si128((__m128i*)dst_int, o); \
 	END(checksum_dst(dst_int), msg);

 #define UNARYOP_i_F(msg, instr) \
 	START(); \
 		for(int i = 0; i < N; i += 4) \
 			dst_int_scalar += instr; \
 	END(dst_int_scalar, msg);

 #define UNARYOP_D_D(msg, instr, op0) \
 	START(); \
 		__m128d o = op0; \
 		for(int i = 0; i < N; i += 2) \
 			o = instr(o); \
 		_mm_store_pd(dst_dbl, o); \
 	END(checksum_dst(dst_dbl), msg);

 #define BINARYOP_F_FF(msg, instr, op0, op1) \
 	START(); \
 		__m128 o0 = op0; \
 		__m128 o1 = op1; \
 		for(int i = 0; i < N; i += 4) \
 			o0 = instr(o0, o1); \
 		_mm_store_ps(dst_flt, o0); \
 	END(checksum_dst(dst_flt), msg);

 #define BINARYOP_I_II(msg, instr, op0, op1) \
 	START(); \
 		__m128 o0 = op0; \
 		__m128 o1 = op1; \
 		for(int i = 0; i < N; i += 4) \
 			o0 = instr(o0, o1); \
 		_mm_store_si128((__m128i*)dst_int, o0); \
 	END(checksum_dst(dst_int), msg);

 #define BINARYOP_D_DD(msg, instr, op0, op1) \
 	START(); \
 		__m128d o0 = op0; \
 		__m128d o1 = op1; \
 		for(int i = 0; i < N; i += 2) \
 			o0 = instr(o0, o1); \
 		_mm_store_pd(dst_dbl, o0); \
 	END(checksum_dst(dst_dbl), msg);

 #define Max(a,b) ((a) >= (b) ? (a) : (b))
 #define Min(a,b) ((a) <= (b) ? (a) : (b))

 static INLINE int Isnan(float __f)
 {
   return (*(unsigned int*)&__f << 1) > 0xFF000000u;
 }
	#pragma once

	#include <stdio.h>
	#include <math.h>
	#include <time.h>
	#include <inttypes.h>
	#include <stdlib.h>
	#include <assert.h>

	#ifdef __EMSCRIPTEN__
	#include <emscripten/emscripten.h>
	#endif

	#if defined(__unix__) && !defined(__EMSCRIPTEN__) // Native build without Emscripten.
	#include <time.h>
	#include <errno.h>
	#include <string.h>
	#endif

	#ifdef __APPLE__
	#define aligned_alloc(align, size) malloc((size))
	#endif

	#ifdef WIN32
	#include <Windows.h>
	#define aligned_alloc(align, size) _aligned_malloc((size), (align))
	#endif

	// Scalar horizonal max across four lanes.
	float hmax(__m128 m)
	{
	float f[4];
	_mm_storeu_ps(f, m);
	return fmax(fmax(f[0], f[1]), fmax(f[2], f[3]));
	}

	#include "../tick.h"

	const int N = 810241024;

	tick_t scalarTotalTicks = 0;
	tick_t simdTotalTicks = 0;
	tick_t scalarTicks = 0;
	const char *chartName = "";
	#define SETCHART(x) chartName = (x);

	#define START() \
	do { \
	tick_t start = tick();

	bool comma=false;
	#define END(result, name) \
	tick_t end = tick(); \
	tick_t ticks = end - start; \
	scalarTotalTicks += scalarTicks; \
	simdTotalTicks += ticks; \
	double nsecs = (double)ticks * 1000.0 * 1000.0 * 1000.0 / ticks_per_sec() / N; \
	printf("%s{ \"chart\": \"%s\", \"category\": \"%s\", \"scalar\": %f, \"simd\": %f }\n", comma?",":"", chartName, name, scalarTime, nsecs); \
	comma = true; \
	printf("%s", (result) != 0 ? "Error!" : ""); \
	} while(0)

	#define ENDSCALAR(result, name) \
	tick_t end = tick(); \
	scalarTicks = end - start; \
	scalarTime = (double)scalarTicks * 1000.0 * 1000.0 * 1000.0 / ticks_per_sec() / N; \
	printf("%s", (result) != 0 ? "Error!" : ""); \
	} while(0)

	void Print(__m128 m)
	{
	float val[4];
	_mm_storeu_ps(val, m);
	fprintf(stderr, "[%g, %g, %g, %g]\n", val[3], val[2], val[1], val[0]);
	}

	bool always_true() { return time(NULL) != 0; } // This function always returns true, but the compiler should not know this.

	#ifdef _MSC_VER
	#define NOINLINE __declspec(noinline)
	#define INLINE __forceinline
	#else
	#define NOINLINE __attribute__((noinline))
	#define INLINE __inline__
	#endif

	// Slightly awkward way to allocate so that compiler will definitely not see this memory area as compile-time optimizable:
	int NOINLINE alloc_int_buffer() { return always_true() ? (int)aligned_alloc(16, (N+16)*sizeof(int)) : 0; }
	float NOINLINE alloc_float_buffer() { return always_true() ? (float)aligned_alloc(16, (N+16)*sizeof(float)) : 0; }
	double NOINLINE alloc_double_buffer() { return always_true() ? (double)aligned_alloc(16, (N+16)*sizeof(double)) : 0; }

	template<typename T>
	T checksum_dst(T *dst)
	{
	if (always_true()) return 0.f;
	else
	{
	T s = 0.f; for(int i = 0; i < N; ++i) s += dst[i];
	return s;
	}
	}

	uint32_t fcastu(float f) { return (uint32_t)&f; }
	uint64_t dcastu(double f) { return (uint64_t)&f; }
	float ucastf(uint32_t t) { return (float)&t; }
	double ucastd(uint64_t t) { return (double)&t; }

	#define LOAD_STORE_F(msg, load_instr, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride) \
	START(); \
	for(int i = 0; i < N; i += num_elems_stride) \
	store_instr((store_ptr_type)dst_flt+store_offset+i, load_instr(src_flt+load_offset+i)); \
	END(checksum_dst(dst_flt), msg);

	#define LOAD_STORE_D(msg, load_instr, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride) \
	START(); \
	for(int i = 0; i < N; i += num_elems_stride) \
	store_instr((store_ptr_type)dst_dbl+store_offset+i, load_instr(src_dbl+load_offset+i)); \
	END(checksum_dst(dst_dbl), msg);

	#define LOAD_STORE_I(msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride) \
	START(); \
	for(int i = 0; i < N; i += num_elems_stride) \
	store_instr((__m128i)(dst_int+store_offset+i), load_instr((__m128i)(src_int+load_offset+i))); \
	END(checksum_dst(dst_int), msg);

	// load M64*, store M128
	#define LOAD_STORE_M64(msg, reg, load_instr, load_ptr_type, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride) \
	START(); \
	for(int i = 0; i < N; i += num_elems_stride) \
	store_instr((store_ptr_type)dst_flt+store_offset+i, load_instr(reg, (load_ptr_type)(src_flt+load_offset+i))); \
	END(checksum_dst(dst_flt), msg);

	#define LOAD_STORE_64_F(msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride) \
	START(); \
	for(int i = 0; i < N; i += num_elems_stride) \
	store_instr((__m64*)(dst_flt+store_offset+i), load_instr(src_flt+load_offset+i)); \
	END(checksum_dst(dst_flt), msg);

	#define LOAD_STORE_64_D(msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride) \
	START(); \
	for(int i = 0; i < N; i += num_elems_stride) \
	store_instr((__m64*)(dst_dbl+store_offset+i), load_instr(src_dbl+load_offset+i)); \
	END(checksum_dst(dst_dbl), msg);

	#define SET_STORE_F(msg, set_instr) \
	START(); \
	for(int i = 0; i < N; i += 4) \
	_mm_store_ps(dst_flt+i, set_instr); \
	END(checksum_dst(dst_flt), msg);

	#define SET_STORE_D(msg, set_instr) \
	START(); \
	for(int i = 0; i < N; i += 4) \
	_mm_store_pd(dst_dbl+i, set_instr); \
	END(checksum_dst(dst_dbl), msg);

	#define UNARYOP_F_F(msg, instr, op0) \
	START(); \
	__m128 o = op0; \
	for(int i = 0; i < N; i += 4) \
	o = instr(o); \
	_mm_store_ps(dst_flt, o); \
	END(checksum_dst(dst_flt), msg);

	#define UNARYOP_I_I(msg, instr, op0) \
	START(); \
	__m128 o = op0; \
	for(int i = 0; i < N; i += 4) \
	o = instr(o); \
	_mm_store_si128((__m128i*)dst_int, o); \
	END(checksum_dst(dst_int), msg);

	#define UNARYOP_i_F(msg, instr) \
	START(); \
	for(int i = 0; i < N; i += 4) \
	dst_int_scalar += instr; \
	END(dst_int_scalar, msg);

	#define UNARYOP_D_D(msg, instr, op0) \
	START(); \
	__m128d o = op0; \
	for(int i = 0; i < N; i += 2) \
	o = instr(o); \
	_mm_store_pd(dst_dbl, o); \
	END(checksum_dst(dst_dbl), msg);

	#define BINARYOP_F_FF(msg, instr, op0, op1) \
	START(); \
	__m128 o0 = op0; \
	__m128 o1 = op1; \
	for(int i = 0; i < N; i += 4) \
	o0 = instr(o0, o1); \
	_mm_store_ps(dst_flt, o0); \
	END(checksum_dst(dst_flt), msg);

	#define BINARYOP_I_II(msg, instr, op0, op1) \
	START(); \
	__m128 o0 = op0; \
	__m128 o1 = op1; \
	for(int i = 0; i < N; i += 4) \
	o0 = instr(o0, o1); \
	_mm_store_si128((__m128i*)dst_int, o0); \
	END(checksum_dst(dst_int), msg);

	#define BINARYOP_D_DD(msg, instr, op0, op1) \
	START(); \
	__m128d o0 = op0; \
	__m128d o1 = op1; \
	for(int i = 0; i < N; i += 2) \
	o0 = instr(o0, o1); \
	_mm_store_pd(dst_dbl, o0); \
	END(checksum_dst(dst_dbl), msg);

	#define Max(a,b) ((a) >= (b) ? (a) : (b))
	#define Min(a,b) ((a) <= (b) ? (a) : (b))

	static INLINE int Isnan(float __f)
	{
	return ((unsigned int)&__f << 1) > 0xFF000000u;
	}