// blob: b8d4d50a7aae6598ebef7b5fecf4b3addf588988 [file] [log] [blame]
// Compute sin() in 4 lanes:
// Algorithm adapted from: http://gruntthepeon.free.fr/ssemath/
// Author: Peter Jensen
(function () {
// Describe the "Sine" kernel and register it with the benchmark harness.
// The harness receives the kernel's name, its init/cleanup sanity checks,
// the SIMD and non-SIMD implementations, and the iteration count.
benchmarks.add (new Benchmark ({
  kernelName:       "Sine",
  kernelInit:       init,
  kernelCleanup:    cleanup,
  kernelSimd:       simd,
  kernelNonSimd:    nonSimd,
  kernelIterations: 100000000
}));
// Kernel initializer (registered as kernelInit)
function init () {
  // Initial sanity check: run a single iteration of the SIMD kernel and
  // of the non-SIMD kernel and require that their results agree.
  // Any global data the kernels need should be set up here rather than
  // inside the kernels, which should create as few objects as possible.
  // When this check fails the kernels are not executed.
  // Returns:
  //   true:  the first (unoptimized) run of the kernels agreed
  //   false: the first (unoptimized) run of the kernels disagreed
  return almostEqual (simd (1), nonSimd (1));
}
// Kernel cleanup (registered as kernelCleanup)
function cleanup () {
  // Final sanity check. The harness calls this after all kernel
  // iterations have run, so the kernels should by now be in their
  // optimized form, whereas the check in init() most likely saw the
  // unoptimized form.
  // Returns:
  //   true:  the last (optimized) run of the kernels agreed
  //   false: the last (optimized) run of the kernels disagreed
  var vectorLanes = simd (1);
  var scalarLanes = nonSimd (1);
  var lanesAgree = almostEqual (vectorLanes, scalarLanes);
  return lanesAgree;
}
/**
 * Lane-by-lane approximate comparison of two 4-element kernel results.
 *
 * @param {Array<number>} a  First result; elements 0..3 are read.
 * @param {Array<number>} b  Second result; elements 0..3 are read.
 * @returns {boolean} true when every pair of corresponding elements
 *     differs by at most 0.00001, false otherwise.
 */
function almostEqual(a, b) {
  var tolerance = 0.00001;
  for (var i = 0; i < 4; ++i) {
    // Compare element i of each array (subtracting the arrays themselves
    // yields NaN). The test is written as a negated "<=" so that a NaN
    // difference is reported as a mismatch instead of slipping through.
    if (!(Math.abs (a[i] - b[i]) <= tolerance)) {
      return false;
    }
  }
  return true;
}
/**
 * Debug helper: prints a label followed by the four lanes of a Float32x4,
 * each formatted with six digits after the decimal point.
 *
 * @param msg  Label passed to print() as the first argument.
 * @param v    Value whose lanes 0..3 are read with SIMD.Float32x4.extractLane.
 */
function printFloat32x4(msg, v) {
  var formatted = [];
  for (var lane = 0; lane < 4; ++lane) {
    formatted.push (SIMD.Float32x4.extractLane (v, lane).toFixed (6));
  }
  print (msg, formatted[0], formatted[1], formatted[2], formatted[3]);
}
/**
 * Debug helper: prints a label followed by the four lanes of an Int32x4.
 *
 * The lanes are read with SIMD.Int32x4.extractLane so that the accessor
 * matches the type of the value being printed.
 *
 * @param msg  Label passed to print() as the first argument.
 * @param v    Int32x4 value whose lanes 0..3 are printed.
 */
function printInt32x4(msg, v) {
  print (msg, SIMD.Int32x4.extractLane(v, 0),
              SIMD.Int32x4.extractLane(v, 1),
              SIMD.Int32x4.extractLane(v, 2),
              SIMD.Int32x4.extractLane(v, 3));
}
/**
 * Manual check of sinx4(): prints the four lanes computed by sinx4() for
 * the input (1, 2, 3, 4), then prints Math.sin() of the same four inputs
 * so the two lines can be compared by eye.
 */
function sinx4Test() {
  var x = SIMD.Float32x4(1.0, 2.0, 3.0, 4.0);
  // The local is deliberately not named "sinx4": a var with that name
  // would shadow the sinx4() function inside this scope.
  var simdResult = sinx4(x);
  print (SIMD.Float32x4.extractLane(simdResult, 0),
         SIMD.Float32x4.extractLane(simdResult, 1),
         SIMD.Float32x4.extractLane(simdResult, 2),
         SIMD.Float32x4.extractLane(simdResult, 3));
  print (Math.sin(SIMD.Float32x4.extractLane(x, 0)),
         Math.sin(SIMD.Float32x4.extractLane(x, 1)),
         Math.sin(SIMD.Float32x4.extractLane(x, 2)),
         Math.sin(SIMD.Float32x4.extractLane(x, 3)));
}
// Constants used by sinx4(). Every value is splatted into all four lanes
// once, up front, so that sinx4() itself creates no constant vectors
// except the zero vector it compares against.

// Bit masks for the float sign: 0x80000000 has only bit 31 set; its
// complement has every bit except bit 31 set.
var _ps_sign_mask = SIMD.Int32x4.splat(0x80000000);
var _ps_inv_sign_mask = SIMD.Int32x4.not(_ps_sign_mask);
// Scale factor applied to |x| before conversion to integer;
// numerically equal to 4/pi.
var _ps_cephes_FOPI = SIMD.Float32x4.splat(1.27323954473516);
// Small integer constants used for the integer arithmetic and bit tests
// on the scaled argument. _pi32_inv1 is ~1, i.e. all bits set except bit 0.
var _pi32_1 = SIMD.Int32x4.splat(1);
var _pi32_inv1 = SIMD.Int32x4.not(_pi32_1);
var _pi32_4 = SIMD.Int32x4.splat(4);
var _pi32_2 = SIMD.Int32x4.splat(2);
// Three negative constants whose magnitudes add up to pi/4. sinx4()
// multiplies each by the integer multiple and adds the products to x
// one after another.
var _ps_minus_cephes_DP1 = SIMD.Float32x4.splat(-0.78515625);
var _ps_minus_cephes_DP2 = SIMD.Float32x4.splat(-2.4187564849853515625E-4);
var _ps_minus_cephes_DP3 = SIMD.Float32x4.splat(-3.77489497744594108E-8);
// Coefficients of the first polynomial evaluated in sinx4() (the
// "coscof" one), used together with the 0.5 and 1.0 constants below.
var _ps_coscof_p0 = SIMD.Float32x4.splat(2.443315711809948E-5);
var _ps_coscof_p1 = SIMD.Float32x4.splat(-1.388731625493765E-3);
var _ps_coscof_p2 = SIMD.Float32x4.splat(4.166664568298827E-2);
var _ps_0p5 = SIMD.Float32x4.splat(0.5);
var _ps_1 = SIMD.Float32x4.splat(1.0);
// Coefficients of the second polynomial evaluated in sinx4() (the
// "sincof" one).
var _ps_sincof_p0 = SIMD.Float32x4.splat(-1.9515295891E-4);
var _ps_sincof_p1 = SIMD.Float32x4.splat(8.3321608736E-3);
var _ps_sincof_p2 = SIMD.Float32x4.splat(-1.6666654611E-1);
/**
 * Computes the sine of each of the four lanes of a Float32x4.
 *
 * Outline of the steps below:
 *   1. Split x into its sign bit and its magnitude.
 *   2. Scale the magnitude by 4/pi, convert to an integer j and force j
 *      to be even.
 *   3. Use bits of j to decide (a) whether the result sign is flipped and
 *      (b) which of two polynomials supplies the result for that lane.
 *   4. Subtract j * pi/4 from the magnitude, using three partial constants.
 *   5. Evaluate both polynomials on the reduced argument, pick one per
 *      lane with a bit mask, and apply the sign.
 *
 * @param x  SIMD.Float32x4 input.
 * @returns  SIMD.Float32x4 with the per-lane result.
 */
function sinx4 (x) {
var xmm1;
var xmm2;
var xmm3;
var sign_bit;
var swap_sign_bit;
var poly_mask;
var y;
var y2;
var z;
var tmp;
var emm0;
var emm2;
// Step 1: x becomes |x| (sign bit cleared); sign_bit keeps only bit 31
// of the original input.
sign_bit = x;
x = SIMD.Float32x4.fromInt32x4Bits(SIMD.Int32x4.and(SIMD.Int32x4.fromFloat32x4Bits(x), _ps_inv_sign_mask));
sign_bit = SIMD.Float32x4.fromInt32x4Bits(SIMD.Int32x4.and(SIMD.Int32x4.fromFloat32x4Bits(sign_bit), _ps_sign_mask));
// Step 2: y = |x| * 4/pi, converted to int32 lanes.
y = SIMD.Float32x4.mul(x, _ps_cephes_FOPI);
//printFloat32x4 ("Probe 6", y);
emm2 = SIMD.Int32x4.fromFloat32x4(y);
// j = (j + 1) & ~1: odd values are bumped to the next even value,
// even values are left unchanged.
emm2 = SIMD.Int32x4.add(emm2, _pi32_1);
emm2 = SIMD.Int32x4.and(emm2, _pi32_inv1);
//printInt32x4 ("Probe 8", emm2);
// y now holds j as a float, for the argument reduction in step 4.
y = SIMD.Float32x4.fromInt32x4(emm2);
//printFloat32x4 ("Probe 7", y);
// Step 3a: isolate bit 2 of j (value 4) and shift it left by 29 so it
// lands on bit 31, the position of the float sign bit.
emm0 = SIMD.Int32x4.and(emm2, _pi32_4);
emm0 = SIMD.Int32x4.shiftLeftByScalar(emm0, 29);
// Step 3b: lanes where bit 1 of j (value 2) is clear are marked in the
// polynomial-selection mask.
// NOTE(review): the code below reinterprets the result of equal() as
// Int32x4 bits, so it assumes equal() yields an integer lane mask
// (all ones / all zeros) - confirm against the SIMD implementation in use.
emm2 = SIMD.Int32x4.and(emm2, _pi32_2);
emm2 = SIMD.Int32x4.equal(emm2, SIMD.Int32x4.splat(0));
swap_sign_bit = SIMD.Float32x4.fromInt32x4Bits(emm0);
poly_mask = SIMD.Float32x4.fromInt32x4Bits(emm2);
// Final sign = sign of the input XOR the swap flag from step 3a.
sign_bit = SIMD.Float32x4.fromInt32x4Bits(SIMD.Int32x4.xor(SIMD.Int32x4.fromFloat32x4Bits(sign_bit), SIMD.Int32x4.fromFloat32x4Bits(swap_sign_bit)));
//printFloat32x4 ("Probe 1", sign_bit);
//printFloat32x4 ("Probe 4", y);
//printFloat32x4 ("Probe 5", _ps_minus_cephes_DP1);
// Step 4: x = x + y*(-DP1) + y*(-DP2) + y*(-DP3). The three constants
// sum to pi/4 in magnitude, so this subtracts j * pi/4 in three parts.
xmm1 = SIMD.Float32x4.mul(y, _ps_minus_cephes_DP1);
//printFloat32x4 ("Probe 3", xmm1);
xmm2 = SIMD.Float32x4.mul(y, _ps_minus_cephes_DP2);
xmm3 = SIMD.Float32x4.mul(y, _ps_minus_cephes_DP3);
x = SIMD.Float32x4.add(x, xmm1);
x = SIMD.Float32x4.add(x, xmm2);
x = SIMD.Float32x4.add(x, xmm3);
//printFloat32x4 ("Probe 2", x);
// Step 5a: first polynomial, with z = x*x:
//   y = ((p0*z + p1)*z + p2) * z*z - 0.5*z + 1
y = _ps_coscof_p0;
z = SIMD.Float32x4.mul(x, x);
y = SIMD.Float32x4.mul(y, z);
y = SIMD.Float32x4.add(y, _ps_coscof_p1);
y = SIMD.Float32x4.mul(y, z);
y = SIMD.Float32x4.add(y, _ps_coscof_p2);
y = SIMD.Float32x4.mul(y, z);
y = SIMD.Float32x4.mul(y, z);
tmp = SIMD.Float32x4.mul(z, _ps_0p5);
y = SIMD.Float32x4.sub(y, tmp);
y = SIMD.Float32x4.add(y, _ps_1);
// Step 5b: second polynomial:
//   y2 = ((s0*z + s1)*z + s2) * z * x + x
y2 = _ps_sincof_p0;
//printFloat32x4 ("Probe 11", y2);
//printFloat32x4 ("Probe 12", z);
y2 = SIMD.Float32x4.mul(y2, z);
y2 = SIMD.Float32x4.add(y2, _ps_sincof_p1);
//printFloat32x4 ("Probe 13", y2);
y2 = SIMD.Float32x4.mul(y2, z);
y2 = SIMD.Float32x4.add(y2, _ps_sincof_p2);
y2 = SIMD.Float32x4.mul(y2, z);
y2 = SIMD.Float32x4.mul(y2, x);
y2 = SIMD.Float32x4.add(y2, x);
// Step 5c: per-lane select. y2 is kept where the mask is set, y where it
// is clear; the other operand is zeroed, so the add merges the two.
xmm3 = poly_mask;
y2 = SIMD.Float32x4.fromInt32x4Bits(SIMD.Int32x4.and(SIMD.Int32x4.fromFloat32x4Bits(xmm3), SIMD.Int32x4.fromFloat32x4Bits(y2)));
//printFloat32x4 ("Probe 10", y2);
y = SIMD.Float32x4.fromInt32x4Bits(SIMD.Int32x4.and(SIMD.Int32x4.not(SIMD.Int32x4.fromFloat32x4Bits(xmm3)), SIMD.Int32x4.fromFloat32x4Bits(y)));
y = SIMD.Float32x4.add(y, y2);
//printFloat32x4 ("Probe 9", y);
// Apply the sign computed in step 3 by XOR-ing it into bit 31.
y = SIMD.Float32x4.fromInt32x4Bits(SIMD.Int32x4.xor(SIMD.Int32x4.fromFloat32x4Bits(y), SIMD.Int32x4.fromFloat32x4Bits(sign_bit)));
return y;
}
// Fixed kernel inputs: the same four values, once as a Float32x4 for the
// SIMD kernel and once as a plain array for the non-SIMD kernel.
var simdInput = SIMD.Float32x4 (1.0, 2.0, 3.0, 4.0);
var nonSimdInput = [1.0, 2.0, 3.0, 4.0];
// SIMD kernel: evaluates sinx4() on simdInput n times and returns the
// four lanes of the last result as a plain array [lane0, lane1, lane2, lane3].
function simd (n) {
  var lastResult;
  for (var iteration = 0; iteration < n; ++iteration) {
    lastResult = sinx4 (simdInput);
  }
  var lanes = [];
  for (var lane = 0; lane < 4; ++lane) {
    lanes.push (SIMD.Float32x4.extractLane (lastResult, lane));
  }
  return lanes;
}
// Non-SIMD kernel: evaluates Math.sin() on each of the four values in
// nonSimdInput n times and returns the last results as
// [sin(x), sin(y), sin(z), sin(w)]. When n is 0 the loop never runs and
// the four entries are undefined.
function nonSimd (n) {
  var x = nonSimdInput[0];
  var y = nonSimdInput[1];
  var z = nonSimdInput[2];
  var w = nonSimdInput[3];
  var rx, ry, rz, rw;
  // The four calls are repeated on every iteration on purpose: this loop
  // is the work being measured, mirroring the loop in the SIMD kernel.
  for (var i = 0; i < n; ++i) {
    rx = Math.sin(x);
    ry = Math.sin(y);
    rz = Math.sin(z);
    rw = Math.sin(w);
  }
  return [rx, ry, rz, rw];
}
} ());