blob: 6c7a220801ef7eabc389ce6bb0ee155ac25a5b1e [file] [log] [blame] [edit]
// Kernel for doing a 4x4 Matrix Inverse operation
// Based on Cramer's rule.
// See: ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
// Author: Peter Jensen
(function () {
// Kernel configuration
var kernelConfig = {
kernelName: "Matrix4x4Inverse",
kernelInit: init,
kernelCleanup: cleanup,
kernelSimd: simdMatrixInverseN,
kernelNonSimd: nonSimdMatrixInverseN,
kernelIterations: 1000
};
// Hook up to the harness
benchmarks.add (new Benchmark (kernelConfig));
// Global Variables
var src = new Float32Array(16); // Source matrix
var dst = new Float32Array(16); // Result matrix
var tsrc = new Float32Array(16); // Transposed version of 'src'
var tmp = new Float32Array(12); // Temporary array of multiply results
var ident = new Float32Array(
[1,0,0,0,
0,1,0,0,
0,0,1,0,
0,0,0,1]);
function printMatrix(matrix) {
for (var r = 0; r < 4; ++r) {
var str = "";
var ri = r*4;
for (var c = 0; c < 4; ++c) {
var value = matrix[ri + c];
str += " " + value.toFixed(2);
}
print(str);
}
}
function initMatrix(matrix) {
// These values were chosen somewhat randomly, but they will at least yield a solution.
matrix [0] = 0; matrix[1] = 1; matrix[2] = 2; matrix[3] = 3;
matrix [4] = -1; matrix[5] = -2; matrix[6] = -3; matrix[7] = -4;
matrix [8] = 0; matrix[9] = 0; matrix[10] = 2; matrix[11] = 3;
matrix [12] = -1; matrix[13] = -2; matrix[14] = 0; matrix[15] = -4;
}
function mulMatrix(dst, op1, op2) {
for (var r = 0; r < 4; ++r) {
for (var c = 0; c < 4; ++c) {
var ri = 4*r;
dst[ri + c] = op1[ri]*op2[c] + op1[ri+1]*op2[c+4] + op1[ri+2]*op2[c+8] + op1[ri+3]*op2[c+12]
}
}
}
function checkMatrix(matrix) {
// when multiplied with the src matrix it should yield the identity matrix
mulMatrix(tsrc, src, matrix);
for (var i = 0; i < 16; ++i) {
if (Math.abs (tsrc[i] - ident[i]) > 0.00001) {
return false;
}
}
// printMatrix (tsrc);
return true;
}
// Kernel Initializer
function init() {
initMatrix(src);
// printMatrix(src);
nonSimdMatrixInverseN(1);
// printMatrix(dst);
if (!checkMatrix(dst)) {
return false;
}
initMatrix(src);
simdMatrixInverseN(1);
// printMatrix(dst);
if (!checkMatrix(dst)) {
return false;
}
return true;
}
function cleanup() {
return init();
}
function simdMatrixInverse() {
var src0, src1, src2, src3;
var row0, row1, row2, row3;
var tmp1;
var minor0, minor1, minor2, minor3;
var det;
// Load the 4 rows
var src0 = SIMD.Float32x4.load(src, 0);
var src1 = SIMD.Float32x4.load(src, 4);
var src2 = SIMD.Float32x4.load(src, 8);
var src3 = SIMD.Float32x4.load(src, 16);
// Transpose the source matrix. Sort of. Not a true transpose operation
tmp1 = SIMD.Float32x4.shuffle(src0, src1, 0, 1, 4, 5);
row1 = SIMD.Float32x4.shuffle(src2, src3, 0, 1, 4, 5);
row0 = SIMD.Float32x4.shuffle(tmp1, row1, 0, 2, 4, 6);
row1 = SIMD.Float32x4.shuffle(row1, tmp1, 1, 3, 5, 7);
tmp1 = SIMD.Float32x4.shuffle(src0, src1, 2, 3, 6, 7);
row3 = SIMD.Float32x4.shuffle(src2, src3, 2, 3, 6, 7);
row2 = SIMD.Float32x4.shuffle(tmp1, row3, 0, 2, 4, 6);
row3 = SIMD.Float32x4.shuffle(row3, tmp1, 1, 3, 5, 7);
// This is a true transposition, but it will lead to an incorrect result
//tmp1 = SIMD.Float32x4.shuffle(src0, src1, 0, 1, 4, 5);
//tmp2 = SIMD.Float32x4.shuffle(src2, src3, 0, 1, 4, 5);
//row0 = SIMD.Float32x4.shuffle(tmp1, tmp2, 0, 2, 4, 6);
//row1 = SIMD.Float32x4.shuffle(tmp1, tmp2, 1, 3, 5, 7);
//tmp1 = SIMD.Float32x4.shuffle(src0, src1, 2, 3, 6, 7);
//tmp2 = SIMD.Float32x4.shuffle(src2, src3, 2, 3, 6, 7);
//row2 = SIMD.Float32x4.shuffle(tmp1, tmp2, 0, 2, 4, 6);
//row3 = SIMD.Float32x4.shuffle(tmp1, tmp2, 1, 3, 5, 7);
// ----
tmp1 = SIMD.Float32x4.mul(row2, row3);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 1, 0, 3, 2); // 0xB1 = 10110001
minor0 = SIMD.Float32x4.mul(row1, tmp1);
minor1 = SIMD.Float32x4.mul(row0, tmp1);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 2, 3, 0, 1); // 0x4E = 01001110
minor0 = SIMD.Float32x4.sub(SIMD.Float32x4.mul(row1, tmp1), minor0);
minor1 = SIMD.Float32x4.sub(SIMD.Float32x4.mul(row0, tmp1), minor1);
minor1 = SIMD.Float32x4.swizzle(minor1, 2, 3, 0, 1); // 0x4E = 01001110
// ----
tmp1 = SIMD.Float32x4.mul(row1, row2);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 1, 0, 3, 2); // 0xB1 = 10110001
minor0 = SIMD.Float32x4.add(SIMD.Float32x4.mul(row3, tmp1), minor0);
minor3 = SIMD.Float32x4.mul(row0, tmp1);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 2, 3, 0, 1); // 0x4E = 01001110
minor0 = SIMD.Float32x4.sub(minor0, SIMD.Float32x4.mul(row3, tmp1));
minor3 = SIMD.Float32x4.sub(SIMD.Float32x4.mul(row0, tmp1), minor3);
minor3 = SIMD.Float32x4.swizzle(minor3, 2, 3, 0, 1); // 0x4E = 01001110
// ----
tmp1 = SIMD.Float32x4.mul(SIMD.Float32x4.swizzle(row1, 2, 3, 0, 1), row3); // 0x4E = 01001110
tmp1 = SIMD.Float32x4.swizzle(tmp1, 1, 0, 3, 2); // 0xB1 = 10110001
row2 = SIMD.Float32x4.swizzle(row2, 2, 3, 0, 1); // 0x4E = 01001110
minor0 = SIMD.Float32x4.add(SIMD.Float32x4.mul(row2, tmp1), minor0);
minor2 = SIMD.Float32x4.mul(row0, tmp1);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 2, 3, 0, 1); // 0x4E = 01001110
minor0 = SIMD.Float32x4.sub(minor0, SIMD.Float32x4.mul(row2, tmp1));
minor2 = SIMD.Float32x4.sub(SIMD.Float32x4.mul(row0, tmp1), minor2);
minor2 = SIMD.Float32x4.swizzle(minor2, 2, 3, 0, 1); // 0x4E = 01001110
// ----
tmp1 = SIMD.Float32x4.mul(row0, row1);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 1, 0, 3, 2); // 0xB1 = 10110001
minor2 = SIMD.Float32x4.add(SIMD.Float32x4.mul(row3, tmp1), minor2);
minor3 = SIMD.Float32x4.sub(SIMD.Float32x4.mul(row2, tmp1), minor3);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 2, 3, 0, 1); // 0x4E = 01001110
minor2 = SIMD.Float32x4.sub(SIMD.Float32x4.mul(row3, tmp1), minor2);
minor3 = SIMD.Float32x4.sub(minor3, SIMD.Float32x4.mul(row2, tmp1));
// ----
tmp1 = SIMD.Float32x4.mul(row0, row3);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 1, 0, 3, 2); // 0xB1 = 10110001
minor1 = SIMD.Float32x4.sub(minor1, SIMD.Float32x4.mul(row2, tmp1));
minor2 = SIMD.Float32x4.add(SIMD.Float32x4.mul(row1, tmp1), minor2);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 2, 3, 0, 1); // 0x4E = 01001110
minor1 = SIMD.Float32x4.add(SIMD.Float32x4.mul(row2, tmp1), minor1);
minor2 = SIMD.Float32x4.sub(minor2, SIMD.Float32x4.mul(row1, tmp1));
// ----
tmp1 = SIMD.Float32x4.mul(row0, row2);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 1, 0, 3, 2); // 0xB1 = 10110001
minor1 = SIMD.Float32x4.add(SIMD.Float32x4.mul(row3, tmp1), minor1);
minor3 = SIMD.Float32x4.sub(minor3, SIMD.Float32x4.mul(row1, tmp1));
tmp1 = SIMD.Float32x4.swizzle(tmp1, 2, 3, 0, 1); // 0x4E = 01001110
minor1 = SIMD.Float32x4.sub(minor1, SIMD.Float32x4.mul(row3, tmp1));
minor3 = SIMD.Float32x4.add(SIMD.Float32x4.mul(row1, tmp1), minor3);
// Compute determinant
det = SIMD.Float32x4.mul(row0, minor0);
det = SIMD.Float32x4.add(SIMD.Float32x4.swizzle(det, 2, 3, 0, 1), det); // 0x4E = 01001110
det = SIMD.Float32x4.add(SIMD.Float32x4.swizzle(det, 1, 0, 3, 2), det); // 0xB1 = 10110001
tmp1 = SIMD.Float32x4.reciprocalApproximation(det);
det = SIMD.Float32x4.sub(SIMD.Float32x4.add(tmp1, tmp1), SIMD.Float32x4.mul(det, SIMD.Float32x4.mul(tmp1, tmp1)));
det = SIMD.Float32x4.swizzle(det, 0, 0, 0, 0);
// These shuffles aren't necessary if the faulty transposition is done
// up at the top of this function.
//minor0 = SIMD.Float32x4.swizzle(minor0, 2, 1, 0, 3);
//minor1 = SIMD.Float32x4.swizzle(minor1, 2, 1, 0, 3);
//minor2 = SIMD.Float32x4.swizzle(minor2, 2, 1, 0, 3);
//minor3 = SIMD.Float32x4.swizzle(minor3, 2, 1, 0, 3);
// Compute final values by multiplying with 1/det
minor0 = SIMD.Float32x4.mul(det, minor0);
minor1 = SIMD.Float32x4.mul(det, minor1);
minor2 = SIMD.Float32x4.mul(det, minor2);
minor3 = SIMD.Float32x4.mul(det, minor3);
SIMD.Float32x4.store(dst, 0, minor0);
SIMD.Float32x4.store(dst, 4, minor1);
SIMD.Float32x4.store(dst, 8, minor2);
SIMD.Float32x4.store(dst, 12, minor3);
}
function nonSimdMatrixInverse() {
// Transpose the source matrix
for (var i = 0; i < 4; i++) {
tsrc[i] = src[i*4];
tsrc[i + 4] = src[i*4 + 1];
tsrc[i + 8] = src[i*4 + 2];
tsrc[i + 12] = src[i*4 + 3];
}
// Calculate pairs for first 8 elements (cofactors)
tmp[0] = tsrc[10] * tsrc[15];
tmp[1] = tsrc[11] * tsrc[14];
tmp[2] = tsrc[9] * tsrc[15];
tmp[3] = tsrc[11] * tsrc[13];
tmp[4] = tsrc[9] * tsrc[14];
tmp[5] = tsrc[10] * tsrc[13];
tmp[6] = tsrc[8] * tsrc[15];
tmp[7] = tsrc[11] * tsrc[12];
tmp[8] = tsrc[8] * tsrc[14];
tmp[9] = tsrc[10] * tsrc[12];
tmp[10] = tsrc[8] * tsrc[13];
tmp[11] = tsrc[9] * tsrc[12];
// calculate first 8 elements (cofactors)
dst[0] = tmp[0]*tsrc[5] + tmp[3]*tsrc[6] + tmp[4]*tsrc[7];
dst[0] -= tmp[1]*tsrc[5] + tmp[2]*tsrc[6] + tmp[5]*tsrc[7];
dst[1] = tmp[1]*tsrc[4] + tmp[6]*tsrc[6] + tmp[9]*tsrc[7];
dst[1] -= tmp[0]*tsrc[4] + tmp[7]*tsrc[6] + tmp[8]*tsrc[7];
dst[2] = tmp[2]*tsrc[4] + tmp[7]*tsrc[5] + tmp[10]*tsrc[7];
dst[2] -= tmp[3]*tsrc[4] + tmp[6]*tsrc[5] + tmp[11]*tsrc[7];
dst[3] = tmp[5]*tsrc[4] + tmp[8]*tsrc[5] + tmp[11]*tsrc[6];
dst[3] -= tmp[4]*tsrc[4] + tmp[9]*tsrc[5] + tmp[10]*tsrc[6];
dst[4] = tmp[1]*tsrc[1] + tmp[2]*tsrc[2] + tmp[5]*tsrc[3];
dst[4] -= tmp[0]*tsrc[1] + tmp[3]*tsrc[2] + tmp[4]*tsrc[3];
dst[5] = tmp[0]*tsrc[0] + tmp[7]*tsrc[2] + tmp[8]*tsrc[3];
dst[5] -= tmp[1]*tsrc[0] + tmp[6]*tsrc[2] + tmp[9]*tsrc[3];
dst[6] = tmp[3]*tsrc[0] + tmp[6]*tsrc[1] + tmp[11]*tsrc[3];
dst[6] -= tmp[2]*tsrc[0] + tmp[7]*tsrc[1] + tmp[10]*tsrc[3];
dst[7] = tmp[4]*tsrc[0] + tmp[9]*tsrc[1] + tmp[10]*tsrc[2];
dst[7] -= tmp[5]*tsrc[0] + tmp[8]*tsrc[1] + tmp[11]*tsrc[2];
// calculate pairs for second 8 elements (cofactors)
tmp[0] = tsrc[2]*tsrc[7];
tmp[1] = tsrc[3]*tsrc[6];
tmp[2] = tsrc[1]*tsrc[7];
tmp[3] = tsrc[3]*tsrc[5];
tmp[4] = tsrc[1]*tsrc[6];
tmp[5] = tsrc[2]*tsrc[5];
tmp[6] = tsrc[0]*tsrc[7];
tmp[7] = tsrc[3]*tsrc[4];
tmp[8] = tsrc[0]*tsrc[6];
tmp[9] = tsrc[2]*tsrc[4];
tmp[10] = tsrc[0]*tsrc[5];
tmp[11] = tsrc[1]*tsrc[4];
// calculate second 8 elements (cofactors)
dst[8] = tmp[0]*tsrc[13] + tmp[3]*tsrc[14] + tmp[4]*tsrc[15];
dst[8] -= tmp[1]*tsrc[13] + tmp[2]*tsrc[14] + tmp[5]*tsrc[15];
dst[9] = tmp[1]*tsrc[12] + tmp[6]*tsrc[14] + tmp[9]*tsrc[15];
dst[9] -= tmp[0]*tsrc[12] + tmp[7]*tsrc[14] + tmp[8]*tsrc[15];
dst[10] = tmp[2]*tsrc[12] + tmp[7]*tsrc[13] + tmp[10]*tsrc[15];
dst[10]-= tmp[3]*tsrc[12] + tmp[6]*tsrc[13] + tmp[11]*tsrc[15];
dst[11] = tmp[5]*tsrc[12] + tmp[8]*tsrc[13] + tmp[11]*tsrc[14];
dst[11]-= tmp[4]*tsrc[12] + tmp[9]*tsrc[13] + tmp[10]*tsrc[14];
dst[12] = tmp[2]*tsrc[10] + tmp[5]*tsrc[11] + tmp[1]*tsrc[9];
dst[12]-= tmp[4]*tsrc[11] + tmp[0]*tsrc[9] + tmp[3]*tsrc[10];
dst[13] = tmp[8]*tsrc[11] + tmp[0]*tsrc[8] + tmp[7]*tsrc[10];
dst[13]-= tmp[6]*tsrc[10] + tmp[9]*tsrc[11] + tmp[1]*tsrc[8];
dst[14] = tmp[6]*tsrc[9] + tmp[11]*tsrc[11] + tmp[3]*tsrc[8];
dst[14]-= tmp[10]*tsrc[11] + tmp[2]*tsrc[8] + tmp[7]*tsrc[9];
dst[15] = tmp[10]*tsrc[10] + tmp[4]*tsrc[8] + tmp[9]*tsrc[9];
dst[15]-= tmp[8]*tsrc[9] + tmp[11]*tsrc[10] + tmp[5]*tsrc[8];
// calculate determinant
var det = tsrc[0]*dst[0] + tsrc[1]*dst[1] + tsrc[2]*dst[2] + tsrc[3]*dst[3];
// calculate matrix inverse
det = 1/det;
for (var j = 0; j < 16; j++) {
dst[j] *= det;
}
}
// SIMD version of the kernel
function simdMatrixInverseN(n) {
for (var iterations = 0; iterations < n; ++iterations) {
var src0, src1, src2, src3;
var row0, row1, row2, row3;
var tmp1;
var minor0, minor1, minor2, minor3;
var det;
// Load the 4 rows
var src0 = SIMD.Float32x4.load(src, 0);
var src1 = SIMD.Float32x4.load(src, 4);
var src2 = SIMD.Float32x4.load(src, 8);
var src3 = SIMD.Float32x4.load(src, 12);
// Transpose the source matrix. Sort of. Not a true transpose operation
tmp1 = SIMD.Float32x4.shuffle(src0, src1, 0, 1, 4, 5);
row1 = SIMD.Float32x4.shuffle(src2, src3, 0, 1, 4, 5);
row0 = SIMD.Float32x4.shuffle(tmp1, row1, 0, 2, 4, 6);
row1 = SIMD.Float32x4.shuffle(row1, tmp1, 1, 3, 5, 7);
tmp1 = SIMD.Float32x4.shuffle(src0, src1, 2, 3, 6, 7);
row3 = SIMD.Float32x4.shuffle(src2, src3, 2, 3, 6, 7);
row2 = SIMD.Float32x4.shuffle(tmp1, row3, 0, 2, 4, 6);
row3 = SIMD.Float32x4.shuffle(row3, tmp1, 1, 3, 5, 7);
// This is a true transposition, but it will lead to an incorrect result
//tmp1 = SIMD.Float32x4.shuffle(src0, src1, 0, 1, 4, 5);
//tmp2 = SIMD.Float32x4.shuffle(src2, src3, 0, 1, 4, 5);
//row0 = SIMD.Float32x4.shuffle(tmp1, tmp2, 0, 2, 4, 6);
//row1 = SIMD.Float32x4.shuffle(tmp1, tmp2, 1, 3, 5, 7);
//tmp1 = SIMD.Float32x4.shuffle(src0, src1, 2, 3, 6, 7);
//tmp2 = SIMD.Float32x4.shuffle(src2, src3, 2, 3, 6, 7);
//row2 = SIMD.Float32x4.shuffle(tmp1, tmp2, 0, 2, 4, 6);
//row3 = SIMD.Float32x4.shuffle(tmp1, tmp2, 1, 3, 5, 7);
// ----
tmp1 = SIMD.Float32x4.mul(row2, row3);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 1, 0, 3, 2); // 0xB1 = 10110001
minor0 = SIMD.Float32x4.mul(row1, tmp1);
minor1 = SIMD.Float32x4.mul(row0, tmp1);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 2, 3, 0, 1); // 0x4E = 01001110
minor0 = SIMD.Float32x4.sub(SIMD.Float32x4.mul(row1, tmp1), minor0);
minor1 = SIMD.Float32x4.sub(SIMD.Float32x4.mul(row0, tmp1), minor1);
minor1 = SIMD.Float32x4.swizzle(minor1, 2, 3, 0, 1); // 0x4E = 01001110
// ----
tmp1 = SIMD.Float32x4.mul(row1, row2);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 1, 0, 3, 2); // 0xB1 = 10110001
minor0 = SIMD.Float32x4.add(SIMD.Float32x4.mul(row3, tmp1), minor0);
minor3 = SIMD.Float32x4.mul(row0, tmp1);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 2, 3, 0, 1); // 0x4E = 01001110
minor0 = SIMD.Float32x4.sub(minor0, SIMD.Float32x4.mul(row3, tmp1));
minor3 = SIMD.Float32x4.sub(SIMD.Float32x4.mul(row0, tmp1), minor3);
minor3 = SIMD.Float32x4.swizzle(minor3, 2, 3, 0, 1); // 0x4E = 01001110
// ----
tmp1 = SIMD.Float32x4.mul(SIMD.Float32x4.swizzle(row1, 2, 3, 0, 1), row3); // 0x4E = 01001110
tmp1 = SIMD.Float32x4.swizzle(tmp1, 1, 0, 3, 2); // 0xB1 = 10110001
row2 = SIMD.Float32x4.swizzle(row2, 2, 3, 0, 1); // 0x4E = 01001110
minor0 = SIMD.Float32x4.add(SIMD.Float32x4.mul(row2, tmp1), minor0);
minor2 = SIMD.Float32x4.mul(row0, tmp1);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 2, 3, 0, 1); // 0x4E = 01001110
minor0 = SIMD.Float32x4.sub(minor0, SIMD.Float32x4.mul(row2, tmp1));
minor2 = SIMD.Float32x4.sub(SIMD.Float32x4.mul(row0, tmp1), minor2);
minor2 = SIMD.Float32x4.swizzle(minor2, 2, 3, 0, 1); // 0x4E = 01001110
// ----
tmp1 = SIMD.Float32x4.mul(row0, row1);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 1, 0, 3, 2); // 0xB1 = 10110001
minor2 = SIMD.Float32x4.add(SIMD.Float32x4.mul(row3, tmp1), minor2);
minor3 = SIMD.Float32x4.sub(SIMD.Float32x4.mul(row2, tmp1), minor3);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 2, 3, 0, 1); // 0x4E = 01001110
minor2 = SIMD.Float32x4.sub(SIMD.Float32x4.mul(row3, tmp1), minor2);
minor3 = SIMD.Float32x4.sub(minor3, SIMD.Float32x4.mul(row2, tmp1));
// ----
tmp1 = SIMD.Float32x4.mul(row0, row3);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 1, 0, 3, 2); // 0xB1 = 10110001
minor1 = SIMD.Float32x4.sub(minor1, SIMD.Float32x4.mul(row2, tmp1));
minor2 = SIMD.Float32x4.add(SIMD.Float32x4.mul(row1, tmp1), minor2);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 2, 3, 0, 1); // 0x4E = 01001110
minor1 = SIMD.Float32x4.add(SIMD.Float32x4.mul(row2, tmp1), minor1);
minor2 = SIMD.Float32x4.sub(minor2, SIMD.Float32x4.mul(row1, tmp1));
// ----
tmp1 = SIMD.Float32x4.mul(row0, row2);
tmp1 = SIMD.Float32x4.swizzle(tmp1, 1, 0, 3, 2); // 0xB1 = 10110001
minor1 = SIMD.Float32x4.add(SIMD.Float32x4.mul(row3, tmp1), minor1);
minor3 = SIMD.Float32x4.sub(minor3, SIMD.Float32x4.mul(row1, tmp1));
tmp1 = SIMD.Float32x4.swizzle(tmp1, 2, 3, 0, 1); // 0x4E = 01001110
minor1 = SIMD.Float32x4.sub(minor1, SIMD.Float32x4.mul(row3, tmp1));
minor3 = SIMD.Float32x4.add(SIMD.Float32x4.mul(row1, tmp1), minor3);
// Compute determinant
det = SIMD.Float32x4.mul(row0, minor0);
det = SIMD.Float32x4.add(SIMD.Float32x4.swizzle(det, 2, 3, 0, 1), det); // 0x4E = 01001110
det = SIMD.Float32x4.add(SIMD.Float32x4.swizzle(det, 1, 0, 3, 2), det); // 0xB1 = 10110001
tmp1 = SIMD.Float32x4.reciprocalApproximation(det);
det = SIMD.Float32x4.sub(SIMD.Float32x4.add(tmp1, tmp1), SIMD.Float32x4.mul(det, SIMD.Float32x4.mul(tmp1, tmp1)));
det = SIMD.Float32x4.swizzle(det, 0, 0, 0, 0);
// These shuffles aren't necessary if the faulty transposition is done
// up at the top of this function.
//minor0 = SIMD.Float32x4.swizzle(minor0, 2, 1, 0, 3);
//minor1 = SIMD.Float32x4.swizzle(minor1, 2, 1, 0, 3);
//minor2 = SIMD.Float32x4.swizzle(minor2, 2, 1, 0, 3);
//minor3 = SIMD.Float32x4.swizzle(minor3, 2, 1, 0, 3);
// Compute final values by multiplying with 1/det
minor0 = SIMD.Float32x4.mul(det, minor0);
minor1 = SIMD.Float32x4.mul(det, minor1);
minor2 = SIMD.Float32x4.mul(det, minor2);
minor3 = SIMD.Float32x4.mul(det, minor3);
SIMD.Float32x4.store(dst, 0, minor0);
SIMD.Float32x4.store(dst, 4, minor1);
SIMD.Float32x4.store(dst, 8, minor2);
SIMD.Float32x4.store(dst, 12, minor3);
}
}
// Non SIMD version of the kernel
function nonSimdMatrixInverseN(n) {
for (var iterations = 0; iterations < n; ++iterations) {
// Transpose the source matrix
for (var i = 0; i < 4; i++) {
tsrc[i] = src[i * 4];
tsrc[i + 4] = src[i * 4 + 1];
tsrc[i + 8] = src[i * 4 + 2];
tsrc[i + 12] = src[i * 4 + 3];
}
// Calculate pairs for first 8 elements (cofactors)
tmp[0] = tsrc[10] * tsrc[15];
tmp[1] = tsrc[11] * tsrc[14];
tmp[2] = tsrc[9] * tsrc[15];
tmp[3] = tsrc[11] * tsrc[13];
tmp[4] = tsrc[9] * tsrc[14];
tmp[5] = tsrc[10] * tsrc[13];
tmp[6] = tsrc[8] * tsrc[15];
tmp[7] = tsrc[11] * tsrc[12];
tmp[8] = tsrc[8] * tsrc[14];
tmp[9] = tsrc[10] * tsrc[12];
tmp[10] = tsrc[8] * tsrc[13];
tmp[11] = tsrc[9] * tsrc[12];
// calculate first 8 elements (cofactors)
dst[0] = tmp[0] * tsrc[5] + tmp[3] * tsrc[6] + tmp[4] * tsrc[7];
dst[0] -= tmp[1] * tsrc[5] + tmp[2] * tsrc[6] + tmp[5] * tsrc[7];
dst[1] = tmp[1] * tsrc[4] + tmp[6] * tsrc[6] + tmp[9] * tsrc[7];
dst[1] -= tmp[0] * tsrc[4] + tmp[7] * tsrc[6] + tmp[8] * tsrc[7];
dst[2] = tmp[2] * tsrc[4] + tmp[7] * tsrc[5] + tmp[10] * tsrc[7];
dst[2] -= tmp[3] * tsrc[4] + tmp[6] * tsrc[5] + tmp[11] * tsrc[7];
dst[3] = tmp[5] * tsrc[4] + tmp[8] * tsrc[5] + tmp[11] * tsrc[6];
dst[3] -= tmp[4] * tsrc[4] + tmp[9] * tsrc[5] + tmp[10] * tsrc[6];
dst[4] = tmp[1] * tsrc[1] + tmp[2] * tsrc[2] + tmp[5] * tsrc[3];
dst[4] -= tmp[0] * tsrc[1] + tmp[3] * tsrc[2] + tmp[4] * tsrc[3];
dst[5] = tmp[0] * tsrc[0] + tmp[7] * tsrc[2] + tmp[8] * tsrc[3];
dst[5] -= tmp[1] * tsrc[0] + tmp[6] * tsrc[2] + tmp[9] * tsrc[3];
dst[6] = tmp[3] * tsrc[0] + tmp[6] * tsrc[1] + tmp[11] * tsrc[3];
dst[6] -= tmp[2] * tsrc[0] + tmp[7] * tsrc[1] + tmp[10] * tsrc[3];
dst[7] = tmp[4] * tsrc[0] + tmp[9] * tsrc[1] + tmp[10] * tsrc[2];
dst[7] -= tmp[5] * tsrc[0] + tmp[8] * tsrc[1] + tmp[11] * tsrc[2];
// calculate pairs for second 8 elements (cofactors)
tmp[0] = tsrc[2] * tsrc[7];
tmp[1] = tsrc[3] * tsrc[6];
tmp[2] = tsrc[1] * tsrc[7];
tmp[3] = tsrc[3] * tsrc[5];
tmp[4] = tsrc[1] * tsrc[6];
tmp[5] = tsrc[2] * tsrc[5];
tmp[6] = tsrc[0] * tsrc[7];
tmp[7] = tsrc[3] * tsrc[4];
tmp[8] = tsrc[0] * tsrc[6];
tmp[9] = tsrc[2] * tsrc[4];
tmp[10] = tsrc[0] * tsrc[5];
tmp[11] = tsrc[1] * tsrc[4];
// calculate second 8 elements (cofactors)
dst[8] = tmp[0] * tsrc[13] + tmp[3] * tsrc[14] + tmp[4] * tsrc[15];
dst[8] -= tmp[1] * tsrc[13] + tmp[2] * tsrc[14] + tmp[5] * tsrc[15];
dst[9] = tmp[1] * tsrc[12] + tmp[6] * tsrc[14] + tmp[9] * tsrc[15];
dst[9] -= tmp[0] * tsrc[12] + tmp[7] * tsrc[14] + tmp[8] * tsrc[15];
dst[10] = tmp[2] * tsrc[12] + tmp[7] * tsrc[13] + tmp[10] * tsrc[15];
dst[10] -= tmp[3] * tsrc[12] + tmp[6] * tsrc[13] + tmp[11] * tsrc[15];
dst[11] = tmp[5] * tsrc[12] + tmp[8] * tsrc[13] + tmp[11] * tsrc[14];
dst[11] -= tmp[4] * tsrc[12] + tmp[9] * tsrc[13] + tmp[10] * tsrc[14];
dst[12] = tmp[2] * tsrc[10] + tmp[5] * tsrc[11] + tmp[1] * tsrc[9];
dst[12] -= tmp[4] * tsrc[11] + tmp[0] * tsrc[9] + tmp[3] * tsrc[10];
dst[13] = tmp[8] * tsrc[11] + tmp[0] * tsrc[8] + tmp[7] * tsrc[10];
dst[13] -= tmp[6] * tsrc[10] + tmp[9] * tsrc[11] + tmp[1] * tsrc[8];
dst[14] = tmp[6] * tsrc[9] + tmp[11] * tsrc[11] + tmp[3] * tsrc[8];
dst[14] -= tmp[10] * tsrc[11] + tmp[2] * tsrc[8] + tmp[7] * tsrc[9];
dst[15] = tmp[10] * tsrc[10] + tmp[4] * tsrc[8] + tmp[9] * tsrc[9];
dst[15] -= tmp[8] * tsrc[9] + tmp[11] * tsrc[10] + tmp[5] * tsrc[8];
// calculate determinant
var det = tsrc[0] * dst[0] + tsrc[1] * dst[1] + tsrc[2] * dst[2] + tsrc[3] * dst[3];
// calculate matrix inverse
det = 1 / det;
for (var j = 0; j < 16; j++) {
dst[j] *= det;
}
}
}
} ());