blob: f2dfd52a58334a51d9fbbf5e0ebf1ad542b5a8dc [file] [log] [blame]
#include "m512_test_util.h"
#include <math.h>
#include <memory.h>
#include <stdio.h>
#include <stdlib.h>
/*
* Test reduce instructions.
* Here we check for _mm512_[mask_]reduce_[add|mul|min|max|and|or] intrinsics.
*/
typedef long long s64;
typedef unsigned long long u64;
typedef float f32;
typedef double f64;
typedef int s32;
typedef unsigned int u32;
int verbose = 0;
#define VERBOSE (verbose > 1)
#define SHOW_OP (verbose > 2)
typedef enum {
ASSIGN,
ADD,
REDUCE_ADD,
REDUCE_MUL,
REDUCE_MIN,
REDUCE_MAX,
REDUCE_GMIN,
REDUCE_GMAX,
REDUCE_OR,
REDUCE_AND
} OPER;
__mmask16 mask_true = 0xffff;
#define MASK(mask, n) ((mask & (0x1 << n)) != 0)
#define IMin(i, j) (((i) <= (j)) ? (i) : (j))
#define IMax(i, j) (((i) >= (j)) ? (i) : (j))
#define MULOP(a, b) (a * b)
#define ADDOP(a, b) (a + b)
#define OROP(a, b) (a | b)
#define ANDOP(a, b) (a & b)
#define GMINOP(a, b) fmin(a, b)
#define GMAXOP(a, b) fmax(a, b)
#define DO_MASK_COPY(len, output, mask, input, def) \
{ \
int n; \
\
for (n = 0; n < len; n += 1) { \
if (MASK(mask, n)) { \
output[n] = input[n]; \
} else { \
output[n] = def; \
} \
} \
}
#define DO_REDUCE_16(res, mask, input, dtype, oper, initval) \
{ \
dtype dtype##tmp[4]; \
V512 vtmp; \
DO_MASK_COPY(16, vtmp.dtype, mask, input, initval); \
\
dtype##tmp[0] = oper(vtmp.dtype[0], vtmp.dtype[4]); \
dtype##tmp[1] = oper(vtmp.dtype[1], vtmp.dtype[5]); \
dtype##tmp[2] = oper(vtmp.dtype[2], vtmp.dtype[6]); \
dtype##tmp[3] = oper(vtmp.dtype[3], vtmp.dtype[7]); \
\
dtype##tmp[0] = oper(dtype##tmp[0], vtmp.dtype[8]); \
dtype##tmp[1] = oper(dtype##tmp[1], vtmp.dtype[9]); \
dtype##tmp[2] = oper(dtype##tmp[2], vtmp.dtype[10]); \
dtype##tmp[3] = oper(dtype##tmp[3], vtmp.dtype[11]); \
\
dtype##tmp[0] = oper(dtype##tmp[0], vtmp.dtype[12]); \
dtype##tmp[1] = oper(dtype##tmp[1], vtmp.dtype[13]); \
dtype##tmp[2] = oper(dtype##tmp[2], vtmp.dtype[14]); \
dtype##tmp[3] = oper(dtype##tmp[3], vtmp.dtype[15]); \
\
dtype##tmp[0] = oper(dtype##tmp[0], dtype##tmp[1]); \
dtype##tmp[2] = oper(dtype##tmp[2], dtype##tmp[3]); \
\
res = oper(dtype##tmp[0], dtype##tmp[2]); \
}
#define DO_REDUCE_8(res, mask, input, dtype, oper, initval) \
{ \
dtype dtype##tmp[4]; \
V512 vtmp; \
DO_MASK_COPY(8, vtmp.dtype, mask, input, initval); \
\
dtype##tmp[0] = oper(vtmp.dtype[0], vtmp.dtype[4]); \
dtype##tmp[1] = oper(vtmp.dtype[1], vtmp.dtype[5]); \
dtype##tmp[2] = oper(vtmp.dtype[2], vtmp.dtype[6]); \
dtype##tmp[3] = oper(vtmp.dtype[3], vtmp.dtype[7]); \
\
dtype##tmp[0] = oper(dtype##tmp[0], dtype##tmp[1]); \
dtype##tmp[2] = oper(dtype##tmp[2], dtype##tmp[3]); \
\
res = oper(dtype##tmp[0], dtype##tmp[2]); \
}
static int NOINLINE mask_s32_reduce_op(OPER op, __mmask16 mask,
int s32op1[16]) {
int handled = 0;
int res;
switch (op) {
case REDUCE_ADD:
handled = 1;
DO_REDUCE_16(res, mask, s32op1, s32, ADDOP, 0);
break;
case REDUCE_MUL:
handled = 1;
DO_REDUCE_16(res, mask, s32op1, s32, MULOP, 1);
break;
case REDUCE_MIN:
handled = 1;
DO_REDUCE_16(res, mask, s32op1, s32, IMin, 0x7fffffff);
break;
case REDUCE_MAX:
handled = 1;
DO_REDUCE_16(res, mask, s32op1, s32, IMax, 0x80000000);
break;
case REDUCE_OR:
handled = 1;
DO_REDUCE_16(res, mask, s32op1, s32, OROP, 0);
break;
case REDUCE_AND:
handled = 1;
DO_REDUCE_16(res, mask, s32op1, s32, ANDOP, 0xffffffff);
break;
default:
printf("FAIL: mask_s32_reduce_op: bad op\n");
exit(1);
break;
}
if (!handled) {
printf("FAIL: mask_s32_reduce_op: unsupported op\n");
}
return (res);
}
static int NOINLINE mask_u32_reduce_op(OPER op, __mmask16 mask,
u32 u32op1[16]) {
int handled = 0;
int res;
switch (op) {
case REDUCE_MIN:
handled = 1;
DO_REDUCE_16(res, mask, u32op1, u32, IMin, 0xffffffff);
break;
case REDUCE_MAX:
handled = 1;
DO_REDUCE_16(res, mask, u32op1, u32, IMax, 0x00000000);
break;
default:
printf("FAIL: mask_u32_reduce_op: bad op\n");
exit(1);
break;
}
if (!handled) {
printf("FAIL: mask_u32_reduce_op: unsupported op\n");
}
return (res);
}
static void NOINLINE init_s32(int s32out[16], int s32op1[16]) {
int i = 0;
for (i = 0; i < 16; i++) {
s32out[i] = s32op1[i];
}
}
static void NOINLINE init_f32(float f32out[16], float f32op1[16]) {
int i = 0;
for (i = 0; i < 16; i++) {
f32out[i] = f32op1[i];
}
}
static float NOINLINE mask_f32_reduce_op(OPER op, __mmask16 mask,
float valop1[16]) {
int handled = 0;
float res;
union {
float f32init;
int s32init;
} init;
switch (op) {
case REDUCE_ADD:
handled = 1;
DO_REDUCE_16(res, mask, valop1, f32, ADDOP, 0.0);
break;
case REDUCE_MUL:
handled = 1;
DO_REDUCE_16(res, mask, valop1, f32, MULOP, 1.0);
break;
case REDUCE_GMIN:
handled = 1;
init.s32init = 0x7f800000; /* +inf */
DO_REDUCE_16(res, mask, valop1, f32, GMINOP, init.f32init);
break;
case REDUCE_GMAX:
handled = 1;
init.s32init = 0xff800000; /* -inf */
DO_REDUCE_16(res, mask, valop1, f32, GMAXOP, init.f32init);
break;
default:
printf("FAIL: mask_f32_reduce_op: bad op\n");
exit(1);
break;
}
if (!handled) {
printf("FAIL: mask_f32_reduce_op: unsupported op\n");
}
return (res);
}
static void NOINLINE init_f64(double f64out[8], double f64op1[8]) {
int i = 0;
for (i = 0; i < 8; i++) {
f64out[i] = f64op1[i];
}
}
static double NOINLINE mask_f64_reduce_op(OPER op, __mmask16 mask,
double valop1[8]) {
int handled = 0;
double res;
union {
double f64init;
int s32init[2];
} init;
switch (op) {
case REDUCE_ADD:
handled = 1;
DO_REDUCE_8(res, mask, valop1, f64, ADDOP, 0.0);
break;
case REDUCE_MUL:
handled = 1;
DO_REDUCE_8(res, mask, valop1, f64, MULOP, 1.0);
break;
case REDUCE_GMIN:
handled = 1;
init.s32init[0] = 0x00000000; /* +inf */
init.s32init[1] = 0x7ff00000; /* +inf */
DO_REDUCE_8(res, mask, valop1, f64, GMINOP, init.f64init);
break;
case REDUCE_GMAX:
handled = 1;
init.s32init[0] = 0x00000000; /* -inf */
init.s32init[1] = 0xfff00000; /* -inf */
DO_REDUCE_8(res, mask, valop1, f64, GMAXOP, init.f64init);
break;
default:
printf("FAIL: mask_f64_reduce_op: bad op\n");
exit(1);
break;
}
if (!handled) {
printf("FAIL: mask_f64_reduce_op: unsupported op\n");
}
return (res);
}
static void NOINLINE print_s32(char *pfx, int var) {
if (pfx) {
printf("%s: ", pfx);
}
printf("%5d", var);
printf("\n");
}
static void NOINLINE print_u32(char *pfx, u32 var) {
if (pfx) {
printf("%s: ", pfx);
}
printf("%5u", var);
printf("\n");
}
static void NOINLINE print_f32(char *pfx, float var) {
if (pfx) {
printf("%s: ", pfx);
}
printf("%5.2f", var);
printf("\n");
}
static void NOINLINE print_f64(char *pfx, double var) {
if (pfx) {
printf("%s: ", pfx);
}
printf("%5.2lf", var);
printf("\n");
}
static void NOINLINE print_ivec(char *pfx, int ivec[]) {
if (pfx) {
printf("%s: ", pfx);
}
char *fmt = "%5d %5d %5d %5d ";
printf(fmt, ivec[15], ivec[14], ivec[13], ivec[12]);
printf(fmt, ivec[11], ivec[10], ivec[9], ivec[8]);
printf(fmt, ivec[7], ivec[6], ivec[5], ivec[4]);
printf(fmt, ivec[3], ivec[2], ivec[1], ivec[0]);
printf("\n");
}
static void NOINLINE print_uvec(char *pfx, u32 ivec[]) {
if (pfx) {
printf("%s: ", pfx);
}
char *fmt = "%5u %5u %5u %5u ";
printf(fmt, ivec[15], ivec[14], ivec[13], ivec[12]);
printf(fmt, ivec[11], ivec[10], ivec[9], ivec[8]);
printf(fmt, ivec[7], ivec[6], ivec[5], ivec[4]);
printf(fmt, ivec[3], ivec[2], ivec[1], ivec[0]);
printf("\n");
}
static void NOINLINE print_fvec(char *pfx, float fvec[]) {
if (pfx) {
printf("%s: ", pfx);
}
char *fmt = "%5.2f %5.2f %5.2f %5.2f ";
printf(fmt, fvec[15], fvec[14], fvec[13], fvec[12]);
printf(fmt, fvec[11], fvec[10], fvec[9], fvec[8]);
printf(fmt, fvec[7], fvec[6], fvec[5], fvec[4]);
printf(fmt, fvec[3], fvec[2], fvec[1], fvec[0]);
printf("\n");
}
static void NOINLINE print_dvec(char *pfx, double dvec[]) {
if (pfx) {
printf("%s: ", pfx);
}
char *fmt = "%5.2lf %5.2lf %5.2lf %5.2lf ";
printf(fmt, dvec[7], dvec[6], dvec[5], dvec[4]);
printf(fmt, dvec[3], dvec[2], dvec[1], dvec[0]);
printf("\n");
}
#define PRINT_MASK(bits, width, pfx, var) \
print_mask(bits, "%" #width "d ", pfx, var)
static void NOINLINE print_mask(int bits, char *fmt, char *pfx,
__mmask16 mask) {
int i;
if (pfx) {
printf("%s: ", pfx);
}
for (i = bits; i >= 1; i -= 1) {
printf(fmt, MASK(mask, (i - 1)));
}
printf("\n");
}
#define CHECK_PRINT(STATUS, FUNC) \
if (!(STATUS)) { \
printf("FAIL " #FUNC "\n"); \
err += 1; \
} else if (VERBOSE) { \
printf("PASS " #FUNC "\n"); \
}
#define CHECK_REDUCE_S32(FUNC) \
{ \
int passed = (result == mresult); \
CHECK_PRINT(passed, FUNC); \
if (!passed || SHOW_OP) { \
print_ivec("Opand1", v1.s32); \
print_s32("Scalar", result); \
print_s32("Vector", mresult); \
} \
}
#define CHECK_REDUCE_U32(FUNC) \
{ \
int passed = (result == mresult); \
CHECK_PRINT(passed, FUNC); \
if (!passed || SHOW_OP) { \
print_uvec("Opand1", v1.u32); \
print_u32("Scalar", result); \
print_u32("Vector", mresult); \
} \
}
#define CHECK_MASK_REDUCE_S32(FUNC) \
{ \
int passed = (result == mresult); \
CHECK_PRINT(passed, FUNC); \
if (!passed || SHOW_OP) { \
print_ivec("Opand1", v1.s32); \
PRINT_MASK(16, 5, " Mask", mask); \
print_s32("Scalar", result); \
print_s32("Vector", mresult); \
} \
}
#define CHECK_MASK_REDUCE_U32(FUNC) \
{ \
int passed = (result == mresult); \
CHECK_PRINT(passed, FUNC); \
if (!passed || SHOW_OP) { \
print_uvec("Opand1", v1.u32); \
PRINT_MASK(16, 5, " Mask", mask); \
print_u32("Scalar", result); \
print_u32("Vector", mresult); \
} \
}
#define CHECK_REDUCE_F32(FUNC) \
{ \
int passed = (result == mresult); \
CHECK_PRINT(passed, FUNC); \
if (!passed || SHOW_OP) { \
print_fvec("Opand1", v1.f32); \
print_f32("Scalar", result); \
print_f32("Vector", mresult); \
} \
}
#define CHECK_MASK_REDUCE_F32(FUNC) \
{ \
int passed = (result == mresult); \
CHECK_PRINT(passed, FUNC); \
if (!passed || SHOW_OP) { \
print_fvec("Opand1", v1.f32); \
PRINT_MASK(16, 9, " Mask", mask); \
print_f32("Scalar", result); \
print_f32("Vector", mresult); \
} \
}
#define CHECK_REDUCE_F64(FUNC) \
{ \
int passed = (result == mresult); \
CHECK_PRINT(passed, FUNC); \
if (!passed || SHOW_OP) { \
print_dvec("Opand1", v1.f64); \
print_f64("Scalar", result); \
print_f64("Vector", mresult); \
} \
}
#define CHECK_MASK_REDUCE_F64(FUNC) \
{ \
int passed = (result == mresult); \
CHECK_PRINT(passed, FUNC); \
if (!passed || SHOW_OP) { \
print_dvec("Opand1", v1.f64); \
PRINT_MASK(8, 10, " Mask", mask); \
print_f64("Scalar", result); \
print_f64("Vector", mresult); \
} \
}
#define DOONE_REDUCE_S32(OP, FUNC) \
{ \
int result; \
int mresult; \
result = mask_s32_reduce_op(OP, mask_true, v1.s32); \
mresult = FUNC(v1.zmmi); \
CHECK_REDUCE_S32(FUNC); \
}
#define DOONE_MASK_REDUCE_S32(OP, mask, FUNC) \
{ \
int result; \
int mresult; \
result = mask_s32_reduce_op(OP, mask, v1.s32); \
mresult = FUNC(mask, v1.zmmi); \
CHECK_MASK_REDUCE_S32(FUNC); \
}
#define DOONE_REDUCE_U32(OP, FUNC) \
{ \
u32 result; \
u32 mresult; \
result = mask_u32_reduce_op(OP, mask_true, v1.u32); \
mresult = FUNC(v1.zmmi); \
CHECK_REDUCE_U32(FUNC); \
}
#define DOONE_MASK_REDUCE_U32(OP, mask, FUNC) \
{ \
int result; \
int mresult; \
result = mask_u32_reduce_op(OP, mask, v1.u32); \
mresult = FUNC(mask, v1.zmmi); \
CHECK_MASK_REDUCE_U32(FUNC); \
}
#define DOONE_REDUCE_F32(OP, FUNC) \
{ \
float result; \
float mresult; \
result = mask_f32_reduce_op(OP, mask_true, v1.f32); \
mresult = FUNC(v1.zmm); \
CHECK_REDUCE_F32(FUNC); \
}
#define DOONE_MASK_REDUCE_F32(OP, mask, FUNC) \
{ \
float result; \
float mresult; \
result = mask_f32_reduce_op(OP, mask, v1.f32); \
mresult = FUNC(mask, v1.zmm); \
CHECK_MASK_REDUCE_F32(FUNC); \
}
#define DOONE_REDUCE_F64(OP, FUNC) \
{ \
double result; \
double mresult; \
result = mask_f64_reduce_op(OP, mask_true, v1.f64); \
mresult = FUNC(v1.zmmd); \
CHECK_REDUCE_F64(FUNC); \
}
#define DOONE_MASK_REDUCE_F64(OP, mask, FUNC) \
{ \
double result; \
double mresult; \
memset(&result, 0, sizeof(result)); \
memset(&mresult, 0, sizeof(mresult)); \
result = mask_f64_reduce_op(OP, mask, v1.f64); \
mresult = FUNC(mask, v1.zmmd); \
CHECK_MASK_REDUCE_F64(FUNC); \
}
__mmask16 mvals[] = {0, 0x82a5};
int main(int argc, char *argv[]) {
int i;
int err = 0;
int init1[16] = {7, 1, -3, 3, 1, 1, 2, 3, 1, 3, 2, 3, -5, 1, 11, 3};
float finit1[16] = {-1.0, -2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 18.0,
-9.0, 10.0, 11.0, -12.0, 13.0, 14.0, 15.0, 16.0};
double dinit1[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
double dinit2[8] = {0.5, 2.0, 3.0, 2.1, 5.0, 5.2, 7.1, 3.1};
V512 v1;
__mmask16 mask = 0x82a5;
verbose = argc;
/* zmmi/s32 tests ---------------------------------------- */
/* _mm512_reduce_add_epi32 */
init_s32(v1.s32, init1);
DOONE_REDUCE_S32(REDUCE_ADD, _mm512_reduce_add_epi32);
/* _mm512_reduce_mul_epi32 */
init_s32(v1.s32, init1);
DOONE_REDUCE_S32(REDUCE_MUL, _mm512_reduce_mul_epi32);
/* _mm512_reduce_min_epi32 */
init_s32(v1.s32, init1);
DOONE_REDUCE_S32(REDUCE_MIN, _mm512_reduce_min_epi32);
/* _mm512_reduce_max_epi32 */
init_s32(v1.s32, init1);
DOONE_REDUCE_S32(REDUCE_MAX, _mm512_reduce_max_epi32);
/* _mm512_reduce_and_epi32 */
init_s32(v1.s32, init1);
DOONE_REDUCE_S32(REDUCE_AND, _mm512_reduce_and_epi32);
/* _mm512_reduce_or_epi32 */
init_s32(v1.s32, init1);
DOONE_REDUCE_S32(REDUCE_OR, _mm512_reduce_or_epi32);
/* _mm512_reduce_min_epu32 */
init_s32(v1.s32, init1);
DOONE_REDUCE_U32(REDUCE_MIN, _mm512_reduce_min_epu32);
/* _mm512_reduce_max_epu32 */
init_s32(v1.s32, init1);
DOONE_REDUCE_U32(REDUCE_MAX, _mm512_reduce_max_epu32);
for (i = 0; i < 2; i += 1) {
mask = mvals[i];
/* _mm512_mask_reduce_min_epu32 */
init_s32(v1.s32, init1);
DOONE_MASK_REDUCE_U32(REDUCE_MIN, mask, _mm512_mask_reduce_min_epu32);
/* _mm512_mask_reduce_max_epu32 */
init_s32(v1.s32, init1);
DOONE_MASK_REDUCE_U32(REDUCE_MAX, mask, _mm512_mask_reduce_max_epu32);
}
for (i = 0; i < 2; i += 1) {
mask = mvals[i];
/* _mm512_mask_reduce_add_epi32 */
init_s32(v1.s32, init1);
DOONE_MASK_REDUCE_S32(REDUCE_ADD, mask, _mm512_mask_reduce_add_epi32);
/* _mm512_mask_reduce_mul_epi32 */
init_s32(v1.s32, init1);
DOONE_MASK_REDUCE_S32(REDUCE_MUL, mask, _mm512_mask_reduce_mul_epi32);
/* _mm512_mask_reduce_min_epi32 */
init_s32(v1.s32, init1);
DOONE_MASK_REDUCE_S32(REDUCE_MIN, mask, _mm512_mask_reduce_min_epi32);
/* _mm512_mask_reduce_max_epi32 */
init_s32(v1.s32, init1);
DOONE_MASK_REDUCE_S32(REDUCE_MAX, mask, _mm512_mask_reduce_max_epi32);
/* _mm512_mask_reduce_and_epi32 */
init_s32(v1.s32, init1);
DOONE_MASK_REDUCE_S32(REDUCE_AND, mask, _mm512_mask_reduce_and_epi32);
/* _mm512_mask_reduce_or_epi32 */
init_s32(v1.s32, init1);
DOONE_MASK_REDUCE_S32(REDUCE_OR, mask, _mm512_mask_reduce_or_epi32);
}
/* zmm/f32 tests ---------------------------------------- */
/* _mm512_reduce_add_ps */
init_f32(v1.f32, finit1);
DOONE_REDUCE_F32(REDUCE_ADD, _mm512_reduce_add_ps);
/* _mm512_reduce_mul_ps */
init_f32(v1.f32, finit1);
DOONE_REDUCE_F32(REDUCE_MUL, _mm512_reduce_mul_ps);
/* _mm512_reduce_gmin_ps */
init_f32(v1.f32, finit1);
DOONE_REDUCE_F32(REDUCE_GMIN, _mm512_reduce_min_ps);
/* _mm512_reduce_gmax_ps */
init_f32(v1.f32, finit1);
DOONE_REDUCE_F32(REDUCE_GMAX, _mm512_reduce_max_ps);
for (i = 0; i < 2; i += 1) {
mask = mvals[i];
/* _mm512_reduce_gmax_ps */
init_f32(v1.f32, finit1);
DOONE_MASK_REDUCE_F32(REDUCE_GMIN, mask, _mm512_mask_reduce_min_ps);
/* _mm512_reduce_gmax_ps */
init_f32(v1.f32, finit1);
DOONE_MASK_REDUCE_F32(REDUCE_GMAX, mask, _mm512_mask_reduce_max_ps);
/* _mm512_reduce_mul_ps */
init_f32(v1.f32, finit1);
DOONE_MASK_REDUCE_F32(REDUCE_MUL, mask, _mm512_mask_reduce_mul_ps);
/* _mm512_reduce_add_ps */
init_f32(v1.f32, finit1);
DOONE_MASK_REDUCE_F32(REDUCE_ADD, mask, _mm512_mask_reduce_add_ps);
}
/* zmmd/f64 tests ---------------------------------------- */
/* _mm512_reduce_add_pd */
init_f64(v1.f64, dinit1);
DOONE_REDUCE_F64(REDUCE_ADD, _mm512_reduce_add_pd);
/* _mm512_reduce_mul_pd */
init_f64(v1.f64, dinit1);
DOONE_REDUCE_F64(REDUCE_MUL, _mm512_reduce_mul_pd);
/* _mm512_reduce_gmin_pd */
init_f64(v1.f64, dinit1);
DOONE_REDUCE_F64(REDUCE_GMIN, _mm512_reduce_min_pd);
/* _mm512_reduce_gmax_pd */
init_f64(v1.f64, dinit1);
DOONE_REDUCE_F64(REDUCE_GMAX, _mm512_reduce_max_pd);
for (i = 0; i < 2; i += 1) {
mask = mvals[i];
/* _mm512_mask_reduce_gmin_ps */
init_f64(v1.f64, dinit1);
DOONE_MASK_REDUCE_F64(REDUCE_GMIN, mask, _mm512_mask_reduce_min_pd);
/* _mm512_mask_reduce_gmax_ps */
init_f64(v1.f64, dinit2);
DOONE_MASK_REDUCE_F64(REDUCE_GMAX, mask, _mm512_mask_reduce_max_pd);
/* _mm512_mask_reduce_mul_ps */
init_f64(v1.f64, dinit1);
DOONE_MASK_REDUCE_F64(REDUCE_MUL, mask, _mm512_mask_reduce_mul_pd);
/* _mm512_mask_reduce_add_ps */
init_f64(v1.f64, dinit2);
DOONE_MASK_REDUCE_F64(REDUCE_ADD, mask, _mm512_mask_reduce_add_pd);
}
if (err) {
printf("FAILED\n");
return 1;
}
printf("PASSED\n");
return 0;
}