test-suite: add avx512 tests with math intrinsics

Summary:
This adds a set of tests; each test performs operations with AVX512 math
intrinsics and checks the results.
The tests depend on CPU features, so the test-suite needs support for
identifying AVX512 features in both its Makefile and CMake builds. The
corresponding Makefile and CMake patches are
[[https://reviews.llvm.org/D38182|D38182]] and
[[https://reviews.llvm.org/D38484|D38484]].
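
As background (not part of this patch): when the build-system checks are
unavailable, a test can also guard itself at run time. A minimal sketch,
assuming a GCC/Clang-compatible compiler that provides
__builtin_cpu_supports:

    #include <stdio.h>

    int main(void) {
      /* Skip gracefully when the host CPU lacks AVX512F. */
      if (!__builtin_cpu_supports("avx512f")) {
        printf("SKIPPED: AVX512F not available on this CPU\n");
        return 0;
      }
      printf("AVX512F available\n");
      return 0;
    }

The actual feature detection used by the test-suite lives in the Makefile
and CMake patches referenced above.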


Reviewers: craig.topper, zvi, MatzeB

Reviewed By: MatzeB

Subscribers: llvm-commits, mgorny

Differential Revision: https://reviews.llvm.org/D41249

git-svn-id: https://llvm.org/svn/llvm-project/test-suite/trunk@321144 91177308-0d34-0410-b5e6-96231b3b80d8
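
Note: the tests below include m512_test_util.h, which is not part of this
diff. A rough sketch of the declarations they rely on, reconstructed from
usage (an approximation, not the actual header):

    #include <immintrin.h>
    #include <inttypes.h>

    #define NOINLINE __attribute__((noinline))

    typedef long long I64;
    typedef unsigned long long U64;

    /* One 512-bit value, viewable as native vectors or packed elements. */
    typedef union {
      __m512  zmm;       /* 16 x float  */
      __m512d zmmd;      /*  8 x double */
      __m512i zmmi;      /* packed integers */
      __m128i xmm[4];
      float   f32[16];
      double  f64[8];
      int     s32[16];
      unsigned int u32[16];
      I64     s64[8];
      U64     u64[8];
    } V512;

    extern int n_errs; /* global error counter, tested in each main() */

    /* Compare two buffers as n_dwords 32-bit words; count mismatches. */
    void check_equal_nd(void *got, void *expected, int n_dwords,
                        const char *banner, int line);
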
diff --git a/SingleSource/UnitTests/Vector/AVX512/CMakeLists.txt b/SingleSource/UnitTests/Vector/AVX512/CMakeLists.txt
new file mode 100644
index 0000000..78559cd
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/CMakeLists.txt
@@ -0,0 +1,4 @@
+list(APPEND LDFLAGS -lm)
+list(APPEND CFLAGS -march=skylake-avx512)
+list(APPEND CFLAGS -fms-extensions)
+llvm_singlesource(PREFIX "Vector-AVX512-")
diff --git a/SingleSource/UnitTests/Vector/AVX512/Makefile b/SingleSource/UnitTests/Vector/AVX512/Makefile
new file mode 100644
index 0000000..feaad11
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/Makefile
@@ -0,0 +1,11 @@
+# SingleSource/UnitTests/Vector/AVX512/Makefile
+
+DIRS = 
+LEVEL = ../../../..
+CFLAGS += -fms-extensions -march=skylake-avx512
+LDFLAGS += -lm
+
+include $(LEVEL)/SingleSource/Makefile.singlesrc
+
+TARGET_FLAGS += -march=skylake-avx512
+LCCFLAGS += -march=skylake-avx512
diff --git a/SingleSource/UnitTests/Vector/AVX512/abs.c b/SingleSource/UnitTests/Vector/AVX512/abs.c
new file mode 100644
index 0000000..de100b5
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/abs.c
@@ -0,0 +1,65 @@
+/*
+ * Test absolute value intrinsics.
+ * Here we check for _mm512_abs_ps and _mm512_abs_pd intrinsics.
+ */
+
+#include "m512_test_util.h"
+#include <stdio.h>
+
+V512 f32;
+V512 f64;
+
+void NOINLINE init() {
+  volatile int i;
+
+  for (i = 0; i < 16; i++) {
+    f32.f32[i] = (i & 1) ? i : -i;
+  }
+
+  for (i = 0; i < 8; i++) {
+    f64.f64[i] = (i & 1) ? -i : i;
+  }
+}
+
+void NOINLINE do_abs_ps() {
+  V512 res;
+  V512 expected;
+  volatile int i;
+
+  res.zmm = _mm512_abs_ps(f32.zmm);
+
+  for (i = 0; i < 16; i++) {
+    expected.s32[i] = f32.s32[i] & 0x7fffffff;
+  }
+
+  check_equal_nd(&res, &expected, 16, "_mm512_abs_ps", __LINE__);
+}
+
+void NOINLINE do_abs_pd() {
+  V512 res;
+  V512 expected;
+  volatile int i;
+
+  res.zmmd = _mm512_abs_pd(f64.zmmd);
+
+  for (i = 0; i < 8; i++) {
+    expected.s64[i] = f64.s64[i] & 0x7fffffffffffffff;
+  }
+
+  check_equal_nd(&res, &expected, 16, "_mm512_abs_pd", __LINE__);
+}
+
+int main(int argc, char *argv[]) {
+  init();
+
+  do_abs_ps();
+  do_abs_pd();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/abs.reference_output b/SingleSource/UnitTests/Vector/AVX512/abs.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/abs.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/fma.c b/SingleSource/UnitTests/Vector/AVX512/fma.c
new file mode 100644
index 0000000..73ef4b6
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/fma.c
@@ -0,0 +1,827 @@
+/*
+ * Test fma(fmadd, fmsub, fnmadd, fnmsub) instructions.
+ * Here we check for _mm512_[mask|mask3]_f[madd|msub|nmadd|nmsub]_[round]
+ * intrinsics.
+ */
+
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+
+int verbose = 0;
+
+__m512i i1;
+__m512i i2;
+__m512i i3;
+__m512i i4;
+__m512i i5;
+
+__m512 f1;
+__m512 f2;
+__m512 f3;
+__m512 f4;
+__m512 f5;
+
+__m512d d1;
+__m512d d2;
+__m512d d3;
+__m512d d4;
+__m512d d5;
+
+typedef enum { FMA_233, FMA_132, FMA_231, FMA_213, FMA_23c1 } Fma_order;
+
+volatile int vol = 0; /* To prevent optimizations */
+
+void NOINLINE init() {
+  int i;
+  V512 *pi1 = (V512 *)&i1;
+  V512 *pi2 = (V512 *)&i2;
+  V512 *pi3 = (V512 *)&i3;
+  V512 *pf1 = (V512 *)&f1;
+  V512 *pf2 = (V512 *)&f2;
+  V512 *pf3 = (V512 *)&f3;
+  V512 *pd1 = (V512 *)&d1;
+  V512 *pd2 = (V512 *)&d2;
+  V512 *pd3 = (V512 *)&d3;
+
+  for (i = 0; i < 16; i++) {
+    pi1->s32[i] = 17 + ((i & 1) ? 1 : -1) * i + vol;
+    pf1->f32[i] = pi1->s32[i];
+
+    pi2->s32[i] = 100 + ((i & 3) == 3 ? 1 : -1) * i + vol;
+    pf2->f32[i] = -pi2->s32[i];
+
+    pi3->s32[i] = 400 + ((i & 1) ? -1 : 1) * i + vol;
+    pf3->f32[i] = pi3->s32[i];
+  }
+
+  for (i = 0; i < 8; i++) {
+    pd1->f64[i] = pi1->s32[i];
+    pd2->f64[i] = pi2->s32[i];
+    pd3->f64[i] = -pi3->s32[i];
+  }
+}
+
+void NOINLINE check_equal32(void *vgot, void *vexpected, void *vexpected_orig,
+                            int mask, char *banner) {
+  int i;
+  V512 *got = (V512 *)vgot;
+  V512 *expected = (V512 *)vexpected;
+  V512 *orig = (V512 *)vexpected_orig;
+
+  for (i = 0; i < 16; i++) {
+    int ans = (mask & (1 << i)) ? expected->s32[i] : orig->s32[i];
+    if (got->s32[i] != ans) {
+      printf("ERROR: %s failed -- 0x%0.8x != 0x%0.8x at element [%d]\n",
+             banner ? banner : "", got->s32[i], ans, i);
+      n_errs++;
+      break;
+    }
+  }
+}
+
+void NOINLINE check_equal64(void *vgot, void *vexpected, void *vexpected_orig,
+                            int mask, char *banner) {
+  int i;
+  V512 *got = (V512 *)vgot;
+  V512 *expected = (V512 *)vexpected;
+  V512 *orig = (V512 *)vexpected_orig;
+
+  for (i = 0; i < 8; i++) {
+    __int64 ans = (mask & (1 << i)) ? expected->s64[i] : orig->s64[i];
+    if (got->s64[i] != ans) {
+      printf("ERROR: %s failed -- %0.16" PRIx64 " != %0.16" PRIx64
+             " at element [%d]\n",
+             banner ? banner : "", got->s64[i], ans, i);
+      n_errs++;
+      break;
+    }
+  }
+}
+
+void NOINLINE emulate_fmadd_ps(void *presult, const void *p1, int mask,
+                               const void *p2, const void *p3,
+                               Fma_order order) {
+  int i;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  V512 *v3 = (V512 *)p3;
+
+  for (i = 0; i < 16; i++) {
+
+    if (((1 << i) & mask) == 0) {
+      result->u32[i] = v1->u32[i];
+      continue;
+    }
+
+    switch (order) {
+    case FMA_233:
+      result->f32[i] =
+          v2->f32[i] * v3->f32[(i & ~0x3) + 1] + v3->f32[(i & ~0x3)];
+      break;
+
+    case FMA_132:
+      result->f32[i] = v1->f32[i] * v3->f32[i] + v2->f32[i];
+      break;
+
+    case FMA_231:
+      result->f32[i] = v2->f32[i] * v3->f32[i] + v1->f32[i];
+      break;
+
+    case FMA_213:
+      result->f32[i] = v2->f32[i] * v1->f32[i] + v3->f32[i];
+      break;
+
+    case FMA_23c1:
+      result->f32[i] = (v2->f32[i] * v3->f32[i]) + 1.0f;
+      break;
+
+    default:
+      printf("ERROR -- bad fma order %d\n", (int)order);
+      n_errs++;
+      return;
+    }
+  }
+}
+
+void NOINLINE emulate_fmsub_ps(void *presult, const void *p1, int mask,
+                               const void *p2, const void *p3,
+                               Fma_order order) {
+  int i;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  V512 *v3 = (V512 *)p3;
+
+  for (i = 0; i < 16; i++) {
+
+    if (((1 << i) & mask) == 0) {
+      result->u32[i] = v1->u32[i];
+      continue;
+    }
+
+    switch (order) {
+    case FMA_233:
+      result->f32[i] =
+          v2->f32[i] * v3->f32[(i & ~0x3) + 1] - v3->f32[(i & ~0x3)];
+      break;
+
+    case FMA_132:
+      result->f32[i] = v1->f32[i] * v3->f32[i] - v2->f32[i];
+      break;
+
+    case FMA_231:
+      result->f32[i] = v2->f32[i] * v3->f32[i] - v1->f32[i];
+      break;
+
+    case FMA_213:
+      result->f32[i] = v2->f32[i] * v1->f32[i] - v3->f32[i];
+      break;
+
+    case FMA_23c1:
+      result->f32[i] = (v2->f32[i] * v3->f32[i]) - 1.0f;
+      break;
+
+    default:
+      printf("ERROR -- bad fma order %d\n", (int)order);
+      n_errs++;
+      return;
+    }
+  }
+}
+
+void NOINLINE emulate_fnmadd_ps(void *presult, const void *p1, int mask,
+                                const void *p2, const void *p3,
+                                Fma_order order) {
+  int i;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  V512 *v3 = (V512 *)p3;
+
+  for (i = 0; i < 16; i++) {
+
+    if (((1 << i) & mask) == 0) {
+      result->u32[i] = v1->u32[i];
+      continue;
+    }
+
+    switch (order) {
+    case FMA_233:
+      result->f32[i] =
+          -(v2->f32[i] * v3->f32[(i & ~0x3) + 1]) + v3->f32[(i & ~0x3)];
+      break;
+
+    case FMA_132:
+      result->f32[i] = -(v1->f32[i] * v3->f32[i]) + v2->f32[i];
+      break;
+
+    case FMA_231:
+      result->f32[i] = -(v2->f32[i] * v3->f32[i]) + v1->f32[i];
+      break;
+
+    case FMA_213:
+      result->f32[i] = -(v2->f32[i] * v1->f32[i]) + v3->f32[i];
+      break;
+
+    case FMA_23c1:
+      result->f32[i] = -(v2->f32[i] * v3->f32[i]) + 1.0f;
+      break;
+
+    default:
+      printf("ERROR -- bad fma order %d\n", (int)order);
+      n_errs++;
+      return;
+    }
+  }
+}
+
+void NOINLINE emulate_fnmsub_ps(void *presult, const void *p1, int mask,
+                                const void *p2, const void *p3,
+                                Fma_order order) {
+  int i;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  V512 *v3 = (V512 *)p3;
+
+  for (i = 0; i < 16; i++) {
+
+    if (((1 << i) & mask) == 0) {
+      result->u32[i] = v1->u32[i];
+      continue;
+    }
+
+    switch (order) {
+    case FMA_233:
+      result->f32[i] =
+          -(v2->f32[i] * v3->f32[(i & ~0x3) + 1]) - v3->f32[(i & ~0x3)];
+      break;
+
+    case FMA_132:
+      result->f32[i] = -(v1->f32[i] * v3->f32[i]) - v2->f32[i];
+      break;
+
+    case FMA_231:
+      result->f32[i] = -(v2->f32[i] * v3->f32[i]) - v1->f32[i];
+      break;
+
+    case FMA_213:
+      result->f32[i] = -(v2->f32[i] * v1->f32[i]) - v3->f32[i];
+      break;
+
+    case FMA_23c1:
+      result->f32[i] = -(v2->f32[i] * v3->f32[i]) - 1.0f;
+      break;
+
+    default:
+      printf("ERROR -- bad fma order %d\n", (int)order);
+      n_errs++;
+      return;
+    }
+  }
+}
+
+void NOINLINE emulate_fmadd_pi(void *presult, const void *p1, int mask,
+                               const void *p2, const void *p3,
+                               Fma_order order) {
+  int i;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  V512 *v3 = (V512 *)p3;
+
+  for (i = 0; i < 16; i++) {
+
+    if (((1 << i) & mask) == 0) {
+      result->u32[i] = v1->u32[i];
+      continue;
+    }
+
+    switch (order) {
+    case FMA_233:
+      result->s32[i] =
+          v2->s32[i] * v3->s32[(i & ~0x3) + 1] + v3->s32[(i & ~0x3)];
+      break;
+
+    case FMA_132:
+      result->s32[i] = v1->s32[i] * v3->s32[i] + v2->s32[i];
+      break;
+
+    case FMA_231:
+      result->s32[i] = v2->s32[i] * v3->s32[i] + v1->s32[i];
+      break;
+
+    case FMA_213:
+      result->s32[i] = v2->s32[i] * v1->s32[i] + v3->s32[i];
+      break;
+
+    case FMA_23c1:
+      result->s32[i] = v2->s32[i] * v3->s32[i] + 1;
+      break;
+
+    default:
+      printf("ERROR -- bad fma order %d\n", (int)order);
+      n_errs++;
+      return;
+    }
+  }
+}
+
+void NOINLINE emulate_fmadd_pd(void *presult, const void *p1, int mask,
+                               const void *p2, const void *p3,
+                               Fma_order order) {
+  int i;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  V512 *v3 = (V512 *)p3;
+
+  for (i = 0; i < 8; i++) {
+
+    if (((1 << i) & mask) == 0) {
+      result->u64[i] = v1->u64[i];
+      continue;
+    }
+
+    switch (order) {
+    case FMA_233:
+      result->f64[i] =
+          v2->f64[i] * v3->f64[(i & ~0x3) + 1] + v3->f64[(i & ~0x3)];
+      break;
+
+    case FMA_132:
+      result->f64[i] = v1->f64[i] * v3->f64[i] + v2->f64[i];
+      break;
+
+    case FMA_231:
+      result->f64[i] = v2->f64[i] * v3->f64[i] + v1->f64[i];
+      break;
+
+    case FMA_213:
+      result->f64[i] = v2->f64[i] * v1->f64[i] + v3->f64[i];
+      break;
+
+    case FMA_23c1:
+      result->f64[i] = v2->f64[i] * v3->f64[i] + 1.0;
+      break;
+
+    default:
+      printf("ERROR -- bad fma order %d\n", (int)order);
+      n_errs++;
+      return;
+    }
+  }
+}
+
+void NOINLINE emulate_fmsub_pd(void *presult, const void *p1, int mask,
+                               const void *p2, const void *p3,
+                               Fma_order order) {
+  int i;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  V512 *v3 = (V512 *)p3;
+
+  for (i = 0; i < 8; i++) {
+
+    if (((1 << i) & mask) == 0) {
+      result->u64[i] = v1->u64[i];
+      continue;
+    }
+
+    switch (order) {
+    case FMA_233:
+      result->f64[i] =
+          v2->f64[i] * v3->f64[(i & ~0x3) + 1] - v3->f64[(i & ~0x3)];
+      break;
+
+    case FMA_132:
+      result->f64[i] = v1->f64[i] * v3->f64[i] - v2->f64[i];
+      break;
+
+    case FMA_231:
+      result->f64[i] = v2->f64[i] * v3->f64[i] - v1->f64[i];
+      break;
+
+    case FMA_213:
+      result->f64[i] = v2->f64[i] * v1->f64[i] - v3->f64[i];
+      break;
+
+    case FMA_23c1:
+      result->f64[i] = v2->f64[i] * v3->f64[i] - 1.0;
+      break;
+
+    default:
+      printf("ERROR -- bad fma order %d\n", (int)order);
+      n_errs++;
+      return;
+    }
+  }
+}
+
+void NOINLINE emulate_fnmadd_pd(void *presult, const void *p1, int mask,
+                                const void *p2, const void *p3,
+                                Fma_order order) {
+  int i;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  V512 *v3 = (V512 *)p3;
+
+  for (i = 0; i < 8; i++) {
+
+    if (((1 << i) & mask) == 0) {
+      result->u64[i] = v1->u64[i];
+      continue;
+    }
+
+    switch (order) {
+    case FMA_233:
+      result->f64[i] =
+          -(v2->f64[i] * v3->f64[(i & ~0x3) + 1]) + v3->f64[(i & ~0x3)];
+      break;
+
+    case FMA_132:
+      result->f64[i] = -(v1->f64[i] * v3->f64[i]) + v2->f64[i];
+      break;
+
+    case FMA_231:
+      result->f64[i] = -(v2->f64[i] * v3->f64[i]) + v1->f64[i];
+      break;
+
+    case FMA_213:
+      result->f64[i] = -(v2->f64[i] * v1->f64[i]) + v3->f64[i];
+      break;
+
+    case FMA_23c1:
+      result->f64[i] = -(v2->f64[i] * v3->f64[i]) + 1.0;
+      break;
+
+    default:
+      printf("ERROR -- bad fma order %d\n", (int)order);
+      n_errs++;
+      return;
+    }
+  }
+}
+
+void NOINLINE emulate_fnmsub_pd(void *presult, const void *p1, int mask,
+                                const void *p2, const void *p3,
+                                Fma_order order) {
+  int i;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  V512 *v3 = (V512 *)p3;
+
+  for (i = 0; i < 8; i++) {
+
+    if (((1 << i) & mask) == 0) {
+      result->u64[i] = v1->u64[i];
+      continue;
+    }
+
+    switch (order) {
+    case FMA_233:
+      result->f64[i] =
+          -(v2->f64[i] * v3->f64[(i & ~0x3) + 1]) - v3->f64[(i & ~0x3)];
+      break;
+
+    case FMA_132:
+      result->f64[i] = -(v1->f64[i] * v3->f64[i]) - v2->f64[i];
+      break;
+
+    case FMA_231:
+      result->f64[i] = -(v2->f64[i] * v3->f64[i]) - v1->f64[i];
+      break;
+
+    case FMA_213:
+      result->f64[i] = -(v2->f64[i] * v1->f64[i]) - v3->f64[i];
+      break;
+
+    case FMA_23c1:
+      result->f64[i] = -(v2->f64[i] * v3->f64[i]) - 1.0;
+      break;
+
+    default:
+      printf("ERROR -- bad fma order %d\n", (int)order);
+      n_errs++;
+      return;
+    }
+  }
+}
+
+void NOINLINE do_fmadd_ps() {
+  f4 = _mm512_fmadd_ps(f1, f2, f3);
+  emulate_fmadd_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmadd_ps");
+
+  f4 = _mm512_mask_fmadd_ps(f1, 0x79fa, f2, f3);
+  emulate_fmadd_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmadd_ps");
+
+  f4 = _mm512_mask3_fmadd_ps(f1, f2, f3, 0x563a);
+  emulate_fmadd_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231);
+  check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmadd_ps");
+
+  /*
+   * Employ rounding modes.
+   * Our FP inputs are all integer values, so there's no need for any
+   * special emulation routine.
+   */
+
+  f4 = _mm512_fmadd_round_ps(f1, f2, f3,
+                             _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  emulate_fmadd_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmadd_round_ps");
+
+  f4 = _mm512_mask_fmadd_round_ps(f1, 0x79fa, f2, f3,
+                                  _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  emulate_fmadd_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmadd_round_ps");
+
+  f4 = _mm512_mask3_fmadd_round_ps(f1, f2, f3, 0x563a,
+                                   _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  emulate_fmadd_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231);
+  check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmadd_round_ps");
+}
+
+void NOINLINE do_fnmsub_ps() {
+  f4 = _mm512_fnmsub_ps(f1, f2, f3);
+  emulate_fnmsub_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fnmsub_ps");
+
+  f4 = _mm512_mask_fnmsub_ps(f1, 0x79fa, f2, f3);
+  emulate_fnmsub_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fnmsub_ps");
+
+  f4 = _mm512_mask3_fnmsub_ps(f1, f2, f3, 0x563a);
+  emulate_fnmsub_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231);
+  check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fnmsub_ps");
+
+  /*
+   * Employ rounding modes.
+   * Our FP inputs are all integer values, so there's no need for any
+   * special emulation routine.
+   */
+
+  f4 = _mm512_fnmsub_round_ps(f1, f2, f3,
+                              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  emulate_fnmsub_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fnmsub_round_ps");
+
+  f4 = _mm512_mask_fnmsub_round_ps(f1, 0x79fa, f2, f3,
+                                   _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  emulate_fnmsub_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fnmsub_round_ps");
+
+  f4 = _mm512_mask3_fnmsub_round_ps(f1, f2, f3, 0x563a,
+                                    _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  emulate_fnmsub_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231);
+  check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fnmsub_round_ps");
+}
+
+void NOINLINE do_fmadd_pd() {
+  d4 = _mm512_fmadd_pd(d1, d2, d3);
+  emulate_fmadd_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmadd_pd");
+
+  d4 = _mm512_mask_fmadd_pd(d1, 0xfa, d2, d3);
+  emulate_fmadd_pd(&d5, &d1, 0xfa, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, &d1, 0xfa, "_mm512_mask_fmadd_pd");
+
+  d4 = _mm512_mask3_fmadd_pd(d1, d2, d3, 0x56);
+  emulate_fmadd_pd(&d5, &d3, 0x56, &d1, &d2, FMA_231);
+  check_equal64(&d4, &d5, &d3, 0x56, "_mm512_mask3_fmadd_pd");
+
+  /*
+   * Employ rounding modes.
+   * Our FP inputs are all integer values, so there's no need for any
+   * special emulation routine.
+   */
+
+  d4 = _mm512_fmadd_round_pd(d1, d2, d3,
+                             _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  emulate_fmadd_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmadd_round_pd");
+
+  d4 = _mm512_mask_fmadd_round_pd(d1, 0x79, d2, d3,
+                                  _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  emulate_fmadd_pd(&d5, &d1, 0x79, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, &d1, 0x79, "_mm512_mask_fmadd_round_pd");
+
+  d4 = _mm512_mask3_fmadd_round_pd(d1, d2, d3, 0x63,
+                                   _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  emulate_fmadd_pd(&d5, &d3, 0x63, &d1, &d2, FMA_231);
+  check_equal64(&d4, &d5, &d3, 0x63, "_mm512_mask3_fmadd_round_pd");
+}
+
+void NOINLINE do_fnmsub_pd() {
+  d4 = _mm512_fnmsub_pd(d1, d2, d3);
+  emulate_fnmsub_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fnmsub_pd");
+
+  d4 = _mm512_mask_fnmsub_pd(d1, 0xfa, d2, d3);
+  emulate_fnmsub_pd(&d5, &d1, 0xfa, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, &d1, 0xfa, "_mm512_mask_fnmsub_pd");
+
+  d4 = _mm512_mask3_fnmsub_pd(d1, d2, d3, 0x56);
+  emulate_fnmsub_pd(&d5, &d3, 0x56, &d1, &d2, FMA_231);
+  check_equal64(&d4, &d5, &d3, 0x56, "_mm512_mask3_fnmsub_pd");
+
+  /*
+   * Employ rounding modes.
+   * Our FP inputs are all integer values, so there's no need for any
+   * special emulation routine.
+   */
+
+  d4 = _mm512_fnmsub_round_pd(d1, d2, d3,
+                              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  emulate_fnmsub_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fnmsub_round_pd");
+
+  d4 = _mm512_mask_fnmsub_round_pd(d1, 0x79, d2, d3,
+                                   _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  emulate_fnmsub_pd(&d5, &d1, 0x79, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, &d1, 0x79, "_mm512_mask_fnmsub_round_pd");
+
+  d4 = _mm512_mask3_fnmsub_round_pd(d1, d2, d3, 0x63,
+                                    _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  emulate_fnmsub_pd(&d5, &d3, 0x63, &d1, &d2, FMA_231);
+  check_equal64(&d4, &d5, &d3, 0x63, "_mm512_mask3_fnmsub_round_pd");
+}
+
+void NOINLINE do_fmsub_ps() {
+  f4 = _mm512_fmsub_ps(f1, f2, f3);
+  emulate_fmsub_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmsub_ps");
+
+  f4 = _mm512_mask_fmsub_ps(f1, 0x79fa, f2, f3);
+  emulate_fmsub_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmsub_ps");
+
+  f4 = _mm512_mask3_fmsub_ps(f1, f2, f3, 0x563a);
+  emulate_fmsub_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231);
+  check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmsub_ps");
+
+  /*
+   * Employ rounding modes.
+   * Our FP inputs are all integer values, so there's no need for any
+   * special emulation routine.
+   */
+
+  f4 = _mm512_fmsub_round_ps(f1, f2, f3,
+                             _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  emulate_fmsub_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmsub_round_ps");
+
+  f4 = _mm512_mask_fmsub_round_ps(f1, 0x79fa, f2, f3,
+                                  _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  emulate_fmsub_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmsub_round_ps");
+
+  f4 = _mm512_mask3_fmsub_round_ps(f1, f2, f3, 0x563a,
+                                   _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  emulate_fmsub_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231);
+  check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmsub_round_ps");
+}
+
+void NOINLINE do_fnmadd_ps() {
+  f4 = _mm512_fnmadd_ps(f1, f2, f3);
+  emulate_fnmadd_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fnmadd_ps");
+
+  f4 = _mm512_mask_fnmadd_ps(f1, 0x79fa, f2, f3);
+  emulate_fnmadd_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fnmadd_ps");
+
+  f4 = _mm512_mask3_fnmadd_ps(f1, f2, f3, 0x563a);
+  emulate_fnmadd_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231);
+  check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fnmadd_ps");
+
+  /*
+   * Employ rounding modes.
+   * Our FP inputs are all integer values, so there's no need for any
+   * special emulation routine.
+   */
+
+  f4 = _mm512_fnmadd_round_ps(f1, f2, f3,
+                              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  emulate_fnmadd_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fnmadd_round_ps");
+
+  f4 = _mm512_mask_fnmadd_round_ps(f1, 0x79fa, f2, f3,
+                                   _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  emulate_fnmadd_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fnmadd_round_ps");
+
+  f4 = _mm512_mask3_fnmadd_round_ps(f1, f2, f3, 0x563a,
+                                    _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  emulate_fnmadd_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231);
+  check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fnmadd_round_ps");
+}
+
+void NOINLINE do_fmsub_pd() {
+  d4 = _mm512_fmsub_pd(d1, d2, d3);
+  emulate_fmsub_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmsub_pd");
+
+  d4 = _mm512_mask_fmsub_pd(d1, 0xfa, d2, d3);
+  emulate_fmsub_pd(&d5, &d1, 0xfa, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, &d1, 0xfa, "_mm512_mask_fmsub_pd");
+
+  d4 = _mm512_mask3_fmsub_pd(d1, d2, d3, 0x56);
+  emulate_fmsub_pd(&d5, &d3, 0x56, &d1, &d2, FMA_231);
+  check_equal64(&d4, &d5, &d3, 0x56, "_mm512_mask3_fmsub_pd");
+
+  /*
+   * Employ rounding modes.
+   * Our FP inputs are all integer values, so there's no need for any
+   * special emulation routine.
+   */
+
+  d4 = _mm512_fmsub_round_pd(d1, d2, d3,
+                             _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  emulate_fmsub_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmsub_round_pd");
+
+  d4 = _mm512_mask_fmsub_round_pd(d1, 0x79, d2, d3,
+                                  _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  emulate_fmsub_pd(&d5, &d1, 0x79, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, &d1, 0x79, "_mm512_mask_fmsub_round_pd");
+
+  d4 = _mm512_mask3_fmsub_round_pd(d1, d2, d3, 0x63,
+                                   _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  emulate_fmsub_pd(&d5, &d3, 0x63, &d1, &d2, FMA_231);
+  check_equal64(&d4, &d5, &d3, 0x63, "_mm512_mask3_fmsub_round_pd");
+}
+
+void NOINLINE do_fnmadd_pd() {
+  d4 = _mm512_fnmadd_pd(d1, d2, d3);
+  emulate_fnmadd_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fnmadd_pd");
+
+  d4 = _mm512_mask_fnmadd_pd(d1, 0xfa, d2, d3);
+  emulate_fnmadd_pd(&d5, &d1, 0xfa, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, &d1, 0xfa, "_mm512_mask_fnmadd_pd");
+
+  d4 = _mm512_mask3_fnmadd_pd(d1, d2, d3, 0x56);
+  emulate_fnmadd_pd(&d5, &d3, 0x56, &d1, &d2, FMA_231);
+  check_equal64(&d4, &d5, &d3, 0x56, "_mm512_mask3_fnmadd_pd");
+
+  /*
+   * Employ rounding modes.
+   * Our FP inputs are all integer values, so there's no need for any
+   * special emulation routine.
+   */
+
+  d4 = _mm512_fnmadd_round_pd(d1, d2, d3,
+                              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  emulate_fnmadd_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fnmadd_round_pd");
+
+  d4 = _mm512_mask_fnmadd_round_pd(d1, 0x79, d2, d3,
+                                   _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  emulate_fnmadd_pd(&d5, &d1, 0x79, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, &d1, 0x79, "_mm512_mask_fnmadd_round_pd");
+
+  d4 = _mm512_mask3_fnmadd_round_pd(d1, d2, d3, 0x63,
+                                    _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  emulate_fnmadd_pd(&d5, &d3, 0x63, &d1, &d2, FMA_231);
+  check_equal64(&d4, &d5, &d3, 0x63, "_mm512_mask3_fnmadd_round_pd");
+}
+
+int main(int argc, char *argv[]) {
+  if (argc > 1 && argv[1][0] == '-' && argv[1][1] == 'v' &&
+      argv[1][2] == '\0') {
+    verbose = 1;
+  }
+
+  init();
+
+  do_fmadd_ps();
+
+  do_fmadd_pd();
+
+  do_fnmsub_ps();
+
+  do_fnmsub_pd();
+
+  do_fmsub_ps();
+
+  do_fmsub_pd();
+
+  do_fnmadd_ps();
+
+  do_fnmadd_pd();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/fma.reference_output b/SingleSource/UnitTests/Vector/AVX512/fma.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/fma.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/fma_addsub.c b/SingleSource/UnitTests/Vector/AVX512/fma_addsub.c
new file mode 100644
index 0000000..c848e8b
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/fma_addsub.c
@@ -0,0 +1,411 @@
+/*
+ * Test addsub and subadd instructions.
+ * Here we check for _mm512_[mask|mask3]_[fmaddsub|fmsubadd]_[round]
+ * intrinsics.
+ */
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+
+int verbose = 0;
+
+__m512 f1;
+__m512 f2;
+__m512 f3;
+__m512 f4;
+__m512 f5;
+
+__m512d d1;
+__m512d d2;
+__m512d d3;
+__m512d d4;
+__m512d d5;
+
+typedef enum {
+  FMA_132,
+  FMA_231,
+  FMA_213,
+} Fma_order;
+
+volatile int vol = 0; /* To prevent optimizations */
+
+void NOINLINE init() {
+  int i;
+  V512 *pf1 = (V512 *)&f1;
+  V512 *pf2 = (V512 *)&f2;
+  V512 *pf3 = (V512 *)&f3;
+  V512 *pd1 = (V512 *)&d1;
+  V512 *pd2 = (V512 *)&d2;
+  V512 *pd3 = (V512 *)&d3;
+
+  for (i = 0; i < 16; i++) {
+    pf1->f32[i] = 17 + ((i & 1) ? 1 : -1) * i + vol;
+    pf2->f32[i] = -(100 + ((i & 3) == 3 ? 1 : -1) * i + vol);
+    pf3->f32[i] = 400 + ((i & 1) ? -1 : 1) * i + vol;
+  }
+
+  for (i = 0; i < 8; i++) {
+    pd1->f64[i] = pf1->f32[i];
+    pd2->f64[i] = -pf2->f32[i];
+    pd3->f64[i] = -pf3->f32[i];
+  }
+}
+
+void NOINLINE check_equal32(void *vgot, void *vexpected, void *vexpected_orig,
+                            int mask, char *banner) {
+  int i;
+  V512 *got = (V512 *)vgot;
+  V512 *expected = (V512 *)vexpected;
+  V512 *orig = (V512 *)vexpected_orig;
+
+  for (i = 0; i < 16; i++) {
+    int ans = (mask & (1 << i)) ? expected->s32[i] : orig->s32[i];
+    if (got->s32[i] != ans) {
+      printf("ERROR: %s failed -- 0x%0.8x != 0x%0.8x at element [%d]\n",
+             banner ? banner : "", got->s32[i], ans, i);
+      n_errs++;
+      break;
+    }
+  }
+}
+
+void NOINLINE check_equal64(void *vgot, void *vexpected, void *vexpected_orig,
+                            int mask, char *banner) {
+  int i;
+  V512 *got = (V512 *)vgot;
+  V512 *expected = (V512 *)vexpected;
+  V512 *orig = (V512 *)vexpected_orig;
+
+  for (i = 0; i < 8; i++) {
+    __int64 ans = (mask & (1 << i)) ? expected->s64[i] : orig->s64[i];
+    if (got->s64[i] != ans) {
+      printf("ERROR: %s failed -- %0.16" PRIx64 " != %0.16" PRIx64
+             " at element [%d]\n",
+             banner ? banner : "", got->s64[i], ans, i);
+      n_errs++;
+      break;
+    }
+  }
+}
+
+void NOINLINE emulate_fmaddsub_ps(void *presult, const void *p1, int mask,
+                                  const void *p2, const void *p3,
+                                  Fma_order order) {
+  int i;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  V512 *v3 = (V512 *)p3;
+  for (i = 0; i < 16; i++) {
+
+    if (((1 << i) & mask) == 0) {
+      result->u32[i] = v1->u32[i];
+      continue;
+    }
+
+    switch (order) {
+    case FMA_132:
+      result->f32[i] =
+          v1->f32[i] * v3->f32[i] + ((i % 2 == 0) ? -v2->f32[i] : v2->f32[i]);
+      break;
+
+    case FMA_231:
+      result->f32[i] =
+          v2->f32[i] * v3->f32[i] + ((i % 2 == 0) ? -v1->f32[i] : v1->f32[i]);
+      break;
+
+    case FMA_213:
+      result->f32[i] =
+          v2->f32[i] * v1->f32[i] + ((i % 2 == 0) ? -v3->f32[i] : v3->f32[i]);
+      break;
+
+    default:
+      printf("ERROR -- bad fma order %d\n", (int)order);
+      n_errs++;
+      return;
+    }
+  }
+}
+
+void NOINLINE emulate_fmsubadd_ps(void *presult, const void *p1, int mask,
+                                  const void *p2, const void *p3,
+                                  Fma_order order) {
+  int i;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  V512 *v3 = (V512 *)p3;
+
+  for (i = 0; i < 16; i++) {
+
+    if (((1 << i) & mask) == 0) {
+      result->u32[i] = v1->u32[i];
+      continue;
+    }
+
+    switch (order) {
+    case FMA_132:
+      result->f32[i] =
+          v1->f32[i] * v3->f32[i] + ((i % 2 == 0) ? v2->f32[i] : -v2->f32[i]);
+      break;
+
+    case FMA_231:
+      result->f32[i] =
+          v2->f32[i] * v3->f32[i] + ((i % 2 == 0) ? v1->f32[i] : -v1->f32[i]);
+      break;
+
+    case FMA_213:
+      result->f32[i] =
+          v2->f32[i] * v1->f32[i] + ((i % 2 == 0) ? v3->f32[i] : -v3->f32[i]);
+      break;
+
+    default:
+      printf("ERROR -- bad fma order %d\n", (int)order);
+      n_errs++;
+      return;
+    }
+  }
+}
+
+void NOINLINE emulate_fmaddsub_pd(void *presult, const void *p1, int mask,
+                                  const void *p2, const void *p3,
+                                  Fma_order order) {
+  int i;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  V512 *v3 = (V512 *)p3;
+
+  for (i = 0; i < 8; i++) {
+
+    if (((1 << i) & mask) == 0) {
+      result->u64[i] = v1->u64[i];
+      continue;
+    }
+
+    switch (order) {
+    case FMA_132:
+      result->f64[i] =
+          v1->f64[i] * v3->f64[i] + ((i % 2 == 0) ? -v2->f64[i] : v2->f64[i]);
+      break;
+
+    case FMA_231:
+      result->f64[i] =
+          v2->f64[i] * v3->f64[i] + ((i % 2 == 0) ? -v1->f64[i] : v1->f64[i]);
+      break;
+
+    case FMA_213:
+      result->f64[i] =
+          v2->f64[i] * v1->f64[i] + ((i % 2 == 0) ? -v3->f64[i] : v3->f64[i]);
+      break;
+
+    default:
+      printf("ERROR -- bad fma order %d\n", (int)order);
+      n_errs++;
+      return;
+    }
+  }
+}
+
+void NOINLINE emulate_fmsubadd_pd(void *presult, const void *p1, int mask,
+                                  const void *p2, const void *p3,
+                                  Fma_order order) {
+  int i;
+  V512 *result = (V512 *)presult;
+  V512 *v1 = (V512 *)p1;
+  V512 *v2 = (V512 *)p2;
+  V512 *v3 = (V512 *)p3;
+
+  for (i = 0; i < 8; i++) {
+
+    if (((1 << i) & mask) == 0) {
+      result->u64[i] = v1->u64[i];
+      continue;
+    }
+
+    switch (order) {
+    case FMA_132:
+      result->f64[i] =
+          v1->f64[i] * v3->f64[i] + ((i % 2 == 0) ? v2->f64[i] : -v2->f64[i]);
+      break;
+
+    case FMA_231:
+      result->f64[i] =
+          v2->f64[i] * v3->f64[i] + ((i % 2 == 0) ? v1->f64[i] : -v1->f64[i]);
+      break;
+
+    case FMA_213:
+      result->f64[i] =
+          v2->f64[i] * v1->f64[i] + ((i % 2 == 0) ? v3->f64[i] : -v3->f64[i]);
+      break;
+
+    default:
+      printf("ERROR -- bad fma order %d\n", (int)order);
+      n_errs++;
+      return;
+    }
+  }
+}
+
+void NOINLINE do_fmaddsub_ps() {
+  f4 = _mm512_fmaddsub_ps(f1, f2, f3);
+  emulate_fmaddsub_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmaddsub_ps");
+
+  f4 = _mm512_mask_fmaddsub_ps(f1, 0x79fa, f2, f3);
+  emulate_fmaddsub_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmaddsub_ps");
+
+  f4 = _mm512_mask3_fmaddsub_ps(f1, f2, f3, 0x563a);
+  emulate_fmaddsub_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231);
+  check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmaddsub_ps");
+
+  /*
+   * Employ rounding modes.
+   * Our FP inputs are all integer values, so there's no need for any
+   * special emulation routine.
+   */
+
+  f4 = _mm512_fmaddsub_round_ps(f1, f2, f3,
+                                _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  emulate_fmaddsub_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmaddsub_round_ps");
+
+  f4 = _mm512_mask_fmaddsub_round_ps(f1, 0x79fa, f2, f3,
+                                     _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  emulate_fmaddsub_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmaddsub_round_ps");
+
+  f4 = _mm512_mask3_fmaddsub_round_ps(
+      f1, f2, f3, 0x563a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  emulate_fmaddsub_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231);
+  check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmaddsub_round_ps");
+}
+
+void NOINLINE do_fmaddsub_pd() {
+  d4 = _mm512_fmaddsub_pd(d1, d2, d3);
+  emulate_fmaddsub_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmaddsub_pd");
+
+  d4 = _mm512_mask_fmaddsub_pd(d1, 0xfa, d2, d3);
+  emulate_fmaddsub_pd(&d5, &d1, 0xfa, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, &d1, 0xfa, "_mm512_mask_fmaddsub_pd");
+
+  d4 = _mm512_mask3_fmaddsub_pd(d1, d2, d3, 0x56);
+  emulate_fmaddsub_pd(&d5, &d3, 0x56, &d1, &d2, FMA_231);
+  check_equal64(&d4, &d5, &d3, 0x56, "_mm512_mask3_fmaddsub_pd");
+
+  /*
+   * Employ rounding modes.
+   * Our FP inputs are all integer values, so there's no need for any
+   * special emulation routine.
+   */
+
+  d4 = _mm512_fmaddsub_round_pd(d1, d2, d3,
+                                _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  emulate_fmaddsub_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmaddsub_round_pd");
+
+  d4 = _mm512_mask_fmaddsub_round_pd(d1, 0x79, d2, d3,
+                                     _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  emulate_fmaddsub_pd(&d5, &d1, 0x79, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, &d1, 0x79, "_mm512_mask_fmaddsub_round_pd");
+
+  d4 = _mm512_mask3_fmaddsub_round_pd(
+      d1, d2, d3, 0x63, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  emulate_fmaddsub_pd(&d5, &d3, 0x63, &d1, &d2, FMA_231);
+  check_equal64(&d4, &d5, &d3, 0x63, "_mm512_mask3_fmaddsub_round_pd");
+}
+
+void NOINLINE do_fmsubadd_ps() {
+  f4 = _mm512_fmsubadd_ps(f1, f2, f3);
+  emulate_fmsubadd_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmsubadd_ps");
+
+  f4 = _mm512_mask_fmsubadd_ps(f1, 0x79fa, f2, f3);
+  emulate_fmsubadd_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmsubadd_ps");
+
+  f4 = _mm512_mask3_fmsubadd_ps(f1, f2, f3, 0x563a);
+  emulate_fmsubadd_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231);
+  check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmsubadd_ps");
+
+  /*
+   * Employ rounding modes.
+   * Our FP inputs are all integer values, so there's no need for any
+   * special emulation routine.
+   */
+
+  f4 = _mm512_fmsubadd_round_ps(f1, f2, f3,
+                                _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  emulate_fmsubadd_ps(&f5, &f1, 0xffff, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, (void *)0, 0xffff, "_mm512_fmsubadd_round_ps");
+
+  f4 = _mm512_mask_fmsubadd_round_ps(f1, 0x79fa, f2, f3,
+                                     _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  emulate_fmsubadd_ps(&f5, &f1, 0x79fa, &f3, &f2, FMA_132);
+  check_equal32(&f4, &f5, &f1, 0x79fa, "_mm512_mask_fmsubadd_round_ps");
+
+  f4 = _mm512_mask3_fmsubadd_round_ps(
+      f1, f2, f3, 0x563a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  emulate_fmsubadd_ps(&f5, &f3, 0x563a, &f1, &f2, FMA_231);
+  check_equal32(&f4, &f5, &f3, 0x563a, "_mm512_mask3_fmsubadd_round_ps");
+}
+
+void NOINLINE do_fmsubadd_pd() {
+  d4 = _mm512_fmsubadd_pd(d1, d2, d3);
+  emulate_fmsubadd_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmsubadd_pd");
+
+  d4 = _mm512_mask_fmsubadd_pd(d1, 0xfa, d2, d3);
+  emulate_fmsubadd_pd(&d5, &d1, 0xfa, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, &d1, 0xfa, "_mm512_mask_fmsubadd_pd");
+
+  d4 = _mm512_mask3_fmsubadd_pd(d1, d2, d3, 0x56);
+  emulate_fmsubadd_pd(&d5, &d3, 0x56, &d1, &d2, FMA_231);
+  check_equal64(&d4, &d5, &d3, 0x56, "_mm512_mask3_fmsubadd_pd");
+
+  /*
+   * Employ rounding modes.
+   * Our FP inputs are all integer values, so there's no need for any
+   * special emulation routine.
+   */
+
+  d4 = _mm512_fmsubadd_round_pd(d1, d2, d3,
+                                _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  emulate_fmsubadd_pd(&d5, &d1, 0xffff, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, (void *)0, 0xffff, "_mm512_fmsubadd_round_pd");
+
+  d4 = _mm512_mask_fmsubadd_round_pd(d1, 0x79, d2, d3,
+                                     _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  emulate_fmsubadd_pd(&d5, &d1, 0x79, &d3, &d2, FMA_132);
+  check_equal64(&d4, &d5, &d1, 0x79, "_mm512_mask_fmsubadd_round_pd");
+
+  d4 = _mm512_mask3_fmsubadd_round_pd(
+      d1, d2, d3, 0x63, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  emulate_fmsubadd_pd(&d5, &d3, 0x63, &d1, &d2, FMA_231);
+  check_equal64(&d4, &d5, &d3, 0x63, "_mm512_mask3_fmsubadd_round_pd");
+}
+
+int main(int argc, char *argv[]) {
+  if (argc > 1 && argv[1][0] == '-' && argv[1][1] == 'v' &&
+      argv[1][2] == '\0') {
+    verbose = 1;
+  }
+
+  init();
+
+  do_fmaddsub_ps();
+  do_fmaddsub_pd();
+
+  do_fmsubadd_ps();
+  do_fmsubadd_pd();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/fma_addsub.reference_output b/SingleSource/UnitTests/Vector/AVX512/fma_addsub.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/fma_addsub.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/imul.c b/SingleSource/UnitTests/Vector/AVX512/imul.c
new file mode 100644
index 0000000..04f28b7
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/imul.c
@@ -0,0 +1,103 @@
+/*
+ * Test various integer multiply intrinsics.
+ * Here we check for _mm512_[mask]mul_ep[i|u]32 intrinsics.
+ */
+
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+
+volatile int vol0 = 0;
+
+V512 i64;
+V512 i64_mix;
+V512 i64_big;
+
+void NOINLINE init() {
+  volatile int i;
+
+
+  for (i = 0; i < 8; i++) {
+    i64.s64[i] = i;
+    i64_mix.s64[i] = (i & 1) ? i : -i;
+    i64_big.s64[i] = 1000 * (i + 1);
+    if ((i & 1) != 0) {
+      i64_big.s64[i] = -i64_big.s64[i];
+    }
+  }
+}
+
+void NOINLINE do_muldq() {
+  V512 res;
+  V512 expected;
+  __mmask16 k;
+  volatile int i;
+
+  res.zmmi = _mm512_mul_epi32(i64_mix.zmmi, i64_big.zmmi);
+  for (i = 0; i < 8; i++) {
+    expected.s64[i] = (I64)i64_mix.s32[2 * i] * i64_big.s32[2 * i];
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mul_epi32", __LINE__);
+
+  /* No-op to inhibit PRE of i64_big, thus enabling localized ciscization,
+   * i.e. letting the compiler fold the reload into a memory operand.
+   */
+  i64_big.xmm[vol0] = i64_big.xmm[vol0];
+
+  k = 0xcd;
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_mul_epi32(res.zmmi, k, i64.zmmi, i64_big.zmmi);
+  for (i = 0; i < 8; i++) {
+    expected.s64[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.s64[i] = (I64)i64.s32[2 * i] * i64_big.s32[2 * i];
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_mul_epi32", __LINE__);
+}
+
+void NOINLINE do_muludq() {
+  V512 res;
+  V512 expected;
+  __mmask16 k;
+  volatile int i;
+
+  res.zmmi = _mm512_mul_epu32(i64_mix.zmmi, i64_big.zmmi);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = (U64)i64_mix.u32[2 * i] * i64_big.u32[2 * i];
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mul_epu32", __LINE__);
+
+  /* No-op to inhibit PRE of i64_big, thus enabling localized ciscization,
+   * i.e. letting the compiler fold the reload into a memory operand.
+   */
+  i64_big.xmm[vol0] = i64_big.xmm[vol0];
+
+  k = 0xcd;
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_mul_epu32(res.zmmi, k, i64.zmmi, i64_big.zmmi);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.u64[i] = (U64)i64.u32[2 * i] * i64_big.u32[2 * i];
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_mul_epu32", __LINE__);
+}
+
+int main(int argc, char *argv[]) {
+  init();
+
+  do_muldq();
+  do_muludq();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/imul.reference_output b/SingleSource/UnitTests/Vector/AVX512/imul.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/imul.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/m512_op_pd.c b/SingleSource/UnitTests/Vector/AVX512/m512_op_pd.c
new file mode 100644
index 0000000..066ea8a
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/m512_op_pd.c
@@ -0,0 +1,240 @@
+#include "m512_test_util.h"
+#include <memory.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * This test was created to check the correctness
+ * of the following intrinsics support:
+ *      _mm512_add_pd()
+ *      _mm512_max_pd()
+ *      _mm512_min_pd()
+ *      _mm512_mask_max_pd()
+ *      _mm512_mask_min_pd()
+ *      _mm512_mask_mul_pd()
+ *      _mm512_mask_abs_pd()
+ *      _mm512_add_round_pd()
+ *      _mm512_sub_round_pd()
+ */
+
+int show_op = 0;
+
+typedef enum { ASSIGN, ABS, ADD, MAX, MIN, MUL, SUB } OPER;
+
+static void NOINLINE intop(OPER op, double ivalout[8], double ivalop1[8],
+                           double ivalop2[8]) {
+  int i;
+  int handled = 0;
+
+  memset(ivalout, 0, 8 * sizeof(ivalout[0]));
+  for (i = 0; i < 8; i += 1) {
+    switch (op) {
+    case ASSIGN:
+      handled = 1;
+      ivalout[i] = ivalop1[i];
+      break;
+    case ABS:
+      handled = 1;
+      ivalout[i] = ivalop1[i] >= 0 ? ivalop1[i] : -ivalop1[i];
+      break;
+    case ADD:
+      handled = 1;
+      ivalout[i] = ivalop1[i] + ivalop2[i];
+      break;
+    case MAX:
+      handled = 1;
+      ivalout[i] = (ivalop1[i] > ivalop2[i]) ? ivalop1[i] : ivalop2[i];
+      break;
+    case MIN:
+      handled = 1;
+      ivalout[i] = (ivalop1[i] < ivalop2[i]) ? ivalop1[i] : ivalop2[i];
+      break;
+    case MUL:
+      handled = 1;
+      ivalout[i] = ivalop2[i] * ivalop1[i];
+      break;
+    case SUB:
+      handled = 1;
+      ivalout[i] = ivalop1[i] - ivalop2[i];
+      break;
+    default:
+      printf("FAIL: bad op\n");
+      break;
+    }
+  }
+  if (!handled) {
+    printf("FAIL: unsupported op\n");
+    n_errs++;
+  }
+}
+
+static int NOINLINE check(double val1[], double good[]) {
+  int i;
+  int res = 1;
+  for (i = 0; i < 8; i += 1) {
+    if (val1[i] != good[i]) {
+      res = 0;
+      printf("FAIL: %f != %f\n", val1[i], good[i]);
+    }
+  }
+  return (res);
+}
+
+static int NOINLINE check_mask(double val1[], double good[], int mask) {
+  int i;
+  int res = 1;
+  for (i = 0; i < 8; i += 1) {
+    if ((1 << i) & mask) {
+      if (val1[i] != good[i]) {
+        res = 0;
+        printf("FAIL: %f != %f\n", val1[i], good[i]);
+      }
+    }
+  }
+  return (res);
+}
+
+static void NOINLINE print_vec(char *pfx, double ivec[]) {
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  printf("%10.4f %10.4f %10.4f %10.4f ", ivec[7], ivec[6], ivec[5], ivec[4]);
+  printf("%10.4f %10.4f %10.4f %10.4f\n", ivec[3], ivec[2], ivec[1], ivec[0]);
+}
+
+#define DOONE(OP, FUNC)                                                        \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f64, v1.f64, v2.f64);                                       \
+    vvv.zmmd = FUNC(v1.zmmd, v2.zmmd);                                         \
+    passed = check(vvv.f64, good.f64);                                         \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f64);                                             \
+      print_vec("Opand2", v2.f64);                                             \
+      print_vec("Scalar", good.f64);                                           \
+      print_vec("Vector", vvv.f64);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_WITH_MASK(OP, FUNC, MMASK)                                       \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f64, v1.f64, v2.f64);                                       \
+    vvv.zmmd = FUNC(vvv.zmmd, MMASK, v1.zmmd, v2.zmmd);                        \
+    passed = check_mask(vvv.f64, good.f64, MMASK);                             \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f64);                                             \
+      print_vec("Opand2", v2.f64);                                             \
+      print_vec("Scalar", good.f64);                                           \
+      print_vec("Vector", vvv.f64);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_WITH_MASK_1OP(OP, FUNC, MMASK)                                   \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f64, v1.f64, v2.f64);                                       \
+    vvv.zmmd = FUNC(vvv.zmmd, MMASK, v1.zmmd);                                 \
+    passed = check_mask(vvv.f64, good.f64, MMASK);                             \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f64);                                             \
+      print_vec("Opand2", v2.f64);                                             \
+      print_vec("Scalar", good.f64);                                           \
+      print_vec("Vector", vvv.f64);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_ROUND(OP, FUNC, ROUND)                                           \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f64, v1.f64, v2.f64);                                       \
+    vvv.zmmd = FUNC(v1.zmmd, v2.zmmd, ROUND);                                  \
+    passed = check(vvv.f64, good.f64);                                         \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f64);                                             \
+      print_vec("Opand2", v2.f64);                                             \
+      print_vec("Scalar", good.f64);                                           \
+      print_vec("Vector", vvv.f64);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_WITH_MASK_ROUND(OP, FUNC, MMASK, ROUND)                          \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f64, v1.f64, v2.f64);                                       \
+    vvv.zmmd = FUNC(vvv.zmmd, MMASK, v1.zmmd, v2.zmmd, ROUND);                 \
+    passed = check_mask(vvv.f64, good.f64, MMASK);                             \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f64);                                             \
+      print_vec("Opand2", v2.f64);                                             \
+      print_vec("Scalar", good.f64);                                           \
+      print_vec("Vector", vvv.f64);                                            \
+    }                                                                          \
+  }
+
+int main() {
+  double init1[] = {1, 2, -3, 4, 5, -6, 7, 8};
+  double init2[] = {11, 12, 23, 24, 35, 36, 17, 38};
+
+  V512 v1;
+  V512 v2;
+  V512 good;
+  V512 vvv;
+
+  intop(ASSIGN, v1.f64, init1, 0);
+  intop(ASSIGN, v2.f64, init2, 0);
+
+  // simple intrinsics
+  DOONE(ADD, _mm512_add_pd);
+  DOONE(MAX, _mm512_max_pd);
+  DOONE(MIN, _mm512_min_pd);
+  DOONE(MUL, _mm512_mul_pd);
+  DOONE(SUB, _mm512_sub_pd);
+
+  DOONE_WITH_MASK(ADD, _mm512_mask_add_pd, 0x07);
+  DOONE_WITH_MASK(MAX, _mm512_mask_max_pd, 0x01);
+  DOONE_WITH_MASK(MIN, _mm512_mask_min_pd, 0x03);
+  DOONE_WITH_MASK(MUL, _mm512_mask_mul_pd, 0xf0);
+  DOONE_WITH_MASK(SUB, _mm512_mask_sub_pd, 0x9f);
+
+  DOONE_WITH_MASK_1OP(ABS, _mm512_mask_abs_pd, 0xf4);
+
+  // intrinsics with rounding mode
+  DOONE_ROUND(ADD, _mm512_add_round_pd,
+              _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  DOONE_ROUND(SUB, _mm512_sub_round_pd,
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+
+  DOONE_WITH_MASK_ROUND(ADD, _mm512_mask_add_round_pd, 0x07,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_MASK_ROUND(SUB, _mm512_mask_sub_round_pd, 0xf0,
+                        _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/m512_op_pd.reference_output b/SingleSource/UnitTests/Vector/AVX512/m512_op_pd.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/m512_op_pd.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/m512_op_ps.c b/SingleSource/UnitTests/Vector/AVX512/m512_op_ps.c
new file mode 100644
index 0000000..59d8763
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/m512_op_ps.c
@@ -0,0 +1,236 @@
+#include "m512_test_util.h"
+#include <memory.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * This test was created to check the correctness
+ * of the following intrinsics support:
+ *      _mm512_add_ps()
+ *      _mm512_max_ps()
+ *      _mm512_min_ps()
+ *      _mm512_mask_max_ps()
+ *      _mm512_mask_min_ps()
+ *      _mm512_mask_mul_ps()
+ *      _mm512_mask_abs_ps()
+ *      _mm512_add_round_ps()
+ *      _mm512_sub_round_ps()
+ */
+
+
+int show_op = 0;
+
+typedef enum { ASSIGN, ABS, ADD, MAX, MIN, MUL, SUB } OPER;
+
+static void NOINLINE intop(OPER op, float ivalout[16], float ivalop1[16],
+                           float ivalop2[16]) {
+  int i;
+  int handled = 0;
+
+  memset(ivalout, 0, 16 * sizeof(ivalout[0]));
+  for (i = 0; i < 16; i += 1) {
+    switch (op) {
+    case ASSIGN:
+      handled = 1;
+      ivalout[i] = ivalop1[i];
+      break;
+    case ADD:
+      handled = 1;
+      ivalout[i] = ivalop1[i] + ivalop2[i];
+      break;
+    case ABS:
+      handled = 1;
+      ivalout[i] = ivalop1[i] >= 0 ? ivalop1[i] : -ivalop1[i];
+      break;
+    case MAX:
+      handled = 1;
+      ivalout[i] = (ivalop1[i] > ivalop2[i]) ? ivalop1[i] : ivalop2[i];
+      break;
+    case MIN:
+      handled = 1;
+      ivalout[i] = (ivalop1[i] < ivalop2[i]) ? ivalop1[i] : ivalop2[i];
+      break;
+    case MUL:
+      handled = 1;
+      ivalout[i] = ivalop2[i] * ivalop1[i];
+      break;
+    case SUB:
+      handled = 1;
+      ivalout[i] = ivalop1[i] - ivalop2[i];
+      break;
+    default:
+      printf("FAIL: bad op\n");
+      break;
+    }
+  }
+  if (!handled) {
+    printf("FAIL: unsupported op\n");
+    n_errs++;
+  }
+}
+
+static int NOINLINE check(float val1[], float good[]) {
+  int i;
+  int res = 1;
+  for (i = 0; i < 16; i += 1) {
+    if (val1[i] != good[i]) {
+      res = 0;
+      printf("FAIL: %f != %f\n", val1[i], good[i]);
+    }
+  }
+  return (res);
+}
+
+static int NOINLINE check_mask(float val1[], float good[], int mask) {
+  int i;
+  int res = 1;
+  for (i = 0; i < 16; i += 1) {
+    if ((1 << i) & mask) {
+      if (val1[i] != good[i]) {
+        res = 0;
+        printf("FAIL: %f != %f\n", val1[i], good[i]);
+      }
+    }
+  }
+  return (res);
+}
+
+static void NOINLINE print_vec(char *pfx, float ivec[]) {
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  printf("%10.4f %10.4f %10.4f %10.4f ", ivec[15], ivec[14], ivec[13],
+         ivec[12]);
+  printf("%10.4f %10.4f %10.4f %10.4f ", ivec[11], ivec[10], ivec[9], ivec[8]);
+  printf("%10.4f %10.4f %10.4f %10.4f ", ivec[7], ivec[6], ivec[5], ivec[4]);
+  printf("%10.4f %10.4f %10.4f %10.4f\n", ivec[3], ivec[2], ivec[1], ivec[0]);
+}
+
+#define DOONE(OP, FUNC)                                                        \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f32, v1.f32, v2.f32);                                       \
+    vvv.zmm = FUNC(v1.zmm, v2.zmm);                                            \
+    passed = check(vvv.f32, good.f32);                                         \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f32);                                             \
+      print_vec("Opand2", v2.f32);                                             \
+      print_vec("Scalar", good.f32);                                           \
+      print_vec("Vector", vvv.f32);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_WITH_MASK(OP, FUNC, MMASK)                                       \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f32, v1.f32, v2.f32);                                       \
+    vvv.zmm = FUNC(vvv.zmm, MMASK, v1.zmm, v2.zmm);                            \
+    passed = check_mask(vvv.f32, good.f32, MMASK);                             \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f32);                                             \
+      print_vec("Opand2", v2.f32);                                             \
+      print_vec("Scalar", good.f32);                                           \
+      print_vec("Vector", vvv.f32);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_WITH_MASK_1OP(OP, FUNC, MMASK)                                   \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f32, v1.f32, v2.f32);                                       \
+    vvv.zmm = FUNC(vvv.zmm, MMASK, v1.zmm);                                    \
+    passed = check_mask(vvv.f32, good.f32, MMASK);                             \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f32);                                             \
+      print_vec("Opand2", v2.f32);                                             \
+      print_vec("Scalar", good.f32);                                           \
+      print_vec("Vector", vvv.f32);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_ROUND(OP, FUNC, ROUND)                                           \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f32, v1.f32, v2.f32);                                       \
+    vvv.zmm = FUNC(v1.zmm, v2.zmm, ROUND);                                     \
+    passed = check(vvv.f32, good.f32);                                         \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f32);                                             \
+      print_vec("Opand2", v2.f32);                                             \
+      print_vec("Scalar", good.f32);                                           \
+      print_vec("Vector", vvv.f32);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_WITH_MASK_ROUND(OP, FUNC, MMASK, ROUND)                          \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f32, v1.f32, v2.f32);                                       \
+    vvv.zmm = FUNC(vvv.zmm, MMASK, v1.zmm, v2.zmm, ROUND);                     \
+    passed = check_mask(vvv.f32, good.f32, MMASK);                             \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f32);                                             \
+      print_vec("Opand2", v2.f32);                                             \
+      print_vec("Scalar", good.f32);                                           \
+      print_vec("Vector", vvv.f32);                                            \
+    }                                                                          \
+  }
+
+int main() {
+  float init1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -11, 12, 13, 14, 15, 16};
+  float init2[] = {11, 12, 23, 24, -35, 36, 17, 38,
+                   42, -1, 33, 7,  8,   10, 11, 12};
+
+  V512 v1;
+  V512 v2;
+  V512 good;
+  V512 vvv;
+
+  intop(ASSIGN, v1.f32, init1, 0);
+  intop(ASSIGN, v2.f32, init2, 0);
+
+  // simple intrinsics
+  DOONE(ADD, _mm512_add_ps);
+  DOONE(MAX, _mm512_max_ps);
+  DOONE(MIN, _mm512_min_ps);
+
+  DOONE_WITH_MASK(MAX, _mm512_mask_max_ps, 0xf01);
+  DOONE_WITH_MASK(MIN, _mm512_mask_min_ps, 0xf03);
+  DOONE_WITH_MASK(MUL, _mm512_mask_mul_ps, 0xff0);
+
+  DOONE_WITH_MASK_1OP(ABS, _mm512_mask_abs_ps, 0xcf1);
+
+  // intrinsics with rounding mode
+  DOONE_ROUND(ADD, _mm512_add_round_ps,
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_ROUND(SUB, _mm512_sub_round_ps,
+              _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/m512_op_ps.reference_output b/SingleSource/UnitTests/Vector/AVX512/m512_op_ps.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/m512_op_ps.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
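
Every DOONE* invocation in m512_op_ps.c follows the same pattern: compute a scalar reference with intop, run the intrinsic, and compare element by element. For illustration, DOONE(ADD, _mm512_add_ps) expands to roughly the following block:

{
  int passed = 0;
  intop(ADD, good.f32, v1.f32, v2.f32);    /* scalar reference */
  vvv.zmm = _mm512_add_ps(v1.zmm, v2.zmm); /* intrinsic under test */
  passed = check(vvv.f32, good.f32);
  if (!passed) {
    printf("FAIL _mm512_add_ps\n");
    n_errs++;
  }
  if (!passed || show_op) {
    print_vec("Opand1", v1.f32);
    print_vec("Opand2", v2.f32);
    print_vec("Scalar", good.f32);
    print_vec("Vector", vvv.f32);
  }
}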
diff --git a/SingleSource/UnitTests/Vector/AVX512/m512_test_util.h b/SingleSource/UnitTests/Vector/AVX512/m512_test_util.h
new file mode 100644
index 0000000..c98e174
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/m512_test_util.h
@@ -0,0 +1,258 @@
+#ifndef M512_TEST_UTIL_H_INCLUDED
+#define M512_TEST_UTIL_H_INCLUDED
+
+/*
+ * Common declarations useful for writing 512-bit unit tests.
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+#include <x86intrin.h>
+
+#define ALIGNTO(n) __declspec(align(n))
+
+/*
+ * For purposes of unit tests it can be beneficial to suppress inlining
+ * simply so that only a single instance of a test function is emitted.
+ * Makes it easier to diff A/B assembly output.
+ */
+#define NOINLINE __declspec(noinline)
+
+/*
+ * FULL_IREG(ax) expands to either eax or rax depending on the target.
+ */
+#if defined(__x86_64) || defined(_M_X64)
+#define FULL_IREG(reg) r##reg
+#else
+#define FULL_IREG(reg) e##reg
+#endif
+
+/* Number of elements in an array. */
+#define ASIZE(a) (sizeof((a)) / sizeof((a)[0]))
+
+typedef __int64 I64;
+typedef unsigned __int64 U64;
+
+typedef union ALIGNTO(64) {
+
+  __m512 zmm;
+  __m512d zmmd;
+  __m512i zmmi;
+
+  __m256 ymm[2];
+  __m256d ymmd[2];
+  __m256i ymmi[2];
+
+  __m128 xmm[4];
+  __m128d xmmd[4];
+  __m128i xmmi[4];
+
+  char c[64];
+  signed char s8[64];
+  unsigned char u8[64];
+  short s16[32];
+  unsigned short u16[32];
+  int s32[16];
+  unsigned int u32[16];
+  float f32[16];
+  I64 s64[8];
+  U64 u64[8];
+  double f64[8];
+
+} V512;
+
+int n_errs = 0;
+
+/*
+ * Print the low N 32-bit unsigned integers from p.
+ */
+
+void NOINLINE display_pd(const V512 *p, const char *banner, int n_elems) {
+  int i = 15;
+
+  if (banner) {
+    printf("%s", banner);
+  }
+
+  for (i = n_elems - 1; i >= 0; i--) {
+    printf(" %0.8x", p->u32[i]);
+    if (i > 0 && i % 4 == 0) {
+      printf("\n");
+      if (banner) {
+        printf("%*s", (int)strlen((void *)banner), "");
+      }
+    }
+  }
+  printf("\n");
+}
+
+/*
+ * Print the low N 64-bit unsigned integers from p.
+ */
+void NOINLINE display_pq(const V512 *p, const char *banner, int n_elems) {
+  int i = 7;
+
+  if (banner) {
+    printf("%s", banner);
+  }
+
+  for (i = n_elems - 1; i >= 0; i--) {
+    printf(" %0.16llx", p->u64[i]);
+    if (i > 0 && i % 4 == 0) {
+      printf("\n");
+      if (banner) {
+        printf("%*s", (int)strlen((void *)banner), "");
+      }
+    }
+  }
+  printf("\n");
+}
+
+/*
+ * Print the low N single precision floats from p.
+ */
+
+void NOINLINE display_psf(const V512 *p, const char *banner, int n_elems) {
+  int i = 15;
+
+  if (banner) {
+    printf("%s", banner);
+  }
+
+  for (i = n_elems - 1; i >= 0; i--) {
+    printf(" %7g", p->f32[i]);
+    if (i > 0 && i % 4 == 0) {
+      printf("\n");
+      if (banner) {
+        printf("%*s", (int)strlen((void *)banner), "");
+      }
+    }
+  }
+  printf("\n");
+}
+
+/*
+ * Print the low N double precision floats from p.
+ */
+
+void NOINLINE display_pdf(const V512 *p, const char *banner, int n_elems) {
+  int i = 15;
+
+  if (banner) {
+    printf("%s", banner);
+  }
+
+  for (i = n_elems - 1; i >= 0; i--) {
+    printf(" %7g", p->f64[i]);
+    if (i > 0 && i % 4 == 0) {
+      printf("\n");
+      if (banner) {
+        printf("%*s", (int)strlen((void *)banner), "");
+      }
+    }
+  }
+  printf("\n");
+}
+
+/*
+ * Check that the low N 32-bit elements of "got" and "expected" are the same.
+ */
+int NOINLINE check_equal_nd(void *got, void *expected, int n_elems,
+                            char *banner, int line) {
+  int i, fail = 0;
+  V512 *v1 = (V512 *)got;
+  V512 *v2 = (V512 *)expected;
+
+  for (i = 0; i < n_elems; i++) {
+    if (v1->u32[i] != v2->u32[i]) {
+      printf("ERROR(%d): %s failed at %d'th element:  0x%0.8x != 0x%0.8x\n",
+             line, banner ? banner : "", i, v1->u32[i], v2->u32[i]);
+      display_pd(got, "got:", n_elems);
+      display_pd(expected, "exp:", n_elems);
+      n_errs++;
+      fail = 1;
+      break;
+    }
+  }
+  return fail;
+}
+
+/*
+ * Check that the low N 64-bit elements of "got" and "expected" are the same.
+ */
+int NOINLINE check_equal_nq(void *got, void *expected, int n_elems,
+                            char *banner, int line) {
+  int i, fail = 0;
+  V512 *v1 = (V512 *)got;
+  V512 *v2 = (V512 *)expected;
+
+  for (i = 0; i < n_elems; i++) {
+    if (v1->u64[i] != v2->u64[i]) {
+      printf(
+          "ERROR(%d): %s failed at %d'th element:  0x%0.16llx != 0x%0.16llx\n",
+          line, banner ? banner : "", i, v1->u64[i], v2->u64[i]);
+      display_pq(got, "got:", n_elems);
+      display_pq(expected, "exp:", n_elems);
+      n_errs++;
+      fail = 1;
+      break;
+    }
+  }
+  return fail;
+}
+
+double delta = 1e-4;
+
+#define EQUAL_FP(v1, v2)                                                       \
+  ((v1) < (v2) ? ((v2) - (v1) < delta) : ((v1) - (v2) < delta))
+
+/*
+ * Check that the low N single precision float elements of "got" and "expected"
+ * are the same.
+ */
+int NOINLINE check_equal_nsf(void *got, void *expected, int n_elems,
+                             char *banner, int line) {
+  int i, fail = 0;
+  V512 *v1 = (V512 *)got;
+  V512 *v2 = (V512 *)expected;
+
+  for (i = 0; i < n_elems; i++) {
+    if (!EQUAL_FP(v1->f32[i], v2->f32[i])) {
+      printf("ERROR(%d): %s failed at %d'th element:  %7g != %7g \n", line,
+             banner ? banner : "", i, v1->f32[i], v2->f32[i]);
+      display_psf(got, "got:", n_elems);
+      display_psf(expected, "exp:", n_elems);
+      n_errs++;
+      fail = 1;
+      break;
+    }
+  }
+  return fail;
+}
+
+/*
+ * Check that the low N double precision float elements of "got" and "expected"
+ * are the same.
+ */
+int NOINLINE check_equal_ndf(void *got, void *expected, int n_elems,
+                             char *banner, int line) {
+  int i, fail = 0;
+  V512 *v1 = (V512 *)got;
+  V512 *v2 = (V512 *)expected;
+
+  for (i = 0; i < n_elems; i++) {
+    if (!EQUAL_FP(v1->f64[i], v2->f64[i])) {
+      printf("ERROR(%d): %s failed at %d'th element:  %7g != %7g \n", line,
+             banner ? banner : "", i, v1->f64[i], v2->f64[i]);
+      display_pdf(got, "got:", n_elems);
+      display_pdf(expected, "exp:", n_elems);
+      n_errs++;
+      fail = 1;
+      break;
+    }
+  }
+  return fail;
+}
+
+#endif /* M512_TEST_UTIL_H_INCLUDED */
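
The header above is the whole contract a test needs: fill a V512 with known data, run an intrinsic into another V512, build the expected bit pattern lane by lane, and let check_equal_nd (or the _nq/_nsf/_ndf variants) report and count the first mismatch. A minimal standalone test in this style, assuming it is built with the same flags as the rest of this directory, could look like:

#include "m512_test_util.h"
#include <stdio.h>

int main(void) {
  V512 a, b, res, expected;
  int i;

  for (i = 0; i < 16; i++) {
    a.s32[i] = i;
    b.s32[i] = 100 - i;
    expected.s32[i] = a.s32[i] + b.s32[i]; /* scalar reference */
  }

  res.zmmi = _mm512_add_epi32(a.zmmi, b.zmmi); /* intrinsic under test */
  check_equal_nd(&res, &expected, 16, "_mm512_add_epi32", __LINE__);

  printf(n_errs ? "FAILED\n" : "PASSED\n");
  return n_errs ? 1 : 0;
}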
diff --git a/SingleSource/UnitTests/Vector/AVX512/maskz.c b/SingleSource/UnitTests/Vector/AVX512/maskz.c
new file mode 100644
index 0000000..97e4c58
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/maskz.c
@@ -0,0 +1,99 @@
+
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+/*
+ * Here we check for _mm512_maskz_[add|sub]_[round]_ps intrinsics.
+ */
+volatile int vol0 = 0;
+
+V512 i32;
+V512 i32_squares;
+V512 i64;
+V512 i64_squares;
+V512 f32;
+V512 f32_squares;
+V512 f32_halves;
+V512 f64;
+V512 f64_squares;
+V512 f64_halves;
+
+void NOINLINE init() {
+  volatile int i;
+
+  for (i = 0; i < 16; i++) {
+    i32.s32[i] = i;
+    i32_squares.s32[i] = i * i;
+    f32.f32[i] = i;
+    f32_squares.f32[i] = i * i;
+    f32_halves.f32[i] = i + 0.5f;
+  }
+
+  for (i = 0; i < 8; i++) {
+    i64.s64[i] = i;
+    i64_squares.s64[i] = i * i;
+    f64.f64[i] = i;
+    f64_squares.f64[i] = i * i;
+    f64_halves.f64[i] = i + 0.5;
+  }
+}
+
+/*
+ * Generate function do_"oper"_ps, which tests
+ * _mm512_maskz_oper_ps(__mmask16, __m512, __m512) and
+ * _mm512_maskz_oper_round_ps(__mmask16, __m512, __m512, int rounding)
+ */
+
+#define GEN_PS2_OROUND(oper)                                                   \
+  void NOINLINE do_##oper##_ps() {                                             \
+    V512 resm, resz;                                                           \
+    __mmask16 k;                                                               \
+                                                                               \
+    k = 0xbcdf;                                                                \
+    resm.zmm = _mm512_setzero_ps();                                            \
+    resm.zmm =                                                                 \
+        _mm512_mask_##oper##_ps(resm.zmm, k, f32_halves.zmm, f32_squares.zmm); \
+                                                                               \
+    /* Set resz to all 1's, use vol0 to make it stick. */                      \
+    resz.zmmi = _mm512_ternarylogic_epi32(i32.zmmi, i32.zmmi, i32.zmmi, 0xff); \
+    resz.xmm[vol0] = resz.xmm[vol0]; /* No-op. */                              \
+    resz.zmm = _mm512_maskz_##oper##_ps(k, f32_halves.zmm, f32_squares.zmm);   \
+    check_equal_nd(&resz, &resm, 16, "_mm512_maskz_" #oper "_ps", __LINE__);   \
+                                                                               \
+    /* Now with a rounding override. */                                        \
+                                                                               \
+    f32_squares.xmm[vol0] = f32_squares.xmm[vol0]; /* No-op. */                \
+    resm.zmm = _mm512_setzero_ps();                                            \
+    resm.zmm = _mm512_mask_##oper##_round_ps(                                  \
+        resm.zmm, k, f32_halves.zmm, f32_squares.zmm,                          \
+        _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);                            \
+    f32_squares.xmm[vol0] = f32_squares.xmm[vol0]; /* No-op. */                \
+                                                                               \
+    /* Set resz to all 1's, use vol0 to make it stick. */                      \
+    resz.zmmi = _mm512_ternarylogic_epi32(i32.zmmi, i32.zmmi, i32.zmmi, 0xff); \
+    resz.xmm[vol0] = resz.xmm[vol0]; /* No-op. */                              \
+                                                                               \
+    resz.zmm = _mm512_maskz_##oper##_round_ps(                                 \
+        k, f32_halves.zmm, f32_squares.zmm,                                    \
+        _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);                            \
+    check_equal_nd(&resz, &resm, 16, "_mm512_maskz_" #oper "_round_ps",        \
+                   __LINE__);                                                  \
+  }
+
+GEN_PS2_OROUND(sub)
+GEN_PS2_OROUND(add)
+
+int main(int argc, char *argv[]) {
+  init();
+
+  do_add_ps();
+  do_sub_ps();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/maskz.reference_output b/SingleSource/UnitTests/Vector/AVX512/maskz.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/maskz.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
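
maskz.c leans on the identity that zero-masking is merge-masking into an all-zero destination: resm starts from _mm512_setzero_ps and goes through _mm512_mask_add_ps (or _sub), so comparing it bit-for-bit with the _mm512_maskz_add_ps result checks exactly the zeroing behaviour (resz is pre-filled with all ones so that any lane the maskz form fails to clear shows up). In scalar form the zero-masking rule is roughly:

/* Sketch of AVX-512 zero-masking for a 16-lane float vector. */
void model_maskz_add_ps(float dst[16], unsigned short k,
                        const float a[16], const float b[16]) {
  int i;
  for (i = 0; i < 16; i++) {
    /* Selected lanes get the result; unselected lanes are forced to 0. */
    dst[i] = (k & (1 << i)) ? (a[i] + b[i]) : 0.0f;
  }
}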
diff --git a/SingleSource/UnitTests/Vector/AVX512/math.c b/SingleSource/UnitTests/Vector/AVX512/math.c
new file mode 100644
index 0000000..2affc18
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/math.c
@@ -0,0 +1,696 @@
+/*
+ * Test math instructions: sqrt, reciprocal, floor, ceil, exponent,
+ *                          scale, fixup, roundscale and ternary logic.
+ * Here we check for _mm512_[mask|maskz]_[ceil|floor|scalef|sqrt|ternarylogic]
+ * intrinsics.
+ */
+#include "m512_test_util.h"
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+volatile int vol0 = 0;
+
+V512 i32;
+V512 i32_squares;
+V512 i32_neg;
+V512 i64;
+V512 i64_squares;
+V512 i64_neg;
+V512 f32;
+V512 f32_squares;
+V512 f32_halves;
+V512 f64;
+V512 f64_squares;
+V512 f64_halves;
+
+void NOINLINE init() {
+  volatile int i;
+
+  for (i = 0; i < 16; i++) {
+    i32.s32[i] = i;
+    i32_squares.s32[i] = i * i;
+    i32_neg.s32[i] = -i;
+    f32.f32[i] = i;
+    f32_squares.f32[i] = i * i;
+    f32_halves.f32[i] = i + 0.5f;
+  }
+
+  for (i = 0; i < 8; i++) {
+    i64.s64[i] = i;
+    i64_squares.s64[i] = i * i;
+    i64_neg.s64[i] = -i;
+    f64.f64[i] = i;
+    f64_squares.f64[i] = i * i;
+    f64_halves.f64[i] = i + 0.5;
+  }
+}
+
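+/*
+ * Note: the rcp14, getmant, fixupimm and roundscale helpers below only
+ * exercise the intrinsics; their results are not compared against a
+ * reference value.
+ */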
+void NOINLINE do_rcp14pd() {
+  volatile V512 res;
+  V512 expected;
+  __mmask8 k = 0xc3;
+
+  res.zmmd = _mm512_rcp14_pd(f64.zmmd);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmd = _mm512_mask_rcp14_pd(res.zmmd, k, f64.zmmd);
+}
+
+void NOINLINE do_rcp14ps() {
+  volatile V512 res;
+  V512 expected;
+  __mmask16 k = 0x7e95;
+
+  res.zmm = _mm512_rcp14_ps(f32.zmm);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmm = _mm512_mask_rcp14_ps(res.zmm, k, f32.zmm);
+}
+
+void NOINLINE do_sqrtps() {
+  V512 res;
+  V512 expected;
+  __mmask16 k;
+  volatile int i;
+
+  res.zmm = _mm512_sqrt_ps(f32_squares.zmm);
+  for (i = 0; i < 16; i++) {
+    expected.f32[i] = i;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_sqrt_ps", __LINE__);
+
+  f32_squares.xmm[vol0] = f32_squares.xmm[vol0]; /* No-op. */
+
+  k = 0xbcdf;
+  res.zmm = _mm512_setzero_ps();
+  res.zmm = _mm512_mask_sqrt_ps(res.zmm, k, f32_squares.zmm);
+  expected.zmm = _mm512_setzero_ps();
+  for (i = 0; i < 16; i++) {
+    if (k & (1 << i)) {
+      expected.f32[i] = i;
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_sqrt_ps", __LINE__);
+}
+
+void NOINLINE do_sqrtpd() {
+  V512 res;
+  V512 expected;
+  __mmask8 k;
+  volatile int i;
+
+  res.zmmd = _mm512_sqrt_pd(f64_squares.zmmd);
+  for (i = 0; i < 8; i++) {
+    expected.f64[i] = i;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_sqrt_pd", __LINE__);
+
+  f64_squares.xmmd[vol0] = f64_squares.xmmd[vol0]; /* No-op. */
+
+  k = 0xe9;
+  res.zmmd = _mm512_setzero_pd();
+  res.zmmd = _mm512_mask_sqrt_pd(res.zmmd, k, f64_squares.zmmd);
+  expected.zmmd = _mm512_setzero_pd();
+  for (i = 0; i < 8; i++) {
+    if (k & (1 << i)) {
+      expected.f64[i] = i;
+    }
+  }
+
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_sqrt_pd", __LINE__);
+}
+
+void NOINLINE do_floorps() {
+  V512 res;
+  V512 expected;
+  __mmask16 k;
+  volatile int i;
+
+  res.zmm = _mm512_floor_ps(f32_halves.zmm);
+  for (i = 0; i < 16; i++) {
+    expected.f32[i] = i;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_floor_ps", __LINE__);
+
+  f32_halves.xmm[vol0] = f32_halves.xmm[vol0]; /* No-op. */
+
+  k = 0xbcdf;
+  res.zmm = _mm512_setzero_ps();
+  res.zmm = _mm512_mask_floor_ps(res.zmm, k, f32_halves.zmm);
+  expected.zmm = _mm512_setzero_ps();
+  for (i = 0; i < 16; i++) {
+    if (k & (1 << i)) {
+      expected.f32[i] = i;
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_floor_ps", __LINE__);
+}
+
+void NOINLINE do_floorpd() {
+  V512 res;
+  V512 expected;
+  __mmask8 k;
+  volatile int i;
+
+  res.zmmd = _mm512_floor_pd(f64_halves.zmmd);
+  for (i = 0; i < 8; i++) {
+    expected.f64[i] = i;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_floor_pd", __LINE__);
+
+  f64_halves.xmmd[vol0] = f64_halves.xmmd[vol0]; /* No-op. */
+
+  k = 0x7b;
+  res.zmmd = _mm512_setzero_pd();
+  res.zmmd = _mm512_mask_floor_pd(res.zmmd, k, f64_halves.zmmd);
+  expected.zmmd = _mm512_setzero_pd();
+  for (i = 0; i < 8; i++) {
+    if (k & (1 << i)) {
+      expected.f64[i] = i;
+    }
+  }
+
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_floor_pd", __LINE__);
+}
+
+void NOINLINE do_ceilps() {
+  V512 res;
+  V512 expected;
+  __mmask16 k;
+  volatile int i;
+
+  res.zmm = _mm512_ceil_ps(f32_halves.zmm);
+  for (i = 0; i < 16; i++) {
+    expected.f32[i] = i + 1;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_ceil_ps", __LINE__);
+
+  f32_halves.xmm[vol0] = f32_halves.xmm[vol0]; /* No-op. */
+
+  k = 0xbcdf;
+  res.zmm = _mm512_setzero_ps();
+  res.zmm = _mm512_mask_ceil_ps(res.zmm, k, f32_halves.zmm);
+  expected.zmm = _mm512_setzero_ps();
+  for (i = 0; i < 16; i++) {
+    if (k & (1 << i)) {
+      expected.f32[i] = i + 1;
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_ceil_ps", __LINE__);
+}
+
+void NOINLINE do_ceilpd() {
+  V512 res;
+  V512 expected;
+  __mmask8 k;
+  volatile int i;
+
+  res.zmmd = _mm512_ceil_pd(f64_halves.zmmd);
+  for (i = 0; i < 8; i++) {
+    expected.f64[i] = i + 1;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_ceil_pd", __LINE__);
+
+  f64_halves.xmmd[vol0] = f64_halves.xmmd[vol0]; /* No-op. */
+
+  k = 0x7b;
+  res.zmmd = _mm512_setzero_pd();
+  res.zmmd = _mm512_mask_ceil_pd(res.zmmd, k, f64_halves.zmmd);
+  expected.zmmd = _mm512_setzero_pd();
+  for (i = 0; i < 8; i++) {
+    if (k & (1 << i)) {
+      expected.f64[i] = i + 1;
+    }
+  }
+
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_ceil_pd", __LINE__);
+}
+
+void NOINLINE do_getexpsd() {
+  __mmask8 k8 = 0x2;
+  volatile __m128d res;
+  volatile __m128d v1 = _mm_set_pd(8.0, 32.0);
+  volatile __m128d v2 = _mm_set_pd(16.0, 64.0);
+
+  __m128d res_exp_nomask = _mm_set_pd(8.0, 6.0);
+  __m128d res_exp_mask = _mm_set_pd(8.0, 32.0);
+  __m128d res_exp_maskz = _mm_set_pd(8.0, 0.0);
+
+  res = _mm_setzero_pd();
+  res = _mm_getexp_sd(v1, v2);
+
+  check_equal_ndf((void *)&res, (void *)&res_exp_nomask, 2, "_mm_getexp_sd",
+                  __LINE__);
+
+  res = _mm_setzero_pd();
+  res = _mm_mask_getexp_sd(v1, k8, v1, v2);
+  check_equal_ndf((void *)&res, (void *)&res_exp_mask, 2, "_mm_mask_getexp_sd",
+                  __LINE__);
+
+  res = _mm_setzero_pd();
+  res = _mm_maskz_getexp_sd(k8, v1, v2);
+  check_equal_ndf((void *)&res, (void *)&res_exp_maskz, 2,
+                  "_mm_maskz_getexp_sd", __LINE__);
+}
+
+void NOINLINE do_getexpss() {
+  __mmask8 k8 = 0xe;
+  volatile __m128 res;
+  volatile __m128 v1 = _mm_set_ps(16.0f, 32.0f, 64.0f, 128.0f);
+  volatile __m128 v2 = _mm_set_ps(128.0f, 256.0f, 512.0f, 1024.0f);
+
+  volatile __m128 res_exp_nomask = _mm_set_ps(16.0f, 32.0f, 64.0f, 10.0f);
+  volatile __m128 res_exp_mask = _mm_set_ps(16.0f, 32.0f, 64.0f, 128.0f);
+  volatile __m128 res_exp_maskz = _mm_set_ps(16.0f, 32.0f, 64.0f, 0.0f);
+
+  res = _mm_setzero_ps();
+  res = _mm_getexp_ss(v1, v2);
+  check_equal_nsf((void *)&res, (void *)&res_exp_nomask, 4, "_mm_getexp_ss",
+                  __LINE__);
+
+  res = _mm_setzero_ps();
+  res = _mm_mask_getexp_ss(v1, k8, v1, v2);
+  check_equal_nsf((void *)&res, (void *)&res_exp_mask, 4, "_mm_mask_getexp_ss",
+                  __LINE__);
+
+  res = _mm_setzero_ps();
+  res = _mm_maskz_getexp_ss(k8, v1, v2);
+  check_equal_nsf((void *)&res, (void *)&res_exp_maskz, 4,
+                  "_mm_maskz_getexp_ss", __LINE__);
+}
+
+void NOINLINE do_getmantpd() {
+  volatile V512 res;
+  V512 expected;
+  __mmask8 k = 0x75;
+
+  res.zmmd =
+      _mm512_getmant_pd(f64.zmmd, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmd = _mm512_mask_getmant_pd(res.zmmd, k, f64.zmmd, _MM_MANT_NORM_p5_2,
+                                    _MM_MANT_SIGN_zero);
+}
+
+void NOINLINE do_getmantps() {
+  volatile V512 res;
+  V512 expected;
+  __mmask16 k = 0x7e95;
+
+  res.zmm = _mm512_getmant_ps(f32.zmm, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmm = _mm512_mask_getmant_ps(res.zmm, k, f32.zmm, _MM_MANT_NORM_p5_2,
+                                   _MM_MANT_SIGN_zero);
+}
+
+#define CHECK_SCALEFPD(n_elems, dest, mask, zeroing, name)                     \
+  {                                                                            \
+    volatile int i;                                                            \
+    for (i = 0; i < n_elems; i++) {                                            \
+      expected.f64[i] = f64.f64[i] * (pow(2.0, floor(f64_squares.f64[i])));    \
+      if ((mask & (1 << i)) == 0) {                                            \
+        if (zeroing) {                                                         \
+          expected.f64[i] = 0.0;                                               \
+        } else {                                                               \
+          expected.f64[i] = dest.f64[i];                                       \
+        }                                                                      \
+      }                                                                        \
+    }                                                                          \
+    check_equal_nd(&res, &expected, n_elems * 2, name, __LINE__);              \
+    f64.xmmd[vol0] = f64.xmmd[vol0];                                           \
+  }
+
+void NOINLINE do_scalefpd() {
+  V512 res;
+  V512 expected;
+  __mmask8 k = 0xFF;
+
+  res.zmmd = _mm512_scalef_round_pd(f64.zmmd, f64_squares.zmmd,
+                                    _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  CHECK_SCALEFPD(8, f64_halves, k, 0, "_mm512_scalef_round_pd");
+
+  res.zmmd = _mm512_scalef_pd(f64.zmmd, f64_squares.zmmd);
+  CHECK_SCALEFPD(8, f64_halves, k, 0, "_mm512_scalef_pd");
+
+  k = 0x75;
+
+  res.zmmd = _mm512_mask_scalef_round_pd(
+      f64_halves.zmmd, k, f64.zmmd, f64_squares.zmmd,
+      _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  CHECK_SCALEFPD(8, f64_halves, k, 0, "_mm512_mask_scalef_round_pd");
+
+  res.zmmd =
+      _mm512_mask_scalef_pd(f64_halves.zmmd, k, f64.zmmd, f64_squares.zmmd);
+  CHECK_SCALEFPD(8, f64_halves, k, 0, "_mm512_mask_scalef_pd");
+
+  k = 0x57;
+
+  res.zmmd = _mm512_maskz_scalef_round_pd(
+      k, f64.zmmd, f64_squares.zmmd, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  CHECK_SCALEFPD(8, f64_halves, k, 1, "_mm512_maskz_scalef_round_pd");
+
+  res.zmmd = _mm512_maskz_scalef_pd(k, f64.zmmd, f64_squares.zmmd);
+  CHECK_SCALEFPD(8, f64_halves, k, 1, "_mm512_maskz_scalef_pd");
+}
+
+#define CHECK_SCALEFPS(n_elems, dest, mask, zeroing, name)                     \
+  {                                                                            \
+    volatile int i;                                                            \
+    for (i = 0; i < n_elems; i++) {                                            \
+      expected.f32[i] = f32.f32[i] * (powf(2.0F, floorf(f32_squares.f32[i]))); \
+      if ((mask & (1 << i)) == 0) {                                            \
+        if (zeroing) {                                                         \
+          expected.f32[i] = 0.0F;                                              \
+        } else {                                                               \
+          expected.f32[i] = dest.f32[i];                                       \
+        }                                                                      \
+      }                                                                        \
+    }                                                                          \
+    check_equal_nd(&res, &expected, n_elems, name, __LINE__);                  \
+    f32.xmm[vol0] = f32.xmm[vol0];                                             \
+  }
+
+void NOINLINE do_scalefps() {
+  V512 res;
+  V512 expected;
+  __mmask16 k = 0xFFFF;
+
+  res.zmm = _mm512_scalef_round_ps(f32.zmm, f32_squares.zmm,
+                                   _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+  CHECK_SCALEFPS(16, f32_halves, k, 0, "_mm512_scalef_round_ps");
+
+  res.zmm = _mm512_scalef_ps(f32.zmm, f32_squares.zmm);
+  CHECK_SCALEFPS(16, f32_halves, k, 0, "_mm512_scalef_ps");
+
+  k = 0x0bcd;
+
+  res.zmm =
+      _mm512_mask_scalef_round_ps(f32_halves.zmm, k, f32.zmm, f32_squares.zmm,
+                                  _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  CHECK_SCALEFPS(16, f32_halves, k, 0, "_mm512_mask_scalef_round_ps");
+
+  res.zmm = _mm512_mask_scalef_ps(f32_halves.zmm, k, f32.zmm, f32_squares.zmm);
+  CHECK_SCALEFPS(16, f32_halves, k, 0, "_mm512_mask_scalef_ps");
+
+  k = 0x0dcb;
+
+  res.zmm = _mm512_maskz_scalef_round_ps(
+      k, f32.zmm, f32_squares.zmm, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  CHECK_SCALEFPS(16, f32_halves, k, 1, "_mm512_maskz_scalef_round_ps");
+
+  res.zmm = _mm512_maskz_scalef_ps(k, f32.zmm, f32_squares.zmm);
+  CHECK_SCALEFPS(16, f32_halves, k, 1, "_mm512_maskz_scalef_ps");
+}
+
+#define SOME_ROUND (_MM_FROUND_CUR_DIRECTION)
+
+void NOINLINE do_fixupimmpd() {
+  volatile V512 res;
+  V512 expected;
+  __mmask8 k = 0x75;
+
+  res.zmmd = _mm512_fixupimm_pd(f64.zmmd, f64_squares.zmmd, i32.zmmi, 0x97);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmd = _mm512_mask_fixupimm_pd(res.zmmd, k, f64.zmmd, i32.zmmi, 0xfe);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmd = _mm512_maskz_fixupimm_pd(k, res.zmmd, f64.zmmd, i32.zmmi, 0xfe);
+
+  res.zmmd = _mm512_fixupimm_round_pd(f64.zmmd, f64_squares.zmmd, i32.zmmi,
+                                      0x97, SOME_ROUND);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmd = _mm512_mask_fixupimm_round_pd(res.zmmd, k, f64.zmmd, i32.zmmi,
+                                           0xfe, SOME_ROUND);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmd = _mm512_maskz_fixupimm_round_pd(k, res.zmmd, f64.zmmd, i32.zmmi,
+                                            0xfe, SOME_ROUND);
+}
+
+void NOINLINE do_fixupimmps() {
+  volatile V512 res;
+  V512 expected;
+  __mmask16 k = 0x75;
+
+  res.zmm = _mm512_fixupimm_ps(f32.zmm, f32_squares.zmm, i32.zmmi, 0x97);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmm = _mm512_mask_fixupimm_ps(res.zmm, k, f32.zmm, i32.zmmi, 0xfe);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmm = _mm512_maskz_fixupimm_ps(k, res.zmm, f32.zmm, i32.zmmi, 0xfe);
+
+  res.zmm = _mm512_fixupimm_round_ps(f32.zmm, f32_squares.zmm, i32.zmmi, 0x97,
+                                     SOME_ROUND);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmm = _mm512_mask_fixupimm_round_ps(res.zmm, k, f32.zmm, i32.zmmi, 0xfe,
+                                          SOME_ROUND);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmm = _mm512_maskz_fixupimm_round_ps(k, res.zmm, f32.zmm, i32.zmmi, 0xfe,
+                                           SOME_ROUND);
+}
+
+void NOINLINE do_fixupimmsd() {
+  volatile V512 res;
+  V512 expected;
+
+  __mmask8 k = 0x75;
+
+  res.xmmd[0] =
+      _mm_fixupimm_sd(f64.xmmd[0], f64_squares.xmmd[0], i32.xmmi[0], 0x97);
+
+  res.xmmi[0] = _mm_setzero_si128();
+  res.xmmd[0] =
+      _mm_mask_fixupimm_sd(res.xmmd[0], k, f64.xmmd[0], i32.xmmi[0], 0xfe);
+
+  res.xmmi[0] = _mm_setzero_si128();
+  res.xmmd[0] =
+      _mm_maskz_fixupimm_sd(k, res.xmmd[0], f64.xmmd[0], i32.xmmi[0], 0xfe);
+
+  res.xmmd[0] = _mm_fixupimm_round_sd(f64.xmmd[0], f64_squares.xmmd[0],
+                                      i32.xmmi[0], 0x97, SOME_ROUND);
+
+  res.xmmi[0] = _mm_setzero_si128();
+  res.xmmd[0] = _mm_mask_fixupimm_round_sd(res.xmmd[0], k, f64.xmmd[0],
+                                           i32.xmmi[0], 0xfe, SOME_ROUND);
+
+  res.xmmi[0] = _mm_setzero_si128();
+  res.xmmd[0] = _mm_maskz_fixupimm_round_sd(k, res.xmmd[0], f64.xmmd[0],
+                                            i32.xmmi[0], 0xfe, SOME_ROUND);
+}
+
+void NOINLINE do_fixupimmss() {
+  volatile V512 res;
+  V512 expected;
+  __mmask8 k = 0x75;
+
+  res.xmm[0] =
+      _mm_fixupimm_ss(f32.xmm[0], f32_squares.xmm[0], i32.xmmi[0], 0x97);
+
+  res.xmmi[0] = _mm_setzero_si128();
+  res.xmm[0] =
+      _mm_mask_fixupimm_ss(res.xmm[0], k, f32.xmm[0], i32.xmmi[0], 0xfe);
+
+  res.xmmi[0] = _mm_setzero_si128();
+  res.xmm[0] =
+      _mm_maskz_fixupimm_ss(k, res.xmm[0], f32.xmm[0], i32.xmmi[0], 0xfe);
+
+  res.xmm[0] = _mm_fixupimm_round_ss(f32.xmm[0], f32_squares.xmm[0],
+                                     i32.xmmi[0], 0x97, SOME_ROUND);
+
+  res.xmmi[0] = _mm_setzero_si128();
+  res.xmm[0] = _mm_mask_fixupimm_round_ss(res.xmm[0], k, f32.xmm[0],
+                                          i32.xmmi[0], 0xfe, SOME_ROUND);
+
+  res.xmmi[0] = _mm_setzero_si128();
+  res.xmm[0] = _mm_maskz_fixupimm_round_ss(k, res.xmm[0], f32.xmm[0],
+                                           i32.xmmi[0], 0xfe, SOME_ROUND);
+}
+
+void NOINLINE do_roundscalepd() {
+  volatile V512 res;
+  V512 expected;
+  __mmask16 k = 0x3d;
+
+  res.zmmd = _mm512_roundscale_pd(f64.zmmd, 0xff);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmd = _mm512_mask_roundscale_pd(res.zmmd, k, f64.zmmd, 0x36);
+}
+
+void NOINLINE do_roundscaleps() {
+  volatile V512 res;
+  V512 expected;
+  __mmask16 k = 0x74cb;
+
+  res.zmm = _mm512_roundscale_ps(f32.zmm, 0xf7);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmm = _mm512_mask_roundscale_ps(res.zmm, k, f32.zmm, 0x36);
+}
+
+static int NOINLINE emulate_ternarylogicd(int a, int b, int c, int imm) {
+  int i, index, res = 0;
+
+  for (i = 0; i < 32; i++) {
+    index = ((a & 1) << 2) | ((b & 1) << 1) | (c & 1);
+    res |= ((imm & (1 << index)) ? 1 : 0) << i;
+    a >>= 1;
+    b >>= 1;
+    c >>= 1;
+  }
+
+  return res;
+}
+
+void NOINLINE do_pternlogq() {
+  volatile int i;
+  V512 res, resx, resy;
+  V512 expected;
+  __mmask8 k8 = 0x75;
+
+  res.zmmi =
+      _mm512_ternarylogic_epi64(i64.zmmi, i64_squares.zmmi, i64.zmmi, 0x79);
+  for (i = 0; i < 16; i++) {
+    expected.s32[i] =
+        emulate_ternarylogicd(i64.s32[i], i64_squares.s32[i], i64.s32[i], 0x79);
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_ternarylogic_epi64", __LINE__);
+
+  i64.xmm[vol0] = i64.xmm[vol0]; /* No-op. */
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_ternarylogic_epi64(res.zmmi, k8, i64_neg.zmmi,
+                                            i64.zmmi, 0xca);
+  for (i = 0; i < 16; i += 2) {
+    if (k8 & (1 << (i / 2))) {
+      expected.s32[i] =
+          emulate_ternarylogicd(0, i64_neg.s32[i], i64.s32[i], 0xca);
+      expected.s32[i + 1] =
+          emulate_ternarylogicd(0, i64_neg.s32[i + 1], i64.s32[i + 1], 0xca);
+    } else {
+      expected.s32[i] = 0;
+      expected.s32[i + 1] = 0;
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_ternarylogic_epi64",
+                 __LINE__);
+
+  i64.xmm[vol0] = i64.xmm[vol0]; /* No-op. */
+
+  res.zmmi = _mm512_maskz_ternarylogic_epi64(k8, i64_squares.zmmi,
+                                             i64_squares.zmmi, i64.zmmi, 0x3b);
+  for (i = 0; i < 16; i += 2) {
+    if (k8 & (1 << (i / 2))) {
+      expected.s32[i] = emulate_ternarylogicd(
+          i64_squares.s32[i], i64_squares.s32[i], i64.s32[i], 0x3b);
+      expected.s32[i + 1] = emulate_ternarylogicd(
+          i64_squares.s32[i + 1], i64_squares.s32[i + 1], i64.s32[i + 1], 0x3b);
+    } else {
+      expected.s32[i] = 0;
+      expected.s32[i + 1] = 0;
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_maskz_ternarylogic_epi64",
+                 __LINE__);
+}
+
+void NOINLINE do_pternlogd() {
+  volatile int i;
+  V512 res, resx, resy;
+  V512 expected;
+  __mmask16 k = 0x23bc;
+  __mmask8 k8 = (__mmask8)k;
+
+  res.zmmi =
+      _mm512_ternarylogic_epi32(i32.zmmi, i32_squares.zmmi, i32.zmmi, 0x97);
+  for (i = 0; i < 16; i++) {
+    expected.s32[i] =
+        emulate_ternarylogicd(i32.s32[i], i32_squares.s32[i], i32.s32[i], 0x97);
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_ternarylogic_epi32", __LINE__);
+
+  i32.xmm[vol0] = i32.xmm[vol0]; /* No-op. */
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_ternarylogic_epi32(res.zmmi, k, i32_squares.zmmi,
+                                            i32.zmmi, 0xfe);
+  for (i = 0; i < 16; i++) {
+    if (k & (1 << i)) {
+      expected.s32[i] =
+          emulate_ternarylogicd(0, i32_squares.s32[i], i32.s32[i], 0xfe);
+    } else {
+      expected.s32[i] = 0;
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_ternarylogic_epi32",
+                 __LINE__);
+
+  i32.xmm[vol0] = i32.xmm[vol0]; /* No-op. */
+
+  k = 0xabcd;
+  k8 = (__mmask8)k;
+  res.zmmi = _mm512_maskz_ternarylogic_epi32(k, i32_squares.zmmi, i32_neg.zmmi,
+                                             i32.zmmi, 0x3b);
+  for (i = 0; i < 16; i++) {
+    if (k & (1 << i)) {
+      expected.s32[i] = emulate_ternarylogicd(i32_squares.s32[i],
+                                              i32_neg.s32[i], i32.s32[i], 0x3b);
+    } else {
+      expected.s32[i] = 0;
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_maskz_ternarylogic_epi32",
+                 __LINE__);
+}
+
+int main(int argc, char *argv[]) {
+  init();
+
+  do_rcp14pd();
+  do_rcp14ps();
+
+  do_sqrtps();
+  do_sqrtpd();
+
+  do_floorps();
+  do_floorpd();
+
+  do_ceilps();
+  do_ceilpd();
+
+  do_getexpsd();
+  do_getexpss();
+
+  do_getmantpd();
+  do_getmantps();
+
+  do_scalefpd();
+  do_scalefps();
+
+  do_fixupimmpd();
+  do_fixupimmps();
+
+  do_fixupimmsd();
+  do_fixupimmss();
+
+  do_roundscalepd();
+  do_roundscaleps();
+
+  do_pternlogq();
+  do_pternlogd();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/math.reference_output b/SingleSource/UnitTests/Vector/AVX512/math.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/math.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
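
The most intricate part of math.c is the ternary-logic emulation: the 8-bit immediate is a truth table indexed by the bit triple (a, b, c), which is exactly what emulate_ternarylogicd computes bit by bit. For example, imm 0xca (binary 11001010) has table entries 1,1,0,0,1,0,1,0 for indices 7..0; whenever a = 1 (indices 4-7) the entry equals b, and whenever a = 0 (indices 0-3) it equals c, so 0xca is the bitwise select "a ? b : c" used in the masked epi64 case. A compact equivalent for 32-bit operands is:

/* For imm 0xca, the ternary-logic result is the bitwise select a ? b : c,
   i.e. the same value emulate_ternarylogicd(a, b, c, 0xca) produces. */
unsigned int select_bits(unsigned int a, unsigned int b, unsigned int c) {
  return (a & b) | (~a & c);
}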
diff --git a/SingleSource/UnitTests/Vector/AVX512/minmax_int64.c b/SingleSource/UnitTests/Vector/AVX512/minmax_int64.c
new file mode 100644
index 0000000..37319e0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/minmax_int64.c
@@ -0,0 +1,135 @@
+/*
+ * Test the min/max int64 family of intrinsics.
+ * Here we check for _mm512_[mask|maskz]_[min|max]_ep[i|u]64 intrinsics.
+ */
+
+#include "m512_test_util.h"
+#include <stdio.h>
+
+V512 i64, i64_2;
+
+void NOINLINE init() {
+  volatile int i;
+  for (i = 0; i < 8; i++) {
+    if (i % 2) {
+      i64.s64[i] = (i + 1) * 10000;
+      i64_2.s64[i] = -(i + 1) * 1000;
+    } else {
+      i64.s64[i] = -(i + 1) * 1000;
+      i64_2.s64[i] = (i + 1) * 10000;
+    }
+  }
+}
+
+void NOINLINE do_512_max_epi64() {
+  V512 res;
+  V512 expected;
+  volatile int i;
+  __mmask8 k = 0xdb; /* 11011011 */
+
+  /* scalar calculation */
+  for (i = 0; i < 8; i++) {
+    expected.s64[i] = (i64.s64[i] > i64_2.s64[i]) ? i64.s64[i] : i64_2.s64[i];
+  }
+  /* intrinsic calculation */
+  res.zmmi = _mm512_max_epi64(i64.zmmi, i64_2.zmmi);
+  check_equal_nq(&res, &expected, 8, "_mm512_max_epi64", __LINE__);
+
+  /* scalar mask */
+  expected.s64[2] = 0;
+  expected.s64[5] = 0;
+
+  /* masked intrinsic calculation */
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_max_epi64(res.zmmi, k, i64.zmmi, i64_2.zmmi);
+  check_equal_nq(&res, &expected, 8, "_mm512_mask_max_epi64", __LINE__);
+}
+
+void NOINLINE do_512_max_epu64() {
+  V512 res;
+  V512 expected;
+  volatile int i;
+  __mmask8 k = 0xdb; /* 11011011 */
+
+  /* scalar calculation */
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = (i64.u64[i] > i64_2.u64[i]) ? i64.u64[i] : i64_2.u64[i];
+  }
+  /* intrinsic calculation */
+  res.zmmi = _mm512_max_epu64(i64.zmmi, i64_2.zmmi);
+  check_equal_nq(&res, &expected, 8, "_mm512_max_epu64", __LINE__);
+
+  /* scalar mask */
+  expected.u64[2] = 0;
+  expected.u64[5] = 0;
+
+  /* masked intrinsic calculation */
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_max_epu64(res.zmmi, k, i64.zmmi, i64_2.zmmi);
+  check_equal_nq(&res, &expected, 8, "_mm512_mask_max_epu64", __LINE__);
+}
+
+void NOINLINE do_512_min_epi64() {
+  V512 res;
+  V512 expected;
+  volatile int i;
+  __mmask8 k = 0xdb; /* 11011011 */
+
+  /* scalar calculation */
+  for (i = 0; i < 8; i++) {
+    expected.s64[i] = (i64.s64[i] < i64_2.s64[i]) ? i64.s64[i] : i64_2.s64[i];
+  }
+  /* intrinsic calculation */
+  res.zmmi = _mm512_min_epi64(i64.zmmi, i64_2.zmmi);
+  check_equal_nq(&res, &expected, 8, "_mm512_min_epi64", __LINE__);
+
+  /* scalar mask */
+  expected.s64[2] = 0;
+  expected.s64[5] = 0;
+
+  /* masked intrinsic calculation */
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_min_epi64(res.zmmi, k, i64.zmmi, i64_2.zmmi);
+  check_equal_nq(&res, &expected, 8, "_mm512_mask_min_epi64", __LINE__);
+}
+
+void NOINLINE do_512_min_epu64() {
+  V512 res;
+  V512 expected;
+  volatile int i;
+  __mmask8 k = 0xdb; /* 11011011 */
+
+  /* scalar calculation */
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = (i64.u64[i] < i64_2.u64[i]) ? i64.u64[i] : i64_2.u64[i];
+  }
+  /* intrinsic calculation */
+  res.zmmi = _mm512_min_epu64(i64.zmmi, i64_2.zmmi);
+  check_equal_nq(&res, &expected, 8, "_mm512_min_epu64", __LINE__);
+
+  /* scalar mask */
+  expected.u64[2] = 0;
+  expected.u64[5] = 0;
+
+  /* masked intrinsic calculation */
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_min_epu64(res.zmmi, k, i64.zmmi, i64_2.zmmi);
+  check_equal_nq(&res, &expected, 8, "_mm512_mask_min_epu64", __LINE__);
+}
+
+int main(int argc, char *argv[]) {
+  init();
+
+  do_512_max_epi64();
+  do_512_max_epu64();
+  do_512_min_epi64();
+  do_512_min_epu64();
+
+  if (n_errs) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/minmax_int64.reference_output b/SingleSource/UnitTests/Vector/AVX512/minmax_int64.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/minmax_int64.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
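
minmax_int64.c pairs each signed test with an unsigned one because the two differ exactly on lanes holding negative values: reinterpreted as unsigned 64-bit integers, negative numbers become very large, so _mm512_max_epu64 keeps the bit pattern of the negative operand where _mm512_max_epi64 keeps the positive one. Lane 0 of the test data (-1000 vs 10000) shows the difference; a small standalone check of the scalar reference logic:

#include <stdio.h>

int main(void) {
  long long sa = -1000, sb = 10000; /* lane 0 of i64 and i64_2 */
  unsigned long long ua = (unsigned long long)sa;
  unsigned long long ub = (unsigned long long)sb;

  /* Signed compare keeps 10000; unsigned compare keeps 0xfffffffffffffc18,
     the two's-complement encoding of -1000. */
  printf("max_epi64 lane 0: %lld\n", sa > sb ? sa : sb);
  printf("max_epu64 lane 0: 0x%llx\n", ua > ub ? ua : ub);
  return 0;
}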
diff --git a/SingleSource/UnitTests/Vector/AVX512/minmax_shift.c b/SingleSource/UnitTests/Vector/AVX512/minmax_shift.c
new file mode 100644
index 0000000..6fe78dd
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/minmax_shift.c
@@ -0,0 +1,1157 @@
+/*
+ * Test min, max and shift instructions
+ * This test was created to check the correctness
+ * of the following intrinsics support:
+ *      _mm512_[mask|maskz]_[abs|min|max]
+ *      _mm512_[mask|maskz]_[sll|slli|srai|srli|sra|srl]_*
+ *      _mm512_[mask|maskz]_cvtep*_ep*
+ *      _mm256_cvtepi16_epi64
+ *      _mm256_sll
+ *      _mm_cvtepi16_epi64
+ *      _mm_sll
+ */
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+
+V512 i8;
+V512 i8_mix;
+V512 i8_big;
+V512 i16;
+V512 i16_mix;
+V512 i16_big;
+V512 i32;
+V512 i32_mix;
+V512 i32_big;
+V512 i64;
+V512 i64_mix;
+V512 i64_big;
+volatile int vol0 = 0;
+/*
+ * Use this between tests to make compiler think src was updated.
+ * Prevents PRE'ing of a load of src, thus allowing ciscization.
+ */
+#define soft_update(src) (src).xmmi[vol0] = (src).xmmi[vol0]
+
+void NOINLINE init() {
+  volatile int i;
+
+  for (i = 0; i < 64; i++) {
+    i8.s8[i] = i;
+    i8_mix.s8[i] = (i & 1) ? i : -i;
+    i8_big.s8[i] = 1000 * (i + 1);
+    if ((i & 1) != 0) {
+      i8_big.s8[i] = -i8_big.s8[i];
+    }
+  }
+
+  for (i = 0; i < 32; i++) {
+    i16.s16[i] = i;
+    i16_mix.s16[i] = (i & 1) ? i : -i;
+    i16_big.s16[i] = 1000 * (i + 1);
+    if ((i & 1) != 0) {
+      i16_big.s16[i] = -i16_big.s16[i];
+    }
+  }
+
+  for (i = 0; i < 16; i++) {
+    i32.s32[i] = i;
+    i32_mix.s32[i] = (i & 1) ? i : -i;
+    i32_big.s32[i] = 1000 * (i + 1);
+    if ((i & 1) != 0) {
+      i32_big.s32[i] = -i32_big.s32[i];
+    }
+  }
+
+  for (i = 0; i < 8; i++) {
+    i64.s64[i] = i;
+    i64_mix.s64[i] = (i & 1) ? i : -i;
+    i64_big.s64[i] = 1000 * (i + 1);
+    if ((i & 1) != 0) {
+      i64_big.s64[i] = -i64_big.s64[i];
+    }
+  }
+}
+
+void NOINLINE do_absd() {
+  V512 res;
+  V512 expected;
+  __mmask16 k;
+
+  res.zmmi = _mm512_abs_epi32(i32_mix.zmmi);
+  check_equal_nd(&res, &i32, 16, "_mm512_abs_epi32", __LINE__);
+
+  k = 0x1234;
+  res.zmmi = _mm512_mask_abs_epi32(_mm512_setzero_epi32(), k, i32_mix.zmmi);
+  expected.zmmi = _mm512_mask_mov_epi32(_mm512_setzero_epi32(), k, i32.zmmi);
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_abs_epi32", __LINE__);
+}
+
+void NOINLINE do_absq() {
+  V512 res;
+  V512 expected;
+  __mmask8 k;
+
+  res.zmmi = _mm512_abs_epi64(i64_mix.zmmi);
+  check_equal_nd(&res, &i64, 16, "_mm512_abs_epi64", __LINE__);
+
+  k = 0x73;
+  res.zmmi = _mm512_mask_abs_epi64(_mm512_setzero_epi32(), k, i64_mix.zmmi);
+  expected.zmmi = _mm512_mask_mov_epi64(_mm512_setzero_epi32(), k, i64.zmmi);
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_abs_epi64", __LINE__);
+}
+
+void NOINLINE do_movsxwq() {
+  V512 xres, yres, zres;
+  V512 expected, expected_save;
+  volatile int i;
+  __mmask8 k8 = 0xe7;
+
+  /* Non-masked. */
+
+  zres.zmmi = _mm512_cvtepi16_epi64(i16_mix.xmmi[0]);
+  yres.ymmi[0] = _mm256_cvtepi16_epi64(i16_mix.xmmi[0]);
+  xres.xmmi[0] = _mm_cvtepi16_epi64(i16_mix.xmmi[0]);
+  for (i = 0; i < 8; i++) {
+    expected.s64[i] = i16_mix.s16[i];
+  }
+  expected_save = expected;
+  soft_update(i16_mix);
+  soft_update(i16_mix);
+  check_equal_nd(&zres, &expected, 16, "_mm512_cvtepi16_epi64", __LINE__);
+  check_equal_nd(&yres, &expected, 8, "_mm256_cvtepi16_epi64", __LINE__);
+  check_equal_nd(&xres, &expected, 4, "_mm_cvtepi16_epi64", __LINE__);
+
+  /* Masked. */
+
+  soft_update(i16_mix);
+  zres.zmmi = _mm512_setzero_epi32();
+  yres = zres;
+  xres = zres;
+  zres.zmmi = _mm512_mask_cvtepi16_epi64(zres.zmmi, k8, i16_mix.xmmi[0]);
+  expected = expected_save;
+  for (i = 0; i < 8; i++) {
+    if ((k8 & (1 << i)) == 0) {
+      expected.s64[i] = 0;
+    }
+  }
+  soft_update(i16_mix);
+  soft_update(i16_mix);
+  check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepi16_epi64", __LINE__);
+
+  /* Zero-masked. */
+
+  zres = i8_mix;
+  yres = zres;
+  xres = zres;
+  soft_update(i16_mix);
+  zres.zmmi = _mm512_maskz_cvtepi16_epi64(k8, i16_mix.xmmi[0]);
+  for (i = 0; i < 8; i++) {
+    if ((k8 & (1 << i)) == 0) {
+      expected.s64[i] = 0;
+    }
+  }
+  soft_update(i16_mix);
+  soft_update(i16_mix);
+  soft_update(i16_mix);
+  check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepi16_epi64", __LINE__);
+}
+
+void NOINLINE do_movsxdq() {
+  V512 xres, yres, zres;
+  V512 expected;
+  __mmask8 k8 = 0x5d;
+
+  /* Non-masked. */
+
+  zres.zmmi = _mm512_cvtepi32_epi64(i32_mix.ymmi[0]);
+  expected.zmmi = _mm512_set_epi64(7, -6, 5, -4, 3, -2, 1, 0);
+  check_equal_nd(&zres, &expected, 16, "_mm512_cvtepi32_epi64", __LINE__);
+
+  /* Masked. */
+
+  soft_update(i32_mix);
+  zres.zmmi = _mm512_setzero_epi32();
+  yres = zres;
+  xres = zres;
+  zres.zmmi = _mm512_mask_cvtepi32_epi64(zres.zmmi, k8, i32_mix.ymmi[0]);
+  expected.zmmi = _mm512_set_epi64(0, -6, 0, -4, 3, -2, 0, 0);
+  soft_update(i32_mix);
+  soft_update(i32_mix);
+  check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepi32_epi64", __LINE__);
+
+  /* Zero-masked. */
+
+  k8 = 0x79;
+  soft_update(i32_mix);
+  zres = i8_mix;
+  yres = zres;
+  xres = zres;
+  zres.zmmi = _mm512_maskz_cvtepi32_epi64(k8, i32_mix.ymmi[0]);
+  soft_update(i32_mix);
+  soft_update(i32_mix);
+  expected.zmmi = _mm512_set_epi64(7, -6, 5, -4, 3, -2, 1, 0);
+  expected.zmmi = _mm512_maskz_mov_epi64(k8, expected.zmmi);
+  check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepi32_epi64", __LINE__);
+}
+
+void NOINLINE do_movsxbd() {
+  V512 xres, yres, zres;
+  V512 expected, expected_save;
+  volatile int i;
+  __mmask16 k16 = 0xfefe;
+  __mmask8 k8 = (__mmask8)k16;
+
+  /* Non-masked. */
+
+  zres.zmmi = _mm512_cvtepi8_epi32(i8_mix.xmmi[0]);
+  for (i = 0; i < 16; i++) {
+    expected.s32[i] = i8_mix.s8[i];
+  }
+  expected_save = expected;
+  check_equal_nd(&zres, &expected, 16, "_mm512_cvtepi8_epi32", __LINE__);
+  soft_update(i8_mix);
+
+  /* Masked. */
+
+  soft_update(i8_mix);
+  zres.zmmi = _mm512_setzero_epi32();
+  yres = zres;
+  xres = zres;
+  zres.zmmi = _mm512_mask_cvtepi8_epi32(zres.zmmi, k16, i8_mix.xmmi[0]);
+  expected = expected_save;
+  for (i = 0; i < 16; i++) {
+    if ((k16 & (1 << i)) == 0) {
+      expected.s32[i] = 0;
+    }
+  }
+  check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepi8_epi32", __LINE__);
+  soft_update(i8_mix);
+
+  /* Zero-masked. */
+
+  soft_update(i8_mix);
+  k16 <<= 1;
+  k8 = (__mmask8)k16;
+  zres.zmmi = _mm512_maskz_cvtepi8_epi32(k16, i8_mix.xmmi[0]);
+  expected = expected_save;
+  for (i = 0; i < 16; i++) {
+    if ((k16 & (1 << i)) == 0) {
+      expected.s32[i] = 0;
+    }
+  }
+  check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepi8_epi32", __LINE__);
+  soft_update(i8_mix);
+}
+
+void NOINLINE do_movsxbq() {
+  V512 xres, yres, zres;
+  V512 expected, expected_save;
+  volatile int i;
+  __mmask8 k8 = 0xfe;
+
+  /* Non-masked. */
+
+  zres.zmmi = _mm512_cvtepi8_epi64(i8_mix.xmmi[0]);
+  for (i = 0; i < 8; i++) {
+    expected.s64[i] = i8_mix.s8[i];
+  }
+  expected_save = expected;
+  check_equal_nd(&zres, &expected, 16, "_mm512_cvtepi8_epi64", __LINE__);
+  soft_update(i8_mix);
+
+  /* Masked. */
+
+  soft_update(i8_mix);
+  zres.zmmi = _mm512_setzero_epi32();
+  yres = zres;
+  xres = zres;
+  zres.zmmi = _mm512_mask_cvtepi8_epi64(zres.zmmi, k8, i8_mix.xmmi[0]);
+  expected = expected_save;
+  for (i = 0; i < 8; i++) {
+    if ((k8 & (1 << i)) == 0) {
+      expected.s64[i] = 0;
+    }
+  }
+  check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepi8_epi64", __LINE__);
+  soft_update(i8_mix);
+
+  /* Zero-masked. */
+
+  soft_update(i8_mix);
+  k8 <<= 1;
+  zres.zmmi = _mm512_maskz_cvtepi8_epi64(k8, i8_mix.xmmi[0]);
+  expected = expected_save;
+  for (i = 0; i < 8; i++) {
+    if ((k8 & (1 << i)) == 0) {
+      expected.s64[i] = 0;
+    }
+  }
+  check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepi8_epi64", __LINE__);
+  soft_update(i8_mix);
+}
+
+void NOINLINE do_movzxwd() {
+  V512 xres, yres, zres;
+  V512 expected;
+  __mmask16 k16 = 0xc936;
+  __mmask8 k8 = (__mmask8)k16;
+
+  /* Non-masked. */
+
+  zres.zmmi = _mm512_cvtepu16_epi32(i16_mix.ymmi[0]);
+  expected.zmmi =
+      _mm512_set_epi32(15, 0xfff2, 13, 0xfff4, 11, 0xfff6, 9, 0xfff8, 7, 0xfffa,
+                       5, 0xfffc, 3, 0xfffe, 1, 0);
+  check_equal_nd(&zres, &expected, 16, "_mm512_cvtepu16_epi32", __LINE__);
+
+  /* Masked. */
+
+  soft_update(i16_mix);
+  zres.zmmi = _mm512_setzero_epi32();
+  yres = zres;
+  xres = zres;
+  zres.zmmi = _mm512_mask_cvtepu16_epi32(zres.zmmi, k16, i16_mix.ymmi[0]);
+  expected.zmmi = _mm512_set_epi32(15, 0xfff2, 0, 0, 11, 0, 0, 0xfff8, 0, 0, 5,
+                                   0xfffc, 0, 0xfffe, 1, 0);
+  check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepu16_epi32", __LINE__);
+
+  /* Zero-masked. */
+
+  zres = i8_mix;
+  yres = zres;
+  xres = zres;
+  soft_update(i16_mix);
+  zres.zmmi = _mm512_maskz_cvtepu16_epi32(k16, i16_mix.ymmi[0]);
+  expected.zmmi =
+      _mm512_set_epi32(15, 0xfff2, 13, 0xfff4, 11, 0xfff6, 9, 0xfff8, 7, 0xfffa,
+                       5, 0xfffc, 3, 0xfffe, 1, 0);
+  expected.zmmi = _mm512_maskz_mov_epi32(k16, expected.zmmi);
+  check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepu16_epi32", __LINE__);
+}
+
+void NOINLINE do_movzxwq() {
+  V512 xres, yres, zres;
+  V512 expected, expected_save;
+  volatile int i;
+  __mmask8 k8 = 0xe7;
+
+  /* Non-masked. */
+
+  zres.zmmi = _mm512_cvtepu16_epi64(i16_mix.xmmi[0]);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = i16_mix.u16[i];
+  }
+  expected_save = expected;
+  soft_update(i16_mix);
+  check_equal_nd(&zres, &expected, 16, "_mm512_cvtepu16_epi64", __LINE__);
+
+  /* Masked. */
+
+  soft_update(i16_mix);
+  zres.zmmi = _mm512_setzero_epi32();
+  yres = zres;
+  xres = zres;
+  zres.zmmi = _mm512_mask_cvtepu16_epi64(zres.zmmi, k8, i16_mix.xmmi[0]);
+  expected = expected_save;
+  for (i = 0; i < 8; i++) {
+    if ((k8 & (1 << i)) == 0) {
+      expected.u64[i] = 0;
+    }
+  }
+  soft_update(i16_mix);
+  check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepu16_epi64", __LINE__);
+
+  /* Zero-masked. */
+
+  zres = i8_mix;
+  yres = zres;
+  xres = zres;
+  soft_update(i16_mix);
+  zres.zmmi = _mm512_maskz_cvtepu16_epi64(k8, i16_mix.xmmi[0]);
+  for (i = 0; i < 8; i++) {
+    if ((k8 & (1 << i)) == 0) {
+      expected.u64[i] = 0;
+    }
+  }
+  soft_update(i16_mix);
+  check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepu16_epi64", __LINE__);
+}
+
+void NOINLINE do_movzxdq() {
+  V512 xres, yres, zres;
+  V512 expected;
+  __mmask8 k8 = 0xeb;
+
+  /* Non-masked. */
+
+  zres.zmmi = _mm512_cvtepu32_epi64(i32_mix.ymmi[0]);
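+  /*
+   * The (unsigned int) casts model the zero extension: (unsigned int)-6 is
+   * 0xfffffffa, which cvtepu32 widens to 0x00000000fffffffa.
+   */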
+  expected.zmmi = _mm512_set_epi64(7, (unsigned int)-6, 5, (unsigned int)-4, 3,
+                                   (unsigned int)-2, 1, 0);
+  soft_update(i32_mix);
+  check_equal_nd(&zres, &expected, 16, "_mm512_cvtepu32_epi64", __LINE__);
+
+  /* Masked. */
+
+  soft_update(i32_mix);
+  zres.zmmi = _mm512_setzero_epi32();
+  yres = zres;
+  xres = zres;
+  zres.zmmi = _mm512_mask_cvtepu32_epi64(zres.zmmi, k8, i32_mix.ymmi[0]);
+  expected.zmmi = _mm512_set_epi64(7, (unsigned int)-6, 5, 0, 3, 0, 1, 0);
+  soft_update(i32_mix);
+  check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepu32_epi64", __LINE__);
+
+  /* Zero-masked. */
+
+  k8 = 0xe7;
+  soft_update(i32_mix);
+  zres = i8_mix;
+  yres = zres;
+  xres = zres;
+  zres.zmmi = _mm512_maskz_cvtepu32_epi64(k8, i32_mix.ymmi[0]);
+  soft_update(i32_mix);
+  expected.zmmi = _mm512_set_epi64(7, (unsigned int)-6, 5, (unsigned int)-4, 3,
+                                   (unsigned int)-2, 1, 0);
+  expected.zmmi = _mm512_maskz_mov_epi64(k8, expected.zmmi);
+  check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepu32_epi64", __LINE__);
+}
+
+void NOINLINE do_movzxbd() {
+  V512 xres, yres, zres;
+  V512 expected, expected_save;
+  volatile int i;
+  __mmask16 k16 = 0xfefe;
+  __mmask8 k8 = (__mmask8)k16;
+
+  /* Non-masked. */
+
+  zres.zmmi = _mm512_cvtepu8_epi32(i8_mix.xmmi[0]);
+  for (i = 0; i < 16; i++) {
+    expected.u32[i] = i8_mix.u8[i];
+  }
+  expected_save = expected;
+  check_equal_nd(&zres, &expected, 16, "_mm512_cvtepu8_epi32", __LINE__);
+  soft_update(i8_mix);
+
+  /* Masked. */
+
+  soft_update(i8_mix);
+  zres.zmmi = _mm512_setzero_epi32();
+  yres = zres;
+  xres = zres;
+  zres.zmmi = _mm512_mask_cvtepu8_epi32(zres.zmmi, k16, i8_mix.xmmi[0]);
+  expected = expected_save;
+  for (i = 0; i < 16; i++) {
+    if ((k16 & (1 << i)) == 0) {
+      expected.u32[i] = 0;
+    }
+  }
+  check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepu8_epi32", __LINE__);
+  soft_update(i8_mix);
+
+  /* Zero-masked. */
+
+  soft_update(i8_mix);
+  k16 <<= 1;
+  k8 = (__mmask8)k16;
+  zres.zmmi = _mm512_maskz_cvtepu8_epi32(k16, i8_mix.xmmi[0]);
+  expected = expected_save;
+  for (i = 0; i < 16; i++) {
+    if ((k16 & (1 << i)) == 0) {
+      expected.u32[i] = 0;
+    }
+  }
+  check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepu8_epi32", __LINE__);
+  soft_update(i8_mix);
+}
+
+void NOINLINE do_movzxbq() {
+  V512 xres, yres, zres;
+  V512 expected, expected_save;
+  volatile int i;
+  __mmask8 k8 = 0xfe;
+
+  /* Non-masked. */
+
+  zres.zmmi = _mm512_cvtepu8_epi64(i8_mix.xmmi[0]);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = i8_mix.u8[i];
+  }
+  expected_save = expected;
+  check_equal_nd(&zres, &expected, 16, "_mm512_cvtepu8_epi64", __LINE__);
+  soft_update(i8_mix);
+
+  /* Masked. */
+
+  soft_update(i8_mix);
+  zres.zmmi = _mm512_setzero_epi32();
+  yres = zres;
+  xres = zres;
+  zres.zmmi = _mm512_mask_cvtepu8_epi64(zres.zmmi, k8, i8_mix.xmmi[0]);
+  expected = expected_save;
+  for (i = 0; i < 8; i++) {
+    if ((k8 & (1 << i)) == 0) {
+      expected.u64[i] = 0;
+    }
+  }
+  check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepu8_epi64", __LINE__);
+  soft_update(i8_mix);
+
+  /* Zero-masked. */
+
+  soft_update(i8_mix);
+  k8 <<= 1;
+  zres.zmmi = _mm512_maskz_cvtepu8_epi64(k8, i8_mix.xmmi[0]);
+  expected = expected_save;
+  for (i = 0; i < 8; i++) {
+    if ((k8 & (1 << i)) == 0) {
+      expected.u64[i] = 0;
+    }
+  }
+  check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepu8_epi64", __LINE__);
+  soft_update(i8_mix);
+}
+
+void NOINLINE do_maxsd() {
+  V512 res;
+  V512 expected;
+  __mmask16 k = 0x5d;
+
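+  /*
+   * Assuming init() fills i32 with 0..15 and i32_mix with the even elements
+   * negated (the pattern op2_xyz_int.c also uses), the signed max of the two
+   * is always i32; do_maxud/do_minsd/do_minud below rely on the same property.
+   */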
+  res.zmmi = _mm512_max_epi32(i32.zmmi, i32_mix.zmmi);
+  check_equal_nd(&res, &i32, 16, "_mm512_max_epi32", __LINE__);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_max_epi32(res.zmmi, k, i32.zmmi, i32_mix.zmmi);
+  expected.zmm = _mm512_mask_mov_ps(_mm512_setzero_ps(), k, i32.zmm);
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_max_epi32", __LINE__);
+}
+
+void NOINLINE do_maxud() {
+  V512 res;
+  V512 expected;
+  __mmask16 k = 0x5d;
+
+  res.zmmi = _mm512_max_epu32(i32.zmmi, i32_mix.zmmi);
+  check_equal_nd(&res, &i32_mix, 16, "_mm512_max_epu32", __LINE__);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_max_epu32(res.zmmi, k, i32.zmmi, i32_mix.zmmi);
+  expected.zmm = _mm512_mask_mov_ps(_mm512_setzero_ps(), k, i32_mix.zmm);
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_max_epu32", __LINE__);
+}
+
+void NOINLINE do_minsd() {
+  V512 res;
+  V512 expected;
+  __mmask16 k = 0x5d;
+
+  res.zmmi = _mm512_min_epi32(i32.zmmi, i32_mix.zmmi);
+  check_equal_nd(&res, &i32_mix, 16, "_mm512_min_epi32", __LINE__);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_min_epi32(res.zmmi, k, i32.zmmi, i32_mix.zmmi);
+  expected.zmm = _mm512_mask_mov_ps(_mm512_setzero_ps(), k, i32_mix.zmm);
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_min_epi32", __LINE__);
+}
+
+void NOINLINE do_minud() {
+  V512 res;
+  V512 expected;
+  __mmask16 k = 0x5d;
+
+  res.zmmi = _mm512_min_epu32(i32.zmmi, i32_mix.zmmi);
+  check_equal_nd(&res, &i32, 16, "_mm512_min_epu32", __LINE__);
+
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_min_epu32(res.zmmi, k, i32.zmmi, i32_mix.zmmi);
+  expected.zmm = _mm512_mask_mov_ps(_mm512_setzero_ps(), k, i32.zmm);
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_min_epu32", __LINE__);
+}
+
+void NOINLINE do_pslld() {
+  V512 res;
+  V512 vcount;
+  V512 expected;
+  volatile int i;
+  int count = 7;
+  __mmask16 k;
+  __mmask8 k8;
+
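+  /*
+   * The vector shift count is taken from the low 64 bits of the xmm operand,
+   * so it is enough to write count into element 0 of vcount.
+   */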
+  i = 0;
+  vcount.zmm = _mm512_setzero_ps();
+  vcount.u64[i] = count;
+
+  res.zmmi = _mm512_sll_epi32(i32_big.zmmi, vcount.xmmi[0]);
+  for (i = 0; i < 16; i++) {
+    expected.u32[i] = (count > 31) ? 0 : (i32_big.u32[i] << count);
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_sll_epi32", __LINE__);
+
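+  /*
+   * check_equal_nd compares the given number of dwords, so passing 8 and 4
+   * below reuses the 512-bit expected vector as the reference for the 256-
+   * and 128-bit forms.
+   */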
+  soft_update(vcount);
+  res.ymmi[0] = _mm256_sll_epi32(i32_big.ymmi[0], vcount.xmmi[0]);
+  check_equal_nd(&res, &expected, 8, "_mm256_sll_epi32", __LINE__);
+
+  soft_update(vcount);
+  res.xmmi[0] = _mm_sll_epi32(i32_big.xmmi[0], vcount.xmmi[0]);
+  check_equal_nd(&res, &expected, 4, "_mm_sll_epi32", __LINE__);
+
+  k = 0x7fdb;
+  k8 = (__mmask8)k;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_sll_epi32(res.zmmi, k, i32_mix.zmmi, vcount.xmmi[0]);
+  for (i = 0; i < 16; i++) {
+    expected.u32[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.u32[i] = (count > 31) ? 0 : (i32_mix.u32[i] << count);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_sll_epi32", __LINE__);
+
+  res = i8_mix;
+  res.zmmi = _mm512_maskz_sll_epi32(k, i32_mix.zmmi, vcount.xmmi[0]);
+  check_equal_nd(&res, &expected, 16, "_mm512_maskz_sll_epi32", __LINE__);
+}
+
+void NOINLINE do_psllq() {
+  V512 res;
+  V512 vcount;
+  V512 expected;
+  volatile int i;
+  int count = 7;
+  __mmask8 k;
+
+  i = 0;
+  vcount.zmm = _mm512_setzero_ps();
+  vcount.u64[i] = count;
+
+  res.zmmi = _mm512_sll_epi64(i64_big.zmmi, vcount.xmmi[0]);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = (count > 63) ? 0 : (i64_big.u64[i] << count);
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_sll_epi64", __LINE__);
+
+  soft_update(vcount);
+  res.ymmi[0] = _mm256_sll_epi64(i64_big.ymmi[0], vcount.xmmi[0]);
+  check_equal_nd(&res, &expected, 8, "_mm256_sll_epi64", __LINE__);
+
+  soft_update(vcount);
+  res.xmmi[0] = _mm_sll_epi64(i64_big.xmmi[0], vcount.xmmi[0]);
+  check_equal_nd(&res, &expected, 4, "_mm_sll_epi64", __LINE__);
+
+  k = 0xc3;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_sll_epi64(res.zmmi, k, i64_mix.zmmi, vcount.xmmi[0]);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.u64[i] = (count > 63) ? 0 : (i64_mix.u64[i] << count);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_sll_epi64", __LINE__);
+
+  res = i8_mix;
+  res.zmmi = _mm512_maskz_sll_epi64(k, i64_mix.zmmi, vcount.xmmi[0]);
+  check_equal_nd(&res, &expected, 16, "_mm512_maskz_sll_epi64", __LINE__);
+}
+
+void NOINLINE do_pslldi(int cnt) {
+  V512 res;
+  V512 expected;
+  __mmask16 k;
+  volatile int i;
+
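+  /*
+   * The first two checks use compile-time shift counts (3 and 6); the second
+   * two repeat them with the caller-supplied cnt, which main() sweeps from 0
+   * to 8.
+   */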
+  res.zmmi = _mm512_slli_epi32(i32_big.zmmi, 3);
+  for (i = 0; i < 16; i++) {
+    expected.u32[i] = i32_big.u32[i] << 3;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_slli_epi32", __LINE__);
+
+  k = 0x9786;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_slli_epi32(res.zmmi, k, i32_mix.zmmi, 6);
+  for (i = 0; i < 16; i++) {
+    expected.u32[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.u32[i] = (i32_mix.u32[i] << 6);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_slli_epi32", __LINE__);
+
+  res.zmmi = _mm512_slli_epi32(i32_big.zmmi, cnt);
+  for (i = 0; i < 16; i++) {
+    expected.u32[i] = i32_big.u32[i] << cnt;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_slli_epi32", __LINE__);
+
+  k = 0x9786;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_slli_epi32(res.zmmi, k, i32_mix.zmmi, cnt);
+  for (i = 0; i < 16; i++) {
+    expected.u32[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.u32[i] = (i32_mix.u32[i] << cnt);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_slli_epi32", __LINE__);
+}
+
+void NOINLINE do_psllqi(int cnt) {
+  V512 res;
+  V512 expected;
+  __mmask8 k;
+  volatile int i;
+
+  res.zmmi = _mm512_slli_epi64(i64_big.zmmi, 3);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = i64_big.u64[i] << 3;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_slli_epi64", __LINE__);
+
+  k = 0x97;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_slli_epi64(res.zmmi, k, i64_mix.zmmi, 6);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.u64[i] = (i64_mix.u64[i] << 6);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_slli_epi64", __LINE__);
+
+  res.zmmi = _mm512_slli_epi64(i64_big.zmmi, cnt);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = i64_big.u64[i] << cnt;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_slli_epi64", __LINE__);
+
+  k = 0x97;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_slli_epi64(res.zmmi, k, i64_mix.zmmi, cnt);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.u64[i] = (i64_mix.u64[i] << cnt);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_slli_epi64", __LINE__);
+}
+
+void NOINLINE do_psradi(int cnt) {
+  V512 res;
+  V512 expected;
+  __mmask16 k;
+  volatile int i;
+
+  res.zmmi = _mm512_srai_epi32(i32_big.zmmi, 3);
+  for (i = 0; i < 16; i++) {
+    expected.s32[i] = i32_big.s32[i] >> 3;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_srai_epi32", __LINE__);
+
+  k = 0x9786;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_srai_epi32(res.zmmi, k, i32_mix.zmmi, 6);
+  for (i = 0; i < 16; i++) {
+    expected.s32[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.s32[i] = (i32_mix.s32[i] >> 6);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_srai_epi32", __LINE__);
+
+  res.zmmi = _mm512_srai_epi32(i32_big.zmmi, cnt);
+  for (i = 0; i < 16; i++) {
+    expected.s32[i] = i32_big.s32[i] >> cnt;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_srai_epi32", __LINE__);
+
+  k = 0x9786;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_srai_epi32(res.zmmi, k, i32_mix.zmmi, cnt);
+  for (i = 0; i < 16; i++) {
+    expected.s32[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.s32[i] = (i32_mix.s32[i] >> cnt);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_srai_epi32", __LINE__);
+}
+
+void NOINLINE do_psrldi(int cnt) {
+  V512 res;
+  V512 expected;
+  __mmask16 k;
+  volatile int i;
+
+  res.zmmi = _mm512_srli_epi32(i32_big.zmmi, 3);
+  for (i = 0; i < 16; i++) {
+    expected.u32[i] = i32_big.u32[i] >> 3;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_srli_epi32", __LINE__);
+
+  k = 0x9786;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_srli_epi32(res.zmmi, k, i32_mix.zmmi, 6);
+  for (i = 0; i < 16; i++) {
+    expected.u32[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.u32[i] = (i32_mix.u32[i] >> 6);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_srli_epi32", __LINE__);
+
+  res.zmmi = _mm512_srli_epi32(i32_big.zmmi, cnt);
+  for (i = 0; i < 16; i++) {
+    expected.u32[i] = i32_big.u32[i] >> cnt;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_srli_epi32", __LINE__);
+
+  k = 0x9786;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_srli_epi32(res.zmmi, k, i32_mix.zmmi, cnt);
+  for (i = 0; i < 16; i++) {
+    expected.u32[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.u32[i] = (i32_mix.u32[i] >> cnt);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_srli_epi32 #2", __LINE__);
+}
+
+void NOINLINE do_psraqi(int cnt) {
+  V512 res;
+  V512 expected;
+  __mmask8 k;
+  volatile int i;
+
+  res.zmmi = _mm512_srai_epi64(i64_big.zmmi, 3);
+  for (i = 0; i < 8; i++) {
+    expected.s64[i] = i64_big.s64[i] >> 3;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_srai_epi64", __LINE__);
+
+  k = 0x97;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_srai_epi64(res.zmmi, k, i64_mix.zmmi, 6);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.s64[i] = (i64_mix.s64[i] >> 6);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_srai_epi64", __LINE__);
+
+  res.zmmi = _mm512_srai_epi64(i64_big.zmmi, cnt);
+  for (i = 0; i < 8; i++) {
+    expected.s64[i] = i64_big.s64[i] >> cnt;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_srai_epi64", __LINE__);
+
+  k = 0x97;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_srai_epi64(res.zmmi, k, i64_mix.zmmi, cnt);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.s64[i] = (i64_mix.s64[i] >> cnt);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_srai_epi64", __LINE__);
+}
+
+void NOINLINE do_psrlqi(int cnt) {
+  V512 res;
+  V512 expected;
+  __mmask8 k;
+  volatile int i;
+
+  res.zmmi = _mm512_srli_epi64(i64_big.zmmi, 3);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = i64_big.u64[i] >> 3;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_srli_epi64", __LINE__);
+
+  k = 0x97;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_srli_epi64(res.zmmi, k, i64_mix.zmmi, 6);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.u64[i] = (i64_mix.u64[i] >> 6);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_srli_epi64", __LINE__);
+
+  res.zmmi = _mm512_srli_epi64(i64_big.zmmi, cnt);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = i64_big.u64[i] >> cnt;
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_srli_epi64", __LINE__);
+
+  k = 0x97;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_srli_epi64(res.zmmi, k, i64_mix.zmmi, cnt);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.u64[i] = (i64_mix.u64[i] >> cnt);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_srli_epi64", __LINE__);
+}
+
+void NOINLINE do_psrad() {
+  V512 res;
+  V512 vcount;
+  V512 expected;
+  volatile int i;
+  int count = 7;
+  __mmask16 k;
+  __mmask8 k8;
+
+  i = 0;
+  vcount.zmm = _mm512_setzero_ps();
+  vcount.u64[i] = count;
+
+  res.zmmi = _mm512_sra_epi32(i32_big.zmmi, vcount.xmmi[0]);
+  for (i = 0; i < 16; i++) {
+    expected.s32[i] = (count > 31) ? 0 : (i32_big.s32[i] >> count);
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_sra_epi32", __LINE__);
+
+  k = 0x7fdb;
+  k8 = (__mmask8)k;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_sra_epi32(res.zmmi, k, i32_mix.zmmi, vcount.xmmi[0]);
+  for (i = 0; i < 16; i++) {
+    expected.u32[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.s32[i] = (count > 31) ? 0 : (i32_mix.s32[i] >> count);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_sra_epi32", __LINE__);
+
+  res = i8_mix;
+  res.zmmi = _mm512_maskz_sra_epi32(k, i32_mix.zmmi, vcount.xmmi[0]);
+  check_equal_nd(&res, &expected, 16, "_mm512_maskz_sra_epi32", __LINE__);
+}
+
+void NOINLINE do_psrld() {
+  V512 res;
+  V512 vcount;
+  V512 expected;
+  volatile int i;
+  int count = 7;
+  __mmask16 k;
+  __mmask8 k8;
+
+  i = 0;
+  vcount.zmm = _mm512_setzero_ps();
+  vcount.u64[i] = count;
+
+  res.zmmi = _mm512_srl_epi32(i32_big.zmmi, vcount.xmmi[0]);
+  for (i = 0; i < 16; i++) {
+    expected.u32[i] = (count > 31) ? 0 : (i32_big.u32[i] >> count);
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_srl_epi32", __LINE__);
+
+  k = 0x7fdb;
+  k8 = (__mmask8)k;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_srl_epi32(res.zmmi, k, i32_mix.zmmi, vcount.xmmi[0]);
+  for (i = 0; i < 16; i++) {
+    expected.u32[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.u32[i] = (count > 31) ? 0 : (i32_mix.u32[i] >> count);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_srl_epi32", __LINE__);
+
+  res = i8_mix;
+  res.zmmi = _mm512_maskz_srl_epi32(k, i32_mix.zmmi, vcount.xmmi[0]);
+  check_equal_nd(&res, &expected, 16, "_mm512_maskz_srl_epi32", __LINE__);
+}
+
+void NOINLINE do_psraq() {
+  V512 res;
+  V512 vcount;
+  V512 expected;
+  volatile int i;
+  int count = 7;
+  __mmask8 k;
+
+  i = 0;
+  vcount.zmm = _mm512_setzero_ps();
+  vcount.u64[i] = count;
+
+  res.zmmi = _mm512_sra_epi64(i64_big.zmmi, vcount.xmmi[0]);
+  for (i = 0; i < 8; i++) {
+    expected.s64[i] = (count > 63) ? 0 : (i64_big.s64[i] >> count);
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_sra_epi64", __LINE__);
+
+  k = 0xc3;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_sra_epi64(res.zmmi, k, i64_mix.zmmi, vcount.xmmi[0]);
+  for (i = 0; i < 8; i++) {
+    expected.s64[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.s64[i] = (count > 63) ? 0 : (i64_mix.s64[i] >> count);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_sra_epi64", __LINE__);
+
+  res = i8_mix;
+  res.zmmi = _mm512_maskz_sra_epi64(k, i64_mix.zmmi, vcount.xmmi[0]);
+  check_equal_nd(&res, &expected, 16, "_mm512_maskz_sra_epi64", __LINE__);
+}
+
+void NOINLINE do_psrlq() {
+  V512 res;
+  V512 vcount;
+  V512 expected;
+  volatile int i;
+  int count = 7;
+  __mmask8 k;
+
+  i = 0;
+  vcount.zmm = _mm512_setzero_ps();
+  vcount.u64[i] = count;
+
+  res.zmmi = _mm512_srl_epi64(i64_big.zmmi, vcount.xmmi[0]);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = (count > 63) ? 0 : (i64_big.u64[i] >> count);
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_srl_epi64", __LINE__);
+
+  k = 0xc3;
+  res.zmmi = _mm512_setzero_epi32();
+  res.zmmi = _mm512_mask_srl_epi64(res.zmmi, k, i64_mix.zmmi, vcount.xmmi[0]);
+  for (i = 0; i < 8; i++) {
+    expected.u64[i] = 0;
+    if ((k & (1 << i)) != 0) {
+      expected.u64[i] = (count > 63) ? 0 : (i64_mix.u64[i] >> count);
+    }
+  }
+  check_equal_nd(&res, &expected, 16, "_mm512_mask_srl_epi64", __LINE__);
+
+  res.zmmi = _mm512_maskz_srl_epi64(k, i64_mix.zmmi, vcount.xmmi[0]);
+  check_equal_nd(&res, &expected, 16, "_mm512_maskz_srl_epi64", __LINE__);
+}
+
+void NOINLINE do_movsxwd() {
+  V512 xres, yres, zres;
+  V512 expected;
+  __mmask16 k16 = 0x7e5d;
+  __mmask8 k8 = (__mmask8)k16;
+
+  /* Non-masked. */
+
+  zres.zmmi = _mm512_cvtepi16_epi32(i16_mix.ymmi[0]);
+  expected.zmmi = _mm512_set_epi32(15, -14, 13, -12, 11, -10, 9, -8, 7, -6, 5,
+                                   -4, 3, -2, 1, 0);
+  check_equal_nd(&zres, &expected, 16, "_mm512_cvtepi16_epi32", __LINE__);
+
+  /* Masked. */
+
+  soft_update(i16_mix);
+  zres.zmmi = _mm512_setzero_epi32();
+  yres = zres;
+  xres = zres;
+  zres.zmmi = _mm512_mask_cvtepi16_epi32(zres.zmmi, k16, i16_mix.ymmi[0]);
+  expected.zmmi = _mm512_set_epi32(0, -14, 13, -12, 11, -10, 9, 0, 0, -6, 0, -4,
+                                   3, -2, 0, 0);
+  check_equal_nd(&zres, &expected, 16, "_mm512_mask_cvtepi16_epi32", __LINE__);
+
+  /* Zero-masked. */
+
+  zres = i8_mix;
+  yres = zres;
+  xres = zres;
+  soft_update(i16_mix);
+  zres.zmmi = _mm512_maskz_cvtepi16_epi32(k16, i16_mix.ymmi[0]);
+  expected.zmmi = _mm512_set_epi32(15, -14, 13, -12, 11, -10, 9, -8, 7, -6, 5,
+                                   -4, 3, -2, 1, 0);
+  expected.zmmi = _mm512_maskz_mov_epi32(k16, expected.zmmi);
+  check_equal_nd(&zres, &expected, 16, "_mm512_maskz_cvtepi16_epi32", __LINE__);
+}
+
+int main(int argc, char *argv[]) {
+  int cnt;
+
+  init();
+
+  do_absd();
+  do_absq();
+
+  do_movsxwd();
+  do_movsxdq();
+  do_movsxbd();
+  do_movsxbq();
+  do_movzxwd();
+  do_movzxwq();
+
+  do_movzxbd();
+  do_movzxbq();
+
+  do_maxsd();
+  do_maxud();
+  do_minsd();
+  do_minud();
+
+  do_pslld();
+  do_psllq();
+
+  for (cnt = 0; cnt <= 8; cnt++) {
+    do_pslldi(cnt);
+    do_psradi(cnt);
+    do_psrldi(cnt);
+
+    do_psllqi(cnt);
+    do_psraqi(cnt);
+    do_psrlqi(cnt);
+  }
+
+  do_psrlq();
+  do_psraq();
+  do_psrld();
+  do_psrad();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/minmax_shift.reference_output b/SingleSource/UnitTests/Vector/AVX512/minmax_shift.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/minmax_shift.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/mm_op_sd.c b/SingleSource/UnitTests/Vector/AVX512/mm_op_sd.c
new file mode 100644
index 0000000..1e017a0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/mm_op_sd.c
@@ -0,0 +1,309 @@
+#include "m512_test_util.h"
+#include <memory.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * Here we check for _mm_[mask|maskz]_[add|div|max|min|mul|sub]_[round]_sd
+ * intrinsics.
+ */
+
+int show_op =
+#ifdef SHOW_OP
+    1
+#else
+    0
+#endif
+    ;
+
+typedef enum { ASSIGN, ADD, DIV, MAX, MIN, MUL, SUB } OPER;
+
+static void NOINLINE intop(OPER op, double ivalout[2], double ivalop1[2],
+                           double ivalop2[2]) {
+  int i;
+  int handled = 0;
+
+  /* ivalout is a pointer parameter, so spell out the element count. */
+  memset(ivalout, 0, 2 * sizeof(ivalout[0]));
+  for (i = 0; i < 2; i += 1) {
+    switch (op) {
+    case ASSIGN:
+      handled = 1;
+      ivalout[i] = ivalop1[i];
+      break;
+    case ADD:
+      handled = 1;
+      ivalout[i] = ivalop1[i] + ivalop2[i];
+      break;
+    case DIV:
+      handled = 1;
+      ivalout[i] = ivalop1[i] / ivalop2[i];
+      break;
+    case MAX:
+      handled = 1;
+      ivalout[i] = (ivalop1[i] > ivalop2[i]) ? ivalop1[i] : ivalop2[i];
+      break;
+    case MIN:
+      handled = 1;
+      ivalout[i] = (ivalop1[i] < ivalop2[i]) ? ivalop1[i] : ivalop2[i];
+      break;
+    case MUL:
+      handled = 1;
+      ivalout[i] = ivalop2[i] * ivalop1[i];
+      break;
+    case SUB:
+      handled = 1;
+      ivalout[i] = ivalop1[i] - ivalop2[i];
+      break;
+    default:
+      printf("FAIL: bad op\n");
+      break;
+    }
+  }
+  if (!handled) {
+    printf("FAIL: unsupported op\n");
+    n_errs++;
+  }
+}
+
+static int NOINLINE check(double val1[], double good[]) {
+  int i;
+  int res = 1;
+  for (i = 0; i < 2; i += 1) {
+    if (val1[i] != good[i]) {
+      res = 0;
+      printf("FAIL: %f != %f\n", val1[i], good[i]);
+    }
+  }
+  return (res);
+}
+
+static int NOINLINE check_mask(double dest[], double val1[], double good[],
+                               int mask, int zeroing) {
+  int i, j;
+  int res = 1;
+
+  // index of the first element that must match the original dest vector
+  j = 1;
+
+  if (mask == 1) {
+    if (val1[0] != good[0]) {
+      res = 0;
+      printf("FAIL: %f != %f\n", val1[0], dest[0]);
+    }
+  } else if (mask == 0) {
+    if (zeroing == 1) {
+      if (val1[0] != 0) {
+        res = 0;
+        printf("FAIL: %f != %f\n", val1[0], dest[0]);
+      }
+    } else {
+      j = 0;
+    }
+  }
+
+  // check other elements of dest vector
+  for (i = j; i < 2; i += 1) {
+    if (val1[i] != dest[i]) {
+      res = 0;
+      printf("FAIL: %f != %f\n", val1[i], dest[i]);
+    }
+  }
+  return (res);
+}
+
+static void NOINLINE print_vec(char *pfx, double ivec[]) {
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  printf("%10.4f %10.4f\n", ivec[1], ivec[0]);
+}
+
+#define DOONE(OP, FUNC)                                                        \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f64, v1.f64, v2.f64);                                       \
+    vvv.xmmd[0] = FUNC(v1.xmmd[0], v2.xmmd[0]);                                \
+    passed = check_mask(vvv.f64, vvv.f64, good.f64, 0x1, 0);                   \
+    passed = check(vvv.f64, good.f64);                                         \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f64);                                             \
+      print_vec("Opand2", v2.f64);                                             \
+      print_vec("Scalar", good.f64);                                           \
+      print_vec("Vector", vvv.f64);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_WITH_MASK(OP, FUNC, MMASK)                                       \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f64, v1.f64, v2.f64);                                       \
+    vvv.xmmd[0] = FUNC(vvv.xmmd[0], MMASK, v1.xmmd[0], v2.xmmd[0]);            \
+    passed = check_mask(vvv.f64, vvv.f64, good.f64, MMASK, 0);                 \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f64);                                             \
+      print_vec("Opand2", v2.f64);                                             \
+      print_vec("Scalar", good.f64);                                           \
+      print_vec("Vector", vvv.f64);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_WITH_ZMASK(OP, FUNC, MMASK)                                      \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f64, v1.f64, v2.f64);                                       \
+    vvv.xmmd[0] = FUNC(MMASK, v1.xmmd[0], v2.xmmd[0]);                         \
+    passed = check_mask(vvv.f64, vvv.f64, good.f64, MMASK, 1);                 \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f64);                                             \
+      print_vec("Opand2", v2.f64);                                             \
+      print_vec("Scalar", good.f64);                                           \
+      print_vec("Vector", vvv.f64);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_ROUND(OP, FUNC, ROUND)                                           \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f64, v1.f64, v2.f64);                                       \
+    vvv.xmmd[0] = FUNC(v1.xmmd[0], v2.xmmd[0], ROUND);                         \
+    passed = check_mask(vvv.f64, vvv.f64, good.f64, 0x1, 0);                   \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f64);                                             \
+      print_vec("Opand2", v2.f64);                                             \
+      print_vec("Scalar", good.f64);                                           \
+      print_vec("Vector", vvv.f64);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_WITH_MASK_ROUND(OP, FUNC, MMASK, ROUND)                          \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f64, v1.f64, v2.f64);                                       \
+    vvv.xmmd[0] = FUNC(vvv.xmmd[0], MMASK, v1.xmmd[0], v2.xmmd[0], ROUND);     \
+    passed = check_mask(vvv.f64, vvv.f64, good.f64, MMASK, 0);                 \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f64);                                             \
+      print_vec("Opand2", v2.f64);                                             \
+      print_vec("Scalar", good.f64);                                           \
+      print_vec("Vector", vvv.f64);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_WITH_ZMASK_ROUND(OP, FUNC, MMASK, ROUND)                         \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f64, v1.f64, v2.f64);                                       \
+    vvv.xmmd[0] = FUNC(MMASK, v1.xmmd[0], v2.xmmd[0], ROUND);                  \
+    passed = check_mask(vvv.f64, vvv.f64, good.f64, MMASK, 1);                 \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f64);                                             \
+      print_vec("Opand2", v2.f64);                                             \
+      print_vec("Scalar", good.f64);                                           \
+      print_vec("Vector", vvv.f64);                                            \
+    }                                                                          \
+  }
+
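+/*
+ * For example, DOONE_WITH_MASK(ADD, _mm_mask_add_sd, 0x1) computes the scalar
+ * reference with intop(ADD, ...), runs
+ *   vvv.xmmd[0] = _mm_mask_add_sd(vvv.xmmd[0], 0x1, v1.xmmd[0], v2.xmmd[0]);
+ * and then uses check_mask() to verify element 0 against the reference when
+ * the mask bit is set; the ZMASK variants pass zeroing=1, so a clear mask bit
+ * requires element 0 to be 0 instead.
+ */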
+int main() {
+  double init1[] = {1, -2, 3, -4, 5, 6, 7, 8, 9, 10, -11, 12, 13, 14, 15, 16};
+  double init2[] = {11, 12, 23, -24, 35, 36, 17, 38,
+                    42, -1, 33, 7,   8,  10, 11, 12};
+
+  V512 v1;
+  V512 v2;
+  V512 good;
+  V512 vvv;
+
+  intop(ASSIGN, v1.f64, init1, 0);
+  intop(ASSIGN, v2.f64, init2, 0);
+  vvv.xmmd[0] = _mm_setzero_pd();
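+  // vvv supplies the pass-through (src) operand for the _mm_mask_* forms, so
+  // it starts from a known all-zero value.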
+
+  // simple mask intrinsics
+  DOONE_WITH_MASK(ADD, _mm_mask_add_sd, 0x1);
+  DOONE_WITH_MASK(DIV, _mm_mask_div_sd, 0x1);
+  DOONE_WITH_MASK(MAX, _mm_mask_max_sd, 0x1);
+  DOONE_WITH_MASK(MIN, _mm_mask_min_sd, 0x1);
+  DOONE_WITH_MASK(MUL, _mm_mask_mul_sd, 0x1);
+  DOONE_WITH_MASK(SUB, _mm_mask_sub_sd, 0x1);
+
+  // intrinsics with rounding mode
+  DOONE_ROUND(ADD, _mm_add_round_sd,
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_ROUND(DIV, _mm_div_round_sd,
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_ROUND(MAX, _mm_max_round_sd,
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_ROUND(MIN, _mm_min_round_sd,
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_ROUND(MUL, _mm_mul_round_sd,
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_ROUND(SUB, _mm_sub_round_sd,
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+
+  // intrinsics with mask and rounding mode
+  DOONE_WITH_MASK_ROUND(ADD, _mm_mask_add_round_sd, 0x1,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_MASK_ROUND(DIV, _mm_mask_div_round_sd, 0x0,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_MASK_ROUND(MAX, _mm_mask_max_round_sd, 0x1,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_MASK_ROUND(MIN, _mm_mask_min_round_sd, 0x1,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_MASK_ROUND(MUL, _mm_mask_mul_round_sd, 0x0,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_MASK_ROUND(SUB, _mm_mask_sub_round_sd, 0x1,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+
+  // intrinsics with zero masking
+  DOONE_WITH_ZMASK(ADD, _mm_maskz_add_sd, 0x0);
+  DOONE_WITH_ZMASK(DIV, _mm_maskz_div_sd, 0x1);
+  DOONE_WITH_ZMASK(MAX, _mm_maskz_max_sd, 0x1);
+  DOONE_WITH_ZMASK(MIN, _mm_maskz_min_sd, 0x1);
+  DOONE_WITH_ZMASK(MUL, _mm_maskz_mul_sd, 0x1);
+  DOONE_WITH_ZMASK(SUB, _mm_maskz_sub_sd, 0x0);
+
+  // intrinsics with zero masking and rounding mode
+  DOONE_WITH_ZMASK_ROUND(ADD, _mm_maskz_add_round_sd, 0x0,
+                         _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_ZMASK_ROUND(DIV, _mm_maskz_div_round_sd, 0x1,
+                         _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_ZMASK_ROUND(MAX, _mm_maskz_max_round_sd, 0x0,
+                         _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_ZMASK_ROUND(MIN, _mm_maskz_min_round_sd, 0x1,
+                         _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_ZMASK_ROUND(MUL, _mm_maskz_mul_round_sd, 0x1,
+                         _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_ZMASK_ROUND(SUB, _mm_maskz_sub_round_sd, 0x1,
+                         _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/mm_op_sd.reference_output b/SingleSource/UnitTests/Vector/AVX512/mm_op_sd.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/mm_op_sd.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/mm_op_ss.c b/SingleSource/UnitTests/Vector/AVX512/mm_op_ss.c
new file mode 100644
index 0000000..2c7780f
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/mm_op_ss.c
@@ -0,0 +1,306 @@
+#include "m512_test_util.h"
+#include <memory.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * Here we check for _mm_[mask|maskz]_[add|div|max|min|mul|sub]_[round]_ss
+ * intrinsics.
+ */
+
+int show_op =
+#ifdef SHOW_OP
+    1
+#else
+    0
+#endif
+    ;
+
+typedef enum { ASSIGN, ADD, DIV, MAX, MIN, MUL, SUB } OPER;
+
+static void NOINLINE intop(OPER op, float ivalout[4], float ivalop1[4],
+                           float ivalop2[4]) {
+  int i;
+  int handled = 0;
+
+  /* ivalout is a pointer parameter, so spell out the element count. */
+  memset(ivalout, 0, 4 * sizeof(ivalout[0]));
+  for (i = 0; i < 4; i += 1) {
+    switch (op) {
+    case ASSIGN:
+      handled = 1;
+      ivalout[i] = ivalop1[i];
+      break;
+    case ADD:
+      handled = 1;
+      ivalout[i] = ivalop1[i] + ivalop2[i];
+      break;
+    case DIV:
+      handled = 1;
+      ivalout[i] = ivalop1[i] / ivalop2[i];
+      break;
+    case MAX:
+      handled = 1;
+      ivalout[i] = (ivalop1[i] > ivalop2[i]) ? ivalop1[i] : ivalop2[i];
+      break;
+    case MIN:
+      handled = 1;
+      ivalout[i] = (ivalop1[i] < ivalop2[i]) ? ivalop1[i] : ivalop2[i];
+      break;
+    case MUL:
+      handled = 1;
+      ivalout[i] = ivalop2[i] * ivalop1[i];
+      break;
+    case SUB:
+      handled = 1;
+      ivalout[i] = ivalop1[i] - ivalop2[i];
+      break;
+    default:
+      printf("FAIL: bad op\n");
+      break;
+    }
+  }
+  if (!handled) {
+    printf("FAIL: unsupported op\n");
+    n_errs++;
+  }
+}
+
+static int NOINLINE check(float val1[], float good[]) {
+  int i;
+  int res = 1;
+  for (i = 0; i < 4; i += 1) {
+    if (val1[i] != good[i]) {
+      res = 0;
+      printf("FAIL: %f != %f\n", val1[i], good[i]);
+    }
+  }
+  return (res);
+}
+
+static int NOINLINE check_mask(float dest[], float val1[], float good[],
+                               int mask, int zeroing) {
+  int i, j;
+  int res = 1;
+
+  // index of the first element that must match the original dest vector
+  j = 1;
+
+  if (mask == 1) {
+    if (val1[0] != good[0]) {
+      res = 0;
+      printf("FAIL: %f != %f\n", val1[0], dest[0]);
+    }
+  } else if (mask == 0) {
+    if (zeroing == 1) {
+      if (val1[0] != 0) {
+        res = 0;
+        printf("FAIL: %f != %f\n", val1[0], dest[0]);
+      }
+    } else {
+      j = 0;
+    }
+  }
+
+  // check other elements of dest vector
+  for (i = j; i < 4; i += 1) {
+    if (val1[i] != dest[i]) {
+      res = 0;
+      printf("FAIL: %f != %f\n", val1[i], dest[i]);
+    }
+  }
+  return (res);
+}
+
+static void NOINLINE print_vec(char *pfx, float ivec[]) {
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  printf("%10.4f %10.4f %10.4f %10.4f\n", ivec[3], ivec[2], ivec[1], ivec[0]);
+}
+
+#define DOONE(OP, FUNC)                                                        \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f32, v1.f32, v2.f32);                                       \
+    vvv.xmm[0] = FUNC(v1.xmm[0], v2.xmm[0]);                                   \
+    passed = check_mask(vvv.f32, vvv.f32, good.f32, 0x1, 0);                   \
+    passed = check(vvv.f32, good.f32);                                         \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f32);                                             \
+      print_vec("Opand2", v2.f32);                                             \
+      print_vec("Scalar", good.f32);                                           \
+      print_vec("Vector", vvv.f32);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_WITH_MASK(OP, FUNC, MMASK)                                       \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f32, v1.f32, v2.f32);                                       \
+    vvv.xmm[0] = FUNC(vvv.xmm[0], MMASK, v1.xmm[0], v2.xmm[0]);                \
+    passed = check_mask(vvv.f32, vvv.f32, good.f32, MMASK, 0);                 \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f32);                                             \
+      print_vec("Opand2", v2.f32);                                             \
+      print_vec("Scalar", good.f32);                                           \
+      print_vec("Vector", vvv.f32);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_WITH_ZMASK(OP, FUNC, MMASK)                                      \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f32, v1.f32, v2.f32);                                       \
+    vvv.xmm[0] = FUNC(MMASK, v1.xmm[0], v2.xmm[0]);                            \
+    passed = check_mask(vvv.f32, vvv.f32, good.f32, MMASK, 1);                 \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f32);                                             \
+      print_vec("Opand2", v2.f32);                                             \
+      print_vec("Scalar", good.f32);                                           \
+      print_vec("Vector", vvv.f32);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_ROUND(OP, FUNC, ROUND)                                           \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f32, v1.f32, v2.f32);                                       \
+    vvv.xmm[0] = FUNC(v1.xmm[0], v2.xmm[0], ROUND);                            \
+    passed = check_mask(vvv.f32, vvv.f32, good.f32, 0x1, 0);                   \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f32);                                             \
+      print_vec("Opand2", v2.f32);                                             \
+      print_vec("Scalar", good.f32);                                           \
+      print_vec("Vector", vvv.f32);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_WITH_MASK_ROUND(OP, FUNC, MMASK, ROUND)                          \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f32, v1.f32, v2.f32);                                       \
+    vvv.xmm[0] = FUNC(vvv.xmm[0], MMASK, v1.xmm[0], v2.xmm[0], ROUND);         \
+    passed = check_mask(vvv.f32, vvv.f32, good.f32, MMASK, 0);                 \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f32);                                             \
+      print_vec("Opand2", v2.f32);                                             \
+      print_vec("Scalar", good.f32);                                           \
+      print_vec("Vector", vvv.f32);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_WITH_ZMASK_ROUND(OP, FUNC, MMASK, ROUND)                         \
+  {                                                                            \
+    int passed = 0;                                                            \
+    intop(OP, good.f32, v1.f32, v2.f32);                                       \
+    vvv.xmm[0] = FUNC(MMASK, v1.xmm[0], v2.xmm[0], ROUND);                     \
+    passed = check_mask(vvv.f32, vvv.f32, good.f32, MMASK, 1);                 \
+    if (!passed) {                                                             \
+      printf("FAIL " #FUNC "\n");                                              \
+      n_errs++;                                                                \
+    }                                                                          \
+    if (!passed || show_op) {                                                  \
+      print_vec("Opand1", v1.f32);                                             \
+      print_vec("Opand2", v2.f32);                                             \
+      print_vec("Scalar", good.f32);                                           \
+      print_vec("Vector", vvv.f32);                                            \
+    }                                                                          \
+  }
+
+int main() {
+  float init1[] = {1, -2, 3, -4, 5, 6, 7, 8, 9, 10, -11, 12, 13, 14, 15, 16};
+  float init2[] = {11, 12, 23, -24, 35, 36, 17, 38,
+                   42, -1, 33, 7,   8,  10, 11, 12};
+
+  V512 v1;
+  V512 v2;
+  V512 good;
+  V512 vvv;
+
+  intop(ASSIGN, v1.f32, init1, 0);
+  intop(ASSIGN, v2.f32, init2, 0);
+  vvv.xmm[0] = _mm_setzero_ps();
+
+  // simple intrinsics
+  DOONE_WITH_MASK(ADD, _mm_mask_add_ss, 0x1);
+  DOONE_WITH_MASK(MAX, _mm_mask_max_ss, 0x1);
+  DOONE_WITH_MASK(MIN, _mm_mask_min_ss, 0x1);
+  DOONE_WITH_MASK(MUL, _mm_mask_mul_ss, 0x1);
+  DOONE_WITH_MASK(SUB, _mm_mask_sub_ss, 0x1);
+
+  // intrinsics with rounding mode
+  DOONE_ROUND(ADD, _mm_add_round_ss,
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_ROUND(DIV, _mm_div_round_ss,
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_ROUND(MAX, _mm_max_round_ss,
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_ROUND(MIN, _mm_min_round_ss,
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_ROUND(MUL, _mm_mul_round_ss,
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_ROUND(SUB, _mm_sub_round_ss,
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+
+  DOONE_WITH_MASK_ROUND(ADD, _mm_mask_add_round_ss, 0x1,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_MASK_ROUND(DIV, _mm_mask_div_round_ss, 0x0,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_MASK_ROUND(MAX, _mm_mask_max_round_ss, 0x1,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_MASK_ROUND(MIN, _mm_mask_min_round_ss, 0x1,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_MASK_ROUND(MUL, _mm_mask_mul_round_ss, 0x0,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_MASK_ROUND(SUB, _mm_mask_sub_round_ss, 0x1,
+                        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+
+  // intrinsics with zero mask
+  DOONE_WITH_ZMASK(ADD, _mm_maskz_add_ss, 0x0);
+  DOONE_WITH_ZMASK(DIV, _mm_maskz_div_ss, 0x1);
+  DOONE_WITH_ZMASK(MAX, _mm_maskz_max_ss, 0x1);
+  DOONE_WITH_ZMASK(MIN, _mm_maskz_min_ss, 0x1);
+  DOONE_WITH_ZMASK(MUL, _mm_maskz_mul_ss, 0x1);
+  DOONE_WITH_ZMASK(SUB, _mm_maskz_sub_ss, 0x0);
+
+  DOONE_WITH_ZMASK_ROUND(ADD, _mm_maskz_add_round_ss, 0x0,
+                         _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_ZMASK_ROUND(DIV, _mm_maskz_div_round_ss, 0x1,
+                         _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_ZMASK_ROUND(MAX, _mm_maskz_max_round_ss, 0x0,
+                         _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_ZMASK_ROUND(MIN, _mm_maskz_min_round_ss, 0x1,
+                         _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_ZMASK_ROUND(MUL, _mm_maskz_mul_round_ss, 0x1,
+                         _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  DOONE_WITH_ZMASK_ROUND(SUB, _mm_maskz_sub_round_ss, 0x1,
+                         _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/mm_op_ss.reference_output b/SingleSource/UnitTests/Vector/AVX512/mm_op_ss.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/mm_op_ss.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/op2_xyz_int.c b/SingleSource/UnitTests/Vector/AVX512/op2_xyz_int.c
new file mode 100644
index 0000000..b96ff8b
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/op2_xyz_int.c
@@ -0,0 +1,273 @@
+
+/*
+ * Test 128- and 256-bit two-operand integer intrinsics,
+ * with masked and zero-masked forms, by comparing
+ * their output with the corresponding 512-bit intrinsic.
+ * Here we check for _mm512_[mask|maskz]_[and|andnot|or|xor|add|max|min|mul|sub]
+ * intrinsics.
+ */
+
+#include "m512_test_util.h"
+#include <stdio.h>
+
+V512 i8_src1;
+V512 i8_src2;
+V512 i16_src1;
+V512 i16_src2;
+V512 i32_src1;
+V512 i32_src2;
+V512 i64_src1;
+V512 i64_src2;
+
+void NOINLINE init() {
+  volatile int i;
+
+  for (i = 0; i < 64; i++) {
+    i8_src1.s8[i] = i;
+    i8_src2.s8[i] = (i & 1) ? i : -i;
+  }
+
+  for (i = 0; i < 32; i++) {
+    i16_src1.s16[i] = i;
+    i16_src2.s16[i] = (i & 1) ? i : -i;
+  }
+
+  for (i = 0; i < 16; i++) {
+    i32_src1.s32[i] = i;
+    i32_src2.s32[i] = (i & 1) ? i : -i;
+  }
+
+  for (i = 0; i < 8; i++) {
+    i64_src1.s64[i] = i;
+    i64_src2.s64[i] = (i & 1) ? i : -i;
+  }
+}
+
+/*
+ * Use "soft update" between tests to make compiler think src was updated.
+ * Prevents PRE'ing a load of src, thus allowing ciscization.
+ * Also prevents PRE'ing intrinsic operations, ensuring we
+ * execute the intended instructions.
+ */
+volatile int vol0 = 0;
+#define soft_v512_update(var) (var).xmmi[vol0] = (var).xmmi[vol0]
+
+/*
+ * Generate a function that tests a packed int64 intrinsic
+ * by implementing the XMM, YMM and ZMM versions, and comparing
+ * the XMM and YMM results with the low part of the ZMM result.
+ *
+ * We test regular, masked and zero masked forms.
+ *
+ * Use GEN_I64_UNIFORM when the core intrinsic name is the same
+ * for all vector lengths, e.g. "add_epi64".  Otherwise use
+ * GEN_I64 to list the different names, e.g. "and_si128" and "and_si256".
+ */
+
+#define GEN_I64_UNIFORM(oper) GEN_I64(oper, oper, oper, oper, oper)
+
+#define GEN_I64(test_name, oper_epi64, oper_xmm, oper_ymm, oper_zmm)           \
+  void NOINLINE do_##test_name() {                                             \
+    V512 xmm_res, ymm_res, zmm_res;                                            \
+    __mmask8 k8 = 0x5a;                                                        \
+                                                                               \
+    /* Non-masked. */                                                          \
+                                                                               \
+    soft_v512_update(i64_src2);                                                \
+    zmm_res.zmmi = _mm512_##oper_zmm(i64_src1.zmmi, i64_src2.zmmi);            \
+                                                                               \
+    /* Masked. */                                                              \
+                                                                               \
+    zmm_res.zmmi = _mm512_setzero_epi32();                                     \
+    ymm_res = zmm_res;                                                         \
+    xmm_res = zmm_res;                                                         \
+                                                                               \
+    soft_v512_update(i64_src2);                                                \
+    zmm_res.zmmi = _mm512_mask_##oper_epi64(zmm_res.zmmi, k8, i64_src1.zmmi,   \
+                                            i64_src2.zmmi);                    \
+                                                                               \
+    /* Zero-masked. */                                                         \
+                                                                               \
+    zmm_res.zmmi = _mm512_set1_epi64(1);                                       \
+    ymm_res = zmm_res;                                                         \
+    xmm_res = zmm_res;                                                         \
+                                                                               \
+    soft_v512_update(i64_src2);                                                \
+    zmm_res.zmmi =                                                             \
+        _mm512_maskz_##oper_epi64(k8, i64_src1.zmmi, i64_src2.zmmi);           \
+  }
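+
+/*
+ * For example, GEN_I64_UNIFORM(add_epi64) defines do_add_epi64(), which runs
+ * _mm512_add_epi64 on i64_src1/i64_src2 and then the _mm512_mask_add_epi64
+ * and _mm512_maskz_add_epi64 forms under mask 0x5a.
+ */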
+
+#define GEN_I32_UNIFORM(oper) GEN_I32(oper, oper, oper, oper, oper)
+
+#define GEN_I32(test_name, oper_epi32, oper_xmm, oper_ymm, oper_zmm)           \
+  void NOINLINE do_##test_name() {                                             \
+    V512 xmm_res, ymm_res, zmm_res;                                            \
+    __mmask16 k16 = 0x7feb;                                                    \
+    __mmask8 k8 = (__mmask8)k16;                                               \
+                                                                               \
+    /* Non-masked. */                                                          \
+                                                                               \
+    soft_v512_update(i32_src2);                                                \
+    zmm_res.zmmi = _mm512_##oper_zmm(i32_src1.zmmi, i32_src2.zmmi);            \
+                                                                               \
+    /* Masked. */                                                              \
+                                                                               \
+    zmm_res.zmmi = _mm512_setzero_epi32();                                     \
+    ymm_res = zmm_res;                                                         \
+    xmm_res = zmm_res;                                                         \
+                                                                               \
+    soft_v512_update(i32_src2);                                                \
+    zmm_res.zmmi = _mm512_mask_##oper_epi32(zmm_res.zmmi, k16, i32_src1.zmmi,  \
+                                            i32_src2.zmmi);                    \
+                                                                               \
+    /* Zero-masked. */                                                         \
+                                                                               \
+    zmm_res.zmmi = _mm512_set1_epi32(1);                                       \
+    ymm_res = zmm_res;                                                         \
+    xmm_res = zmm_res;                                                         \
+                                                                               \
+    soft_v512_update(i32_src2);                                                \
+    zmm_res.zmmi =                                                             \
+        _mm512_maskz_##oper_epi32(k16, i32_src1.zmmi, i32_src2.zmmi);          \
+  }
+
+#define GEN_I16_UNIFORM(oper) GEN_I16(oper, oper, oper, oper, oper)
+
+#define GEN_I16(test_name, oper_epi16, oper_xmm, oper_ymm, oper_zmm)           \
+  void NOINLINE do_##test_name() {                                             \
+    V512 xmm_res, ymm_res, zmm_res;                                            \
+    __mmask32 k32 = 0x7febeb7f;                                                \
+    __mmask16 k16 = (__mmask16)k32;                                            \
+    __mmask8 k8 = (__mmask8)k16;                                               \
+                                                                               \
+    /* Non-masked. */                                                          \
+                                                                               \
+    soft_v512_update(i16_src2);                                                \
+    zmm_res.zmmi = _mm512_##oper_zmm(i16_src1.zmmi, i16_src2.zmmi);            \
+                                                                               \
+    /* Masked. */                                                              \
+                                                                               \
+    zmm_res.zmmi = _mm512_setzero_epi32();                                     \
+    ymm_res = zmm_res;                                                         \
+    xmm_res = zmm_res;                                                         \
+                                                                               \
+    soft_v512_update(i16_src2);                                                \
+    zmm_res.zmmi = _mm512_mask_##oper_epi16(zmm_res.zmmi, k32, i16_src1.zmmi,  \
+                                            i16_src2.zmmi);                    \
+                                                                               \
+    /* Zero-masked. */                                                         \
+                                                                               \
+    zmm_res.zmmi = _mm512_set1_epi32(1);                                       \
+    ymm_res = zmm_res;                                                         \
+    xmm_res = zmm_res;                                                         \
+                                                                               \
+    soft_v512_update(i16_src2);                                                \
+    zmm_res.zmmi =                                                             \
+        _mm512_maskz_##oper_epi16(k32, i16_src1.zmmi, i16_src2.zmmi);          \
+  }
+
+#define GEN_I8_UNIFORM(oper) GEN_I8(oper, oper, oper, oper, oper)
+
+#define GEN_I8(test_name, oper_epi8, oper_xmm, oper_ymm, oper_zmm)             \
+  void NOINLINE do_##test_name() {                                             \
+    V512 xmm_res, ymm_res, zmm_res;                                            \
+    __mmask64 k64 = 0xa55a7febeb7f5aa5U;                                       \
+    __mmask32 k32 = (__mmask32)k64;                                            \
+    __mmask16 k16 = (__mmask16)k32;                                            \
+                                                                               \
+    /* Non-masked. */                                                          \
+                                                                               \
+    soft_v512_update(i8_src2);                                                 \
+    zmm_res.zmmi = _mm512_##oper_zmm(i8_src1.zmmi, i8_src2.zmmi);              \
+                                                                               \
+    /* Masked. */                                                              \
+                                                                               \
+    zmm_res.zmmi = _mm512_setzero_epi32();                                     \
+    ymm_res = zmm_res;                                                         \
+    xmm_res = zmm_res;                                                         \
+                                                                               \
+    soft_v512_update(i8_src2);                                                 \
+    zmm_res.zmmi = _mm512_mask_##oper_epi8(zmm_res.zmmi, k64, i8_src1.zmmi,    \
+                                           i8_src2.zmmi);                      \
+                                                                               \
+    /* Zero-masked. */                                                         \
+                                                                               \
+    zmm_res.zmmi = _mm512_set1_epi32(1);                                       \
+    ymm_res = zmm_res;                                                         \
+    xmm_res = zmm_res;                                                         \
+                                                                               \
+    soft_v512_update(i8_src2);                                                 \
+    zmm_res.zmmi = _mm512_maskz_##oper_epi8(k64, i8_src1.zmmi, i8_src2.zmmi);  \
+  }
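+
+/*
+ * Mask widths track the element count of a 512-bit vector: 8 qwords,
+ * 16 dwords, 32 words or 64 bytes per register, so the masked
+ * epi64/epi32/epi16/epi8 forms take __mmask8, __mmask16, __mmask32 and
+ * __mmask64 respectively.  The narrower masks declared in each generator
+ * are simply truncations of the widest one.
+ */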
+
+GEN_I32(and_si512, and_epi32, and_si128, and_si256, and_si512)
+GEN_I32(andnot_si512, andnot_epi32, andnot_si128, andnot_si256, andnot_si512)
+GEN_I32(or_si512, or_epi32, or_si128, or_si256, or_si512)
+GEN_I32(xor_si512, xor_epi32, xor_si128, xor_si256, xor_si512)
+
+GEN_I64(and_epi64, and_epi64, and_si128, and_si256, and_epi64)
+GEN_I64(andnot_epi64, andnot_epi64, andnot_si128, andnot_si256, andnot_epi64)
+GEN_I64(or_epi64, or_epi64, or_si128, or_si256, or_epi64)
+GEN_I64(xor_epi64, xor_epi64, xor_si128, xor_si256, xor_epi64)
+
+GEN_I64_UNIFORM(add_epi64)
+GEN_I64_UNIFORM(max_epi64)
+GEN_I64_UNIFORM(max_epu64)
+GEN_I64_UNIFORM(min_epi64)
+GEN_I64_UNIFORM(min_epu64)
+GEN_I64_UNIFORM(mul_epi32) /* Yes, these are really I64 vector elements. */
+GEN_I64_UNIFORM(mul_epu32) /* Yes, these are really I64 vector elements. */
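+
+/*
+ * _mm512_mul_epi32 and _mm512_mul_epu32 multiply the low 32 bits of each
+ * 64-bit lane and produce full 64-bit products, which is why the two lines
+ * above sit with the epi64 generators despite the epi32 suffix.
+ */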
+
+GEN_I32(and_epi32, and_epi32, and_si128, and_si256, and_epi32)
+GEN_I32(andnot_epi32, andnot_epi32, andnot_si128, andnot_si256, andnot_epi32)
+GEN_I32(or_epi32, or_epi32, or_si128, or_si256, or_epi32)
+GEN_I32(xor_epi32, xor_epi32, xor_si128, xor_si256, xor_epi32)
+
+GEN_I32_UNIFORM(add_epi32)
+GEN_I32_UNIFORM(max_epi32)
+GEN_I32_UNIFORM(max_epu32)
+GEN_I32_UNIFORM(min_epi32)
+GEN_I32_UNIFORM(min_epu32)
+GEN_I32_UNIFORM(sub_epi32)
+
+int main() {
+  init();
+
+  do_and_si512();
+  do_andnot_si512();
+  do_or_si512();
+  do_xor_si512();
+
+  do_and_epi64();
+  do_andnot_epi64();
+  do_or_epi64();
+  do_xor_epi64();
+
+  do_add_epi64();
+  do_max_epi64();
+  do_max_epu64();
+  do_min_epi64();
+  do_min_epu64();
+  do_mul_epi32();
+  do_mul_epu32();
+
+  do_and_epi32();
+  do_andnot_epi32();
+  do_or_epi32();
+  do_xor_epi32();
+
+  do_add_epi32();
+  do_max_epi32();
+  do_max_epu32();
+  do_min_epi32();
+  do_min_epu32();
+  do_sub_epi32();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/op2_xyz_int.reference_output b/SingleSource/UnitTests/Vector/AVX512/op2_xyz_int.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/op2_xyz_int.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_m512.c b/SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_m512.c
new file mode 100644
index 0000000..abf320f
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_m512.c
@@ -0,0 +1,83 @@
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+/*
+ * Here we check for _mm512_[mask|maskz]_[rsqrt14|rcp14] intrinsics.
+ */
+#define CHECK_PD(op)                                                           \
+  {                                                                            \
+    volatile __m512d r = _mm512_##op##_pd(v1);                                 \
+    check_equal_ndf(&r, &exp, 8, "_mm512_" #op "_pd", __LINE__);               \
+    k8 = 0xAA;                                                                 \
+    r = _mm512_mask_##op##_pd(undef, k8, v1);                                  \
+    check_equal_ndf(&r, &expm, 8, "_mm512_mask_" #op "_pd{1}", __LINE__);      \
+    r = _mm512_maskz_##op##_pd(k8, v1);                                        \
+    check_equal_ndf(&r, &expzm, 8, "_mm512_maskz_" #op "_pd{0}", __LINE__);    \
+  }
+
+#define DECL_PD(op, srcv, expv)                                                \
+  void NOINLINE do_##op##_pd() {                                               \
+    __mmask8 k8;                                                               \
+    volatile __m512d v1 = _mm512_set1_pd((srcv));                              \
+                                                                               \
+    volatile __m512d undef = _mm512_set1_pd(3.0);                              \
+    __m512d exp = _mm512_set1_pd(expv);                                        \
+    __m512d expm =                                                             \
+        _mm512_set_pd((expv), 3.0, (expv), 3.0, (expv), 3.0, (expv), 3.0);     \
+    __m512d expzm = _mm512_set_pd((expv), 0, (expv), 0, (expv), 0, (expv), 0); \
+                                                                               \
+    CHECK_PD(op);                                                              \
+  }
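+
+/*
+ * With k8 = 0xAA the odd-numbered elements are selected, so expm keeps the
+ * "undef" value 3.0 in elements 0, 2, 4 and 6 and the expected result in
+ * elements 1, 3, 5 and 7, while expzm zeroes the unselected elements
+ * instead.
+ */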
+
+#define TEST_PD(op) do_##op##_pd()
+
+// PS version starts here.
+
+#define CHECK_PS(op)                                                           \
+  {                                                                            \
+    volatile __m512 r = _mm512_##op##_ps(v1);                                  \
+    check_equal_nsf(&r, &exp, 16, "_mm512_" #op "_ps", __LINE__);              \
+    k8 = 0xAAAA;                                                               \
+    r = _mm512_mask_##op##_ps(undef, k8, v1);                                  \
+    check_equal_nsf(&r, &expm, 16, "_mm512_mask_" #op "_ps{1}", __LINE__);     \
+    r = _mm512_maskz_##op##_ps(k8, v1);                                        \
+    check_equal_nsf(&r, &expzm, 16, "_mm512_maskz_" #op "_ps{0}", __LINE__);   \
+  }
+
+#define DECL_PS(op, srcv, expv)                                                \
+  void NOINLINE do_##op##_ps() {                                               \
+    __mmask16 k8;                                                              \
+    volatile __m512 v1 = _mm512_set1_ps((srcv));                               \
+                                                                               \
+    volatile __m512 undef = _mm512_set1_ps(3.0);                               \
+    __m512 exp = _mm512_set1_ps(expv);                                         \
+    __m512 expm =                                                              \
+        _mm512_set_ps((expv), 3.0, (expv), 3.0, (expv), 3.0, (expv), 3.0,      \
+                      (expv), 3.0, (expv), 3.0, (expv), 3.0, (expv), 3.0);     \
+    __m512 expzm = _mm512_set_ps((expv), 0, (expv), 0, (expv), 0, (expv), 0,   \
+                                 (expv), 0, (expv), 0, (expv), 0, (expv), 0);  \
+                                                                               \
+    CHECK_PS(op);                                                              \
+  }
+
+#define TEST_PS(op) do_##op##_ps()
+
+DECL_PD(rsqrt14, 0.25, 2.0)
+DECL_PS(rsqrt14, 0.16f, 2.5f)
+DECL_PD(rcp14, 0.5, 2.0)
+DECL_PS(rcp14, 0.4f, 2.5f)
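+
+/*
+ * Expected values follow directly from the math: 1/sqrt(0.25) = 2.0,
+ * 1/sqrt(0.16) = 2.5, 1/0.5 = 2.0 and 1/0.4 = 2.5.  The rcp14/rsqrt14
+ * instructions only guarantee a relative error of at most 2^-14, so the
+ * comparison helpers are assumed to tolerate that much deviation.
+ */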
+
+int main(int argc, char *argv[]) {
+  TEST_PD(rcp14);
+  TEST_PS(rcp14);
+  TEST_PD(rsqrt14);
+  TEST_PS(rsqrt14);
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_m512.reference_output b/SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_m512.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_m512.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_scalar.c b/SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_scalar.c
new file mode 100644
index 0000000..6092fd3
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_scalar.c
@@ -0,0 +1,100 @@
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+/*
+ * Here we check for _mm_[mask|maskz]_[rsqrt14|rcp14] intrinsics.
+ */
+#define CHECK_SCALAR_SD(op)                                                    \
+  {                                                                            \
+    volatile __m128d r = _mm_##op##_sd(v1, v2);                                \
+    check_equal_ndf(&r, &exp, 2, "_mm_" #op "_sd", __LINE__);                  \
+    k8 = 1;                                                                    \
+    r = _mm_mask_##op##_sd(undef, k8, v1, v2);                                 \
+    check_equal_ndf(&r, &expm1, 2, "_mm_mask_" #op "_sd{1}", __LINE__);        \
+    k8 = 0;                                                                    \
+    r = _mm_mask_##op##_sd(undef, k8, v1, v2);                                 \
+    check_equal_ndf(&r, &expm0, 2, "_mm_mask_" #op "_sd{0}", __LINE__);        \
+    k8 = 1;                                                                    \
+    r = _mm_maskz_##op##_sd(k8, v1, v2);                                       \
+    check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_" #op "_sd{1}", __LINE__);      \
+    k8 = 0;                                                                    \
+    r = _mm_maskz_##op##_sd(k8, v1, v2);                                       \
+    check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_" #op "_sd{0}", __LINE__);      \
+  }
+
+#define DECL_SCALAR_SD(op, src1v, src2v, expv)                                 \
+  void NOINLINE do_##op##_sd() {                                               \
+    __mmask8 k8;                                                               \
+    volatile __m128d v1 = _mm_set_pd(2.0 /* upr */, (src1v) /* lwr */);        \
+    volatile __m128d v2 = _mm_set_pd(4.0 /* upr */, (src2v) /* lwr */);        \
+                                                                               \
+    volatile __m128d undef = _mm_set_pd(333.0 /* upr */, 111.0 /* lwr */);     \
+    __m128d exp = _mm_set_pd(2.0 /* upr */, (expv) /* lwr */);                 \
+    __m128d expm1 = _mm_set_pd(2.0 /* upr */, (expv) /* lwr */);               \
+    __m128d expm0 = _mm_set_pd(2.0 /* upr */, 111.0 /* lwr */);                \
+    __m128d expzm1 = _mm_set_pd(2.0 /* upr */, (expv) /* lwr */);              \
+    __m128d expzm0 = _mm_set_pd(2.0 /* upr */, 0.0 /* lwr */);                 \
+                                                                               \
+    CHECK_SCALAR_SD(op);                                                       \
+  }
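+
+/*
+ * For the scalar forms the upper element of the result is always taken from
+ * v1 (here 2.0); only the low element is computed.  The five expectations
+ * cover the unmasked result, merge-masking with k = 1 and k = 0 (low element
+ * taken from "undef", 111.0), and zero-masking with k = 1 and k = 0 (low
+ * element zeroed).
+ */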
+
+#define TEST_SCALAR_SD(op) do_##op##_sd()
+
+#define CHECK_SCALAR_SS(op)                                                    \
+  {                                                                            \
+    volatile __m128 r = _mm_##op##_ss(v1, v2);                                 \
+    check_equal_nsf(&r, &exp, 4, "_mm_" #op "_ss", __LINE__);                  \
+    k8 = 1;                                                                    \
+    r = _mm_mask_##op##_ss(undef, k8, v1, v2);                                 \
+    check_equal_nsf(&r, &expm1, 4, "_mm_mask_" #op "_ss", __LINE__);           \
+    k8 = 0;                                                                    \
+    r = _mm_mask_##op##_ss(undef, k8, v1, v2);                                 \
+    check_equal_nsf(&r, &expm0, 4, "_mm_mask_" #op "_ss", __LINE__);           \
+    k8 = 1;                                                                    \
+    r = _mm_maskz_##op##_ss(k8, v1, v2);                                       \
+    check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_" #op "_ss", __LINE__);         \
+    k8 = 0;                                                                    \
+    r = _mm_maskz_##op##_ss(k8, v1, v2);                                       \
+    check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_" #op "_ss", __LINE__);         \
+  }
+
+#define DECL_SCALAR_SS(op, src1v, src2v, expv)                                 \
+  void NOINLINE do_##op##_ss() {                                               \
+    __mmask8 k8;                                                               \
+    volatile __m128 v1 =                                                       \
+        _mm_set_ps(4.0f /* upr */, 3.0f, 2.0f, (src1v) /* lwr */);             \
+    volatile __m128 v2 =                                                       \
+        _mm_set_ps(8.0f /* upr */, 7.0f, 6.0f, (src2v) /* lwr */);             \
+                                                                               \
+    volatile __m128 undef =                                                    \
+        _mm_set_ps(777.0f /* upr */, 555.0f, 333.0f, 111.0f /* lwr */);        \
+    __m128 exp = _mm_set_ps(4.0f /* upr */, 3.0f, 2.0f, (expv) /* lwr */);     \
+    __m128 expm1 = _mm_set_ps(4.0f /* upr */, 3.0f, 2.0f, (expv) /* lwr */);   \
+    __m128 expm0 = _mm_set_ps(4.0f /* upr */, 3.0f, 2.0f, 111.0f /* lwr */);   \
+    __m128 expzm1 = _mm_set_ps(4.0f /* upr */, 3.0f, 2.0f, (expv) /* lwr */);  \
+    __m128 expzm0 = _mm_set_ps(4.0f /* upr */, 3.0f, 2.0f, 0.0f /* lwr */);    \
+                                                                               \
+    CHECK_SCALAR_SS(op);                                                       \
+  }
+
+#define TEST_SCALAR_SS(op) do_##op##_ss()
+
+DECL_SCALAR_SD(rsqrt14, 17.0, 0.25, 2.0)
+DECL_SCALAR_SS(rsqrt14, 17.0f, 0.16f, 2.5f)
+DECL_SCALAR_SD(rcp14, 17.0, 0.5, 2.0)
+DECL_SCALAR_SS(rcp14, 17.0f, 0.4f, 2.5f)
+
+int main(int argc, char *argv[]) {
+  TEST_SCALAR_SD(rcp14);
+  TEST_SCALAR_SS(rcp14);
+  TEST_SCALAR_SD(rsqrt14);
+  TEST_SCALAR_SS(rsqrt14);
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_scalar.reference_output b/SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_scalar.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/rcp_rsqrt_14_scalar.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/reduce.c b/SingleSource/UnitTests/Vector/AVX512/reduce.c
new file mode 100644
index 0000000..f2dfd52
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/reduce.c
@@ -0,0 +1,731 @@
+#include "m512_test_util.h"
+#include <math.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * Test reduce instructions.
+ * Here we check for _mm512_[mask_]reduce_[add|mul|min|max|and|or] intrinsics.
+ */
+
+typedef long long s64;
+typedef unsigned long long u64;
+
+typedef float f32;
+typedef double f64;
+
+typedef int s32;
+typedef unsigned int u32;
+
+int verbose = 0;
+#define VERBOSE (verbose > 1)
+#define SHOW_OP (verbose > 2)
+
+typedef enum {
+  ASSIGN,
+  ADD,
+  REDUCE_ADD,
+  REDUCE_MUL,
+  REDUCE_MIN,
+  REDUCE_MAX,
+  REDUCE_GMIN,
+  REDUCE_GMAX,
+  REDUCE_OR,
+  REDUCE_AND
+} OPER;
+
+__mmask16 mask_true = 0xffff;
+
+#define MASK(mask, n) ((mask & (0x1 << n)) != 0)
+
+#define IMin(i, j) (((i) <= (j)) ? (i) : (j))
+#define IMax(i, j) (((i) >= (j)) ? (i) : (j))
+
+#define MULOP(a, b) (a * b)
+#define ADDOP(a, b) (a + b)
+#define OROP(a, b) (a | b)
+#define ANDOP(a, b) (a & b)
+#define GMINOP(a, b) fmin(a, b)
+#define GMAXOP(a, b) fmax(a, b)
+
+#define DO_MASK_COPY(len, output, mask, input, def)                            \
+  {                                                                            \
+    int n;                                                                     \
+                                                                               \
+    for (n = 0; n < len; n += 1) {                                             \
+      if (MASK(mask, n)) {                                                     \
+        output[n] = input[n];                                                  \
+      } else {                                                                 \
+        output[n] = def;                                                       \
+      }                                                                        \
+    }                                                                          \
+  }
+
+#define DO_REDUCE_16(res, mask, input, dtype, oper, initval)                   \
+  {                                                                            \
+    dtype dtype##tmp[4];                                                       \
+    V512 vtmp;                                                                 \
+    DO_MASK_COPY(16, vtmp.dtype, mask, input, initval);                        \
+                                                                               \
+    dtype##tmp[0] = oper(vtmp.dtype[0], vtmp.dtype[4]);                        \
+    dtype##tmp[1] = oper(vtmp.dtype[1], vtmp.dtype[5]);                        \
+    dtype##tmp[2] = oper(vtmp.dtype[2], vtmp.dtype[6]);                        \
+    dtype##tmp[3] = oper(vtmp.dtype[3], vtmp.dtype[7]);                        \
+                                                                               \
+    dtype##tmp[0] = oper(dtype##tmp[0], vtmp.dtype[8]);                        \
+    dtype##tmp[1] = oper(dtype##tmp[1], vtmp.dtype[9]);                        \
+    dtype##tmp[2] = oper(dtype##tmp[2], vtmp.dtype[10]);                       \
+    dtype##tmp[3] = oper(dtype##tmp[3], vtmp.dtype[11]);                       \
+                                                                               \
+    dtype##tmp[0] = oper(dtype##tmp[0], vtmp.dtype[12]);                       \
+    dtype##tmp[1] = oper(dtype##tmp[1], vtmp.dtype[13]);                       \
+    dtype##tmp[2] = oper(dtype##tmp[2], vtmp.dtype[14]);                       \
+    dtype##tmp[3] = oper(dtype##tmp[3], vtmp.dtype[15]);                       \
+                                                                               \
+    dtype##tmp[0] = oper(dtype##tmp[0], dtype##tmp[1]);                        \
+    dtype##tmp[2] = oper(dtype##tmp[2], dtype##tmp[3]);                        \
+                                                                               \
+    res = oper(dtype##tmp[0], dtype##tmp[2]);                                  \
+  }
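+
+/*
+ * DO_REDUCE_16 first replaces masked-off elements with initval (the
+ * operation's identity), then accumulates lane j with lanes j+4, j+8 and
+ * j+12 for j = 0..3 and combines the four partial results pairwise.  The
+ * fixed order keeps the scalar reference deterministic, which matters for
+ * the floating-point add/mul reductions.
+ */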
+
+#define DO_REDUCE_8(res, mask, input, dtype, oper, initval)                    \
+  {                                                                            \
+    dtype dtype##tmp[4];                                                       \
+    V512 vtmp;                                                                 \
+    DO_MASK_COPY(8, vtmp.dtype, mask, input, initval);                         \
+                                                                               \
+    dtype##tmp[0] = oper(vtmp.dtype[0], vtmp.dtype[4]);                        \
+    dtype##tmp[1] = oper(vtmp.dtype[1], vtmp.dtype[5]);                        \
+    dtype##tmp[2] = oper(vtmp.dtype[2], vtmp.dtype[6]);                        \
+    dtype##tmp[3] = oper(vtmp.dtype[3], vtmp.dtype[7]);                        \
+                                                                               \
+    dtype##tmp[0] = oper(dtype##tmp[0], dtype##tmp[1]);                        \
+    dtype##tmp[2] = oper(dtype##tmp[2], dtype##tmp[3]);                        \
+                                                                               \
+    res = oper(dtype##tmp[0], dtype##tmp[2]);                                  \
+  }
+
+static int NOINLINE mask_s32_reduce_op(OPER op, __mmask16 mask,
+                                       int s32op1[16]) {
+  int handled = 0;
+  int res;
+
+  switch (op) {
+
+  case REDUCE_ADD:
+    handled = 1;
+    DO_REDUCE_16(res, mask, s32op1, s32, ADDOP, 0);
+    break;
+
+  case REDUCE_MUL:
+    handled = 1;
+    DO_REDUCE_16(res, mask, s32op1, s32, MULOP, 1);
+    break;
+
+  case REDUCE_MIN:
+    handled = 1;
+    DO_REDUCE_16(res, mask, s32op1, s32, IMin, 0x7fffffff);
+    break;
+
+  case REDUCE_MAX:
+    handled = 1;
+    DO_REDUCE_16(res, mask, s32op1, s32, IMax, 0x80000000);
+    break;
+
+  case REDUCE_OR:
+    handled = 1;
+    DO_REDUCE_16(res, mask, s32op1, s32, OROP, 0);
+    break;
+
+  case REDUCE_AND:
+    handled = 1;
+    DO_REDUCE_16(res, mask, s32op1, s32, ANDOP, 0xffffffff);
+    break;
+
+  default:
+    printf("FAIL: mask_s32_reduce_op: bad op\n");
+    exit(1);
+    break;
+  }
+  if (!handled) {
+    printf("FAIL: mask_s32_reduce_op: unsupported op\n");
+  }
+  return (res);
+}
+
+static int NOINLINE mask_u32_reduce_op(OPER op, __mmask16 mask,
+                                       u32 u32op1[16]) {
+  int handled = 0;
+  int res;
+
+  switch (op) {
+
+  case REDUCE_MIN:
+    handled = 1;
+    DO_REDUCE_16(res, mask, u32op1, u32, IMin, 0xffffffff);
+    break;
+
+  case REDUCE_MAX:
+    handled = 1;
+    DO_REDUCE_16(res, mask, u32op1, u32, IMax, 0x00000000);
+    break;
+
+  default:
+    printf("FAIL: mask_u32_reduce_op: bad op\n");
+    exit(1);
+    break;
+  }
+  if (!handled) {
+    printf("FAIL: mask_u32_reduce_op: unsupported op\n");
+  }
+  return (res);
+}
+
+static void NOINLINE init_s32(int s32out[16], int s32op1[16]) {
+  int i = 0;
+  for (i = 0; i < 16; i++) {
+    s32out[i] = s32op1[i];
+  }
+}
+
+static void NOINLINE init_f32(float f32out[16], float f32op1[16]) {
+  int i = 0;
+  for (i = 0; i < 16; i++) {
+    f32out[i] = f32op1[i];
+  }
+}
+
+static float NOINLINE mask_f32_reduce_op(OPER op, __mmask16 mask,
+                                         float valop1[16]) {
+  int handled = 0;
+  float res;
+  union {
+    float f32init;
+    int s32init;
+  } init;
+
+  switch (op) {
+
+  case REDUCE_ADD:
+    handled = 1;
+    DO_REDUCE_16(res, mask, valop1, f32, ADDOP, 0.0);
+    break;
+
+  case REDUCE_MUL:
+    handled = 1;
+    DO_REDUCE_16(res, mask, valop1, f32, MULOP, 1.0);
+    break;
+
+  case REDUCE_GMIN:
+    handled = 1;
+    init.s32init = 0x7f800000; /* +inf */
+    DO_REDUCE_16(res, mask, valop1, f32, GMINOP, init.f32init);
+    break;
+
+  case REDUCE_GMAX:
+    handled = 1;
+    init.s32init = 0xff800000; /* -inf */
+    DO_REDUCE_16(res, mask, valop1, f32, GMAXOP, init.f32init);
+    break;
+
+  default:
+    printf("FAIL: mask_f32_reduce_op: bad op\n");
+    exit(1);
+    break;
+  }
+  if (!handled) {
+    printf("FAIL: mask_f32_reduce_op: unsupported op\n");
+  }
+  return (res);
+}
+
+static void NOINLINE init_f64(double f64out[8], double f64op1[8]) {
+  int i = 0;
+  for (i = 0; i < 8; i++) {
+    f64out[i] = f64op1[i];
+  }
+}
+
+static double NOINLINE mask_f64_reduce_op(OPER op, __mmask16 mask,
+                                          double valop1[8]) {
+  int handled = 0;
+  double res;
+  union {
+    double f64init;
+    int s32init[2];
+  } init;
+
+  switch (op) {
+
+  case REDUCE_ADD:
+    handled = 1;
+    DO_REDUCE_8(res, mask, valop1, f64, ADDOP, 0.0);
+    break;
+
+  case REDUCE_MUL:
+    handled = 1;
+    DO_REDUCE_8(res, mask, valop1, f64, MULOP, 1.0);
+    break;
+
+  case REDUCE_GMIN:
+    handled = 1;
+    init.s32init[0] = 0x00000000; /* +inf */
+    init.s32init[1] = 0x7ff00000; /* +inf */
+    DO_REDUCE_8(res, mask, valop1, f64, GMINOP, init.f64init);
+    break;
+
+  case REDUCE_GMAX:
+    handled = 1;
+    init.s32init[0] = 0x00000000; /* -inf */
+    init.s32init[1] = 0xfff00000; /* -inf */
+    DO_REDUCE_8(res, mask, valop1, f64, GMAXOP, init.f64init);
+    break;
+
+  default:
+    printf("FAIL: mask_f64_reduce_op: bad op\n");
+    exit(1);
+    break;
+  }
+  if (!handled) {
+    printf("FAIL: mask_f64_reduce_op: unsupported op\n");
+  }
+  return (res);
+}
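+
+/*
+ * The s32init[]/f64init union writes the IEEE-754 bit pattern of +/-infinity
+ * one 32-bit word at a time: 0x7ff0000000000000 is +inf and
+ * 0xfff0000000000000 is -inf, with the low word stored in s32init[0]
+ * (assuming a little-endian target, as on x86).
+ */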
+
+static void NOINLINE print_s32(char *pfx, int var) {
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  printf("%5d", var);
+  printf("\n");
+}
+
+static void NOINLINE print_u32(char *pfx, u32 var) {
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  printf("%5u", var);
+  printf("\n");
+}
+
+static void NOINLINE print_f32(char *pfx, float var) {
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  printf("%5.2f", var);
+  printf("\n");
+}
+
+static void NOINLINE print_f64(char *pfx, double var) {
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  printf("%5.2lf", var);
+  printf("\n");
+}
+
+static void NOINLINE print_ivec(char *pfx, int ivec[]) {
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  char *fmt = "%5d %5d %5d %5d ";
+  printf(fmt, ivec[15], ivec[14], ivec[13], ivec[12]);
+  printf(fmt, ivec[11], ivec[10], ivec[9], ivec[8]);
+  printf(fmt, ivec[7], ivec[6], ivec[5], ivec[4]);
+  printf(fmt, ivec[3], ivec[2], ivec[1], ivec[0]);
+  printf("\n");
+}
+
+static void NOINLINE print_uvec(char *pfx, u32 ivec[]) {
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  char *fmt = "%5u %5u %5u %5u ";
+  printf(fmt, ivec[15], ivec[14], ivec[13], ivec[12]);
+  printf(fmt, ivec[11], ivec[10], ivec[9], ivec[8]);
+  printf(fmt, ivec[7], ivec[6], ivec[5], ivec[4]);
+  printf(fmt, ivec[3], ivec[2], ivec[1], ivec[0]);
+  printf("\n");
+}
+
+static void NOINLINE print_fvec(char *pfx, float fvec[]) {
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  char *fmt = "%5.2f %5.2f %5.2f %5.2f ";
+  printf(fmt, fvec[15], fvec[14], fvec[13], fvec[12]);
+  printf(fmt, fvec[11], fvec[10], fvec[9], fvec[8]);
+  printf(fmt, fvec[7], fvec[6], fvec[5], fvec[4]);
+  printf(fmt, fvec[3], fvec[2], fvec[1], fvec[0]);
+  printf("\n");
+}
+
+static void NOINLINE print_dvec(char *pfx, double dvec[]) {
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  char *fmt = "%5.2lf %5.2lf %5.2lf %5.2lf ";
+  printf(fmt, dvec[7], dvec[6], dvec[5], dvec[4]);
+  printf(fmt, dvec[3], dvec[2], dvec[1], dvec[0]);
+  printf("\n");
+}
+
+#define PRINT_MASK(bits, width, pfx, var)                                      \
+  print_mask(bits, "%" #width "d ", pfx, var)
+
+static void NOINLINE print_mask(int bits, char *fmt, char *pfx,
+                                __mmask16 mask) {
+  int i;
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  for (i = bits; i >= 1; i -= 1) {
+    printf(fmt, MASK(mask, (i - 1)));
+  }
+  printf("\n");
+}
+
+#define CHECK_PRINT(STATUS, FUNC)                                              \
+  if (!(STATUS)) {                                                             \
+    printf("FAIL " #FUNC "\n");                                                \
+    err += 1;                                                                  \
+  } else if (VERBOSE) {                                                        \
+    printf("PASS " #FUNC "\n");                                                \
+  }
+
+#define CHECK_REDUCE_S32(FUNC)                                                 \
+  {                                                                            \
+    int passed = (result == mresult);                                          \
+    CHECK_PRINT(passed, FUNC);                                                 \
+    if (!passed || SHOW_OP) {                                                  \
+      print_ivec("Opand1", v1.s32);                                            \
+      print_s32("Scalar", result);                                             \
+      print_s32("Vector", mresult);                                            \
+    }                                                                          \
+  }
+
+#define CHECK_REDUCE_U32(FUNC)                                                 \
+  {                                                                            \
+    int passed = (result == mresult);                                          \
+    CHECK_PRINT(passed, FUNC);                                                 \
+    if (!passed || SHOW_OP) {                                                  \
+      print_uvec("Opand1", v1.u32);                                            \
+      print_u32("Scalar", result);                                             \
+      print_u32("Vector", mresult);                                            \
+    }                                                                          \
+  }
+
+#define CHECK_MASK_REDUCE_S32(FUNC)                                            \
+  {                                                                            \
+    int passed = (result == mresult);                                          \
+    CHECK_PRINT(passed, FUNC);                                                 \
+    if (!passed || SHOW_OP) {                                                  \
+      print_ivec("Opand1", v1.s32);                                            \
+      PRINT_MASK(16, 5, "  Mask", mask);                                       \
+      print_s32("Scalar", result);                                             \
+      print_s32("Vector", mresult);                                            \
+    }                                                                          \
+  }
+
+#define CHECK_MASK_REDUCE_U32(FUNC)                                            \
+  {                                                                            \
+    int passed = (result == mresult);                                          \
+    CHECK_PRINT(passed, FUNC);                                                 \
+    if (!passed || SHOW_OP) {                                                  \
+      print_uvec("Opand1", v1.u32);                                            \
+      PRINT_MASK(16, 5, "  Mask", mask);                                       \
+      print_u32("Scalar", result);                                             \
+      print_u32("Vector", mresult);                                            \
+    }                                                                          \
+  }
+
+#define CHECK_REDUCE_F32(FUNC)                                                 \
+  {                                                                            \
+    int passed = (result == mresult);                                          \
+    CHECK_PRINT(passed, FUNC);                                                 \
+    if (!passed || SHOW_OP) {                                                  \
+      print_fvec("Opand1", v1.f32);                                            \
+      print_f32("Scalar", result);                                             \
+      print_f32("Vector", mresult);                                            \
+    }                                                                          \
+  }
+
+#define CHECK_MASK_REDUCE_F32(FUNC)                                            \
+  {                                                                            \
+    int passed = (result == mresult);                                          \
+    CHECK_PRINT(passed, FUNC);                                                 \
+    if (!passed || SHOW_OP) {                                                  \
+      print_fvec("Opand1", v1.f32);                                            \
+      PRINT_MASK(16, 9, "  Mask", mask);                                       \
+      print_f32("Scalar", result);                                             \
+      print_f32("Vector", mresult);                                            \
+    }                                                                          \
+  }
+
+#define CHECK_REDUCE_F64(FUNC)                                                 \
+  {                                                                            \
+    int passed = (result == mresult);                                          \
+    CHECK_PRINT(passed, FUNC);                                                 \
+    if (!passed || SHOW_OP) {                                                  \
+      print_dvec("Opand1", v1.f64);                                            \
+      print_f64("Scalar", result);                                             \
+      print_f64("Vector", mresult);                                            \
+    }                                                                          \
+  }
+
+#define CHECK_MASK_REDUCE_F64(FUNC)                                            \
+  {                                                                            \
+    int passed = (result == mresult);                                          \
+    CHECK_PRINT(passed, FUNC);                                                 \
+    if (!passed || SHOW_OP) {                                                  \
+      print_dvec("Opand1", v1.f64);                                            \
+      PRINT_MASK(8, 10, "  Mask", mask);                                       \
+      print_f64("Scalar", result);                                             \
+      print_f64("Vector", mresult);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_REDUCE_S32(OP, FUNC)                                             \
+  {                                                                            \
+    int result;                                                                \
+    int mresult;                                                               \
+    result = mask_s32_reduce_op(OP, mask_true, v1.s32);                        \
+    mresult = FUNC(v1.zmmi);                                                   \
+    CHECK_REDUCE_S32(FUNC);                                                    \
+  }
+
+#define DOONE_MASK_REDUCE_S32(OP, mask, FUNC)                                  \
+  {                                                                            \
+    int result;                                                                \
+    int mresult;                                                               \
+    result = mask_s32_reduce_op(OP, mask, v1.s32);                             \
+    mresult = FUNC(mask, v1.zmmi);                                             \
+    CHECK_MASK_REDUCE_S32(FUNC);                                               \
+  }
+
+#define DOONE_REDUCE_U32(OP, FUNC)                                             \
+  {                                                                            \
+    u32 result;                                                                \
+    u32 mresult;                                                               \
+    result = mask_u32_reduce_op(OP, mask_true, v1.u32);                        \
+    mresult = FUNC(v1.zmmi);                                                   \
+    CHECK_REDUCE_U32(FUNC);                                                    \
+  }
+
+#define DOONE_MASK_REDUCE_U32(OP, mask, FUNC)                                  \
+  {                                                                            \
+    u32 result;                                                                \
+    u32 mresult;                                                               \
+    result = mask_u32_reduce_op(OP, mask, v1.u32);                             \
+    mresult = FUNC(mask, v1.zmmi);                                             \
+    CHECK_MASK_REDUCE_U32(FUNC);                                               \
+  }
+
+#define DOONE_REDUCE_F32(OP, FUNC)                                             \
+  {                                                                            \
+    float result;                                                              \
+    float mresult;                                                             \
+    result = mask_f32_reduce_op(OP, mask_true, v1.f32);                        \
+    mresult = FUNC(v1.zmm);                                                    \
+    CHECK_REDUCE_F32(FUNC);                                                    \
+  }
+
+#define DOONE_MASK_REDUCE_F32(OP, mask, FUNC)                                  \
+  {                                                                            \
+    float result;                                                              \
+    float mresult;                                                             \
+    result = mask_f32_reduce_op(OP, mask, v1.f32);                             \
+    mresult = FUNC(mask, v1.zmm);                                              \
+    CHECK_MASK_REDUCE_F32(FUNC);                                               \
+  }
+
+#define DOONE_REDUCE_F64(OP, FUNC)                                             \
+  {                                                                            \
+    double result;                                                             \
+    double mresult;                                                            \
+    result = mask_f64_reduce_op(OP, mask_true, v1.f64);                        \
+    mresult = FUNC(v1.zmmd);                                                   \
+    CHECK_REDUCE_F64(FUNC);                                                    \
+  }
+
+#define DOONE_MASK_REDUCE_F64(OP, mask, FUNC)                                  \
+  {                                                                            \
+    double result;                                                             \
+    double mresult;                                                            \
+    memset(&result, 0, sizeof(result));                                        \
+    memset(&mresult, 0, sizeof(mresult));                                      \
+    result = mask_f64_reduce_op(OP, mask, v1.f64);                             \
+    mresult = FUNC(mask, v1.zmmd);                                             \
+    CHECK_MASK_REDUCE_F64(FUNC);                                               \
+  }
+
+__mmask16 mvals[] = {0, 0x82a5};
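+
+/*
+ * mvals[0] = 0 exercises the fully-masked-off case: every lane is replaced
+ * by the init value, so both the scalar reference and (presumably) the
+ * intrinsic reduce to the operation's identity, e.g. 0 for add, 1 for mul
+ * and INT_MAX for signed min.
+ */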
+
+int main(int argc, char *argv[]) {
+  int i;
+  int err = 0;
+  int init1[16] = {7, 1, -3, 3, 1, 1, 2, 3, 1, 3, 2, 3, -5, 1, 11, 3};
+
+  float finit1[16] = {-1.0, -2.0, 3.0,  4.0,   5.0,  6.0,  7.0,  18.0,
+                      -9.0, 10.0, 11.0, -12.0, 13.0, 14.0, 15.0, 16.0};
+
+  double dinit1[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
+  double dinit2[8] = {0.5, 2.0, 3.0, 2.1, 5.0, 5.2, 7.1, 3.1};
+
+  V512 v1;
+  __mmask16 mask = 0x82a5;
+
+  verbose = argc;
+
+  /* zmmi/s32 tests ---------------------------------------- */
+  /* _mm512_reduce_add_epi32 */
+  init_s32(v1.s32, init1);
+  DOONE_REDUCE_S32(REDUCE_ADD, _mm512_reduce_add_epi32);
+
+  /* _mm512_reduce_mul_epi32 */
+  init_s32(v1.s32, init1);
+  DOONE_REDUCE_S32(REDUCE_MUL, _mm512_reduce_mul_epi32);
+
+  /* _mm512_reduce_min_epi32 */
+  init_s32(v1.s32, init1);
+  DOONE_REDUCE_S32(REDUCE_MIN, _mm512_reduce_min_epi32);
+
+  /* _mm512_reduce_max_epi32 */
+  init_s32(v1.s32, init1);
+  DOONE_REDUCE_S32(REDUCE_MAX, _mm512_reduce_max_epi32);
+
+  /* _mm512_reduce_and_epi32 */
+  init_s32(v1.s32, init1);
+  DOONE_REDUCE_S32(REDUCE_AND, _mm512_reduce_and_epi32);
+
+  /* _mm512_reduce_or_epi32 */
+  init_s32(v1.s32, init1);
+  DOONE_REDUCE_S32(REDUCE_OR, _mm512_reduce_or_epi32);
+
+  /* _mm512_reduce_min_epu32 */
+  init_s32(v1.s32, init1);
+  DOONE_REDUCE_U32(REDUCE_MIN, _mm512_reduce_min_epu32);
+
+  /* _mm512_reduce_max_epu32 */
+  init_s32(v1.s32, init1);
+  DOONE_REDUCE_U32(REDUCE_MAX, _mm512_reduce_max_epu32);
+
+  for (i = 0; i < 2; i += 1) {
+    mask = mvals[i];
+    /* _mm512_mask_reduce_min_epu32 */
+    init_s32(v1.s32, init1);
+    DOONE_MASK_REDUCE_U32(REDUCE_MIN, mask, _mm512_mask_reduce_min_epu32);
+
+    /* _mm512_mask_reduce_max_epu32 */
+    init_s32(v1.s32, init1);
+    DOONE_MASK_REDUCE_U32(REDUCE_MAX, mask, _mm512_mask_reduce_max_epu32);
+  }
+
+  for (i = 0; i < 2; i += 1) {
+    mask = mvals[i];
+    /* _mm512_mask_reduce_add_epi32 */
+    init_s32(v1.s32, init1);
+    DOONE_MASK_REDUCE_S32(REDUCE_ADD, mask, _mm512_mask_reduce_add_epi32);
+
+    /* _mm512_mask_reduce_mul_epi32 */
+    init_s32(v1.s32, init1);
+    DOONE_MASK_REDUCE_S32(REDUCE_MUL, mask, _mm512_mask_reduce_mul_epi32);
+
+    /* _mm512_mask_reduce_min_epi32 */
+    init_s32(v1.s32, init1);
+    DOONE_MASK_REDUCE_S32(REDUCE_MIN, mask, _mm512_mask_reduce_min_epi32);
+
+    /* _mm512_mask_reduce_max_epi32 */
+    init_s32(v1.s32, init1);
+    DOONE_MASK_REDUCE_S32(REDUCE_MAX, mask, _mm512_mask_reduce_max_epi32);
+
+    /* _mm512_mask_reduce_and_epi32 */
+    init_s32(v1.s32, init1);
+    DOONE_MASK_REDUCE_S32(REDUCE_AND, mask, _mm512_mask_reduce_and_epi32);
+
+    /* _mm512_mask_reduce_or_epi32 */
+    init_s32(v1.s32, init1);
+    DOONE_MASK_REDUCE_S32(REDUCE_OR, mask, _mm512_mask_reduce_or_epi32);
+  }
+
+  /* zmm/f32 tests ---------------------------------------- */
+  /* _mm512_reduce_add_ps */
+  init_f32(v1.f32, finit1);
+  DOONE_REDUCE_F32(REDUCE_ADD, _mm512_reduce_add_ps);
+
+  /* _mm512_reduce_mul_ps */
+  init_f32(v1.f32, finit1);
+  DOONE_REDUCE_F32(REDUCE_MUL, _mm512_reduce_mul_ps);
+
+  /* _mm512_reduce_min_ps */
+  init_f32(v1.f32, finit1);
+  DOONE_REDUCE_F32(REDUCE_GMIN, _mm512_reduce_min_ps);
+
+  /* _mm512_reduce_max_ps */
+  init_f32(v1.f32, finit1);
+  DOONE_REDUCE_F32(REDUCE_GMAX, _mm512_reduce_max_ps);
+
+  for (i = 0; i < 2; i += 1) {
+    mask = mvals[i];
+    /* _mm512_mask_reduce_min_ps */
+    init_f32(v1.f32, finit1);
+    DOONE_MASK_REDUCE_F32(REDUCE_GMIN, mask, _mm512_mask_reduce_min_ps);
+
+    /* _mm512_mask_reduce_max_ps */
+    init_f32(v1.f32, finit1);
+    DOONE_MASK_REDUCE_F32(REDUCE_GMAX, mask, _mm512_mask_reduce_max_ps);
+
+    /* _mm512_mask_reduce_mul_ps */
+    init_f32(v1.f32, finit1);
+    DOONE_MASK_REDUCE_F32(REDUCE_MUL, mask, _mm512_mask_reduce_mul_ps);
+
+    /* _mm512_mask_reduce_add_ps */
+    init_f32(v1.f32, finit1);
+    DOONE_MASK_REDUCE_F32(REDUCE_ADD, mask, _mm512_mask_reduce_add_ps);
+  }
+
+  /* zmmd/f64 tests ---------------------------------------- */
+  /* _mm512_reduce_add_pd */
+  init_f64(v1.f64, dinit1);
+  DOONE_REDUCE_F64(REDUCE_ADD, _mm512_reduce_add_pd);
+
+  /* _mm512_reduce_mul_pd */
+  init_f64(v1.f64, dinit1);
+  DOONE_REDUCE_F64(REDUCE_MUL, _mm512_reduce_mul_pd);
+
+  /* _mm512_reduce_min_pd */
+  init_f64(v1.f64, dinit1);
+  DOONE_REDUCE_F64(REDUCE_GMIN, _mm512_reduce_min_pd);
+
+  /* _mm512_reduce_max_pd */
+  init_f64(v1.f64, dinit1);
+  DOONE_REDUCE_F64(REDUCE_GMAX, _mm512_reduce_max_pd);
+
+  for (i = 0; i < 2; i += 1) {
+    mask = mvals[i];
+    /* _mm512_mask_reduce_min_pd */
+    init_f64(v1.f64, dinit1);
+    DOONE_MASK_REDUCE_F64(REDUCE_GMIN, mask, _mm512_mask_reduce_min_pd);
+
+    /* _mm512_mask_reduce_max_pd */
+    init_f64(v1.f64, dinit2);
+    DOONE_MASK_REDUCE_F64(REDUCE_GMAX, mask, _mm512_mask_reduce_max_pd);
+
+    /* _mm512_mask_reduce_mul_pd */
+    init_f64(v1.f64, dinit1);
+    DOONE_MASK_REDUCE_F64(REDUCE_MUL, mask, _mm512_mask_reduce_mul_pd);
+
+    /* _mm512_mask_reduce_add_pd */
+    init_f64(v1.f64, dinit2);
+    DOONE_MASK_REDUCE_F64(REDUCE_ADD, mask, _mm512_mask_reduce_add_pd);
+  }
+
+  if (err) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/reduce.reference_output b/SingleSource/UnitTests/Vector/AVX512/reduce.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/reduce.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/reduce_add_mul_m512.c b/SingleSource/UnitTests/Vector/AVX512/reduce_add_mul_m512.c
new file mode 100644
index 0000000..0a9669e
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/reduce_add_mul_m512.c
@@ -0,0 +1,73 @@
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Here we check for _mm512_[mask_]reduce_[add|mul] intrinsics.
+ */
+
+#define CHECK_PD(op)                                                           \
+  {                                                                            \
+    volatile double r = _mm512_##op##_pd(v1);                                  \
+    check_equal_ndf(&r, &exp1, 1, "_mm512_" #op "_pd", __LINE__);              \
+    k8 = 0xAA;                                                                 \
+    r = _mm512_mask_##op##_pd(k8, v1);                                         \
+    check_equal_ndf(&r, &exp2, 1, "_mm512_mask_" #op "_pd{1}", __LINE__);      \
+  }
+
+#define DECL_PD(op, srcv, out1, out2)                                          \
+  void NOINLINE do_##op##_pd() {                                               \
+    __mmask8 k8;                                                               \
+    volatile __m512d v1 = _mm512_set1_pd((srcv));                              \
+                                                                               \
+    double exp1 = (out1);                                                      \
+    double exp2 = (out2);                                                      \
+                                                                               \
+    CHECK_PD(op);                                                              \
+  }
+
+#define TEST_PD(op) do_##op##_pd()
+
+// PS version starts here.
+
+#define CHECK_PS(op)                                                           \
+  {                                                                            \
+    volatile float r = _mm512_##op##_ps(v1);                                   \
+    check_equal_nsf(&r, &exp1, 1, "_mm512_" #op "_ps", __LINE__);              \
+    k8 = 0xAAAA;                                                               \
+    r = _mm512_mask_##op##_ps(k8, v1);                                         \
+    check_equal_nsf(&r, &exp2, 1, "_mm512_mask_" #op "_ps{1}", __LINE__);      \
+  }
+
+#define DECL_PS(op, srcv, out1, out2)                                          \
+  void NOINLINE do_##op##_ps() {                                               \
+    __mmask16 k8;                                                              \
+    volatile __m512 v1 = _mm512_set1_ps((srcv));                               \
+                                                                               \
+    float exp1 = (out1);                                                       \
+    float exp2 = (out2);                                                       \
+                                                                               \
+    CHECK_PS(op);                                                              \
+  }
+
+#define TEST_PS(op) do_##op##_ps()
+
+DECL_PD(reduce_add, 0.5, 4.0, 2.0)
+DECL_PS(reduce_add, 0.4f, 6.4f, 3.2f)
+DECL_PD(reduce_mul, 1.1, 2.1435f, 1.4641f)
+DECL_PS(reduce_mul, -1.1f, 4.5949f, 2.1435f)
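+
+/*
+ * Expected values: 8 * 0.5 = 4.0 and, with mask 0xAA (every other lane),
+ * 4 * 0.5 = 2.0; 16 * 0.4 = 6.4 and 8 * 0.4 = 3.2; 1.1^8 ~= 2.1436 and
+ * 1.1^4 = 1.4641; (-1.1)^16 ~= 4.5950 and (-1.1)^8 ~= 2.1436.  The
+ * f-suffixed expectations are rounded, so the comparison helpers are
+ * assumed to allow a small tolerance.
+ */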
+
+int main(int argc, char *argv[]) {
+  TEST_PD(reduce_add);
+  TEST_PS(reduce_add);
+  TEST_PD(reduce_mul);
+  TEST_PS(reduce_mul);
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/reduce_add_mul_m512.reference_output b/SingleSource/UnitTests/Vector/AVX512/reduce_add_mul_m512.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/reduce_add_mul_m512.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/reduce_int64.c b/SingleSource/UnitTests/Vector/AVX512/reduce_int64.c
new file mode 100644
index 0000000..eab98da
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/reduce_int64.c
@@ -0,0 +1,398 @@
+#include "m512_test_util.h"
+#include <math.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+/*
+ * Here we check for _mm512_[mask_]reduce_[add|mul|min|max] intrinsics.
+ */
+typedef __int64 s64;
+typedef unsigned __int64 u64;
+
+typedef float f32;
+typedef double f64;
+
+typedef int s32;
+typedef unsigned int u32;
+
+int verbose = 0;
+#define VERBOSE (verbose > 1)
+#define SHOW_OP (verbose > 2)
+#define SCALE_TRACE (verbose > 3)
+
+typedef enum {
+  REDUCE_ADD,
+  REDUCE_MUL,
+  REDUCE_MIN,
+  REDUCE_MAX,
+  REDUCE_OR,
+  REDUCE_AND
+} OPER;
+
+__mmask16 mask_true = 0xffff;
+
+#define MASK(mask, n) ((mask & (0x1 << n)) != 0)
+
+#define IMin(i, j) (((i) <= (j)) ? (i) : (j))
+#define IMax(i, j) (((i) >= (j)) ? (i) : (j))
+
+#define MULOP(a, b) (a * b)
+#define ADDOP(a, b) (a + b)
+#define OROP(a, b) (a | b)
+#define ANDOP(a, b) (a & b)
+
+#define DO_MASK_COPY(len, output, mask, input, def)                            \
+  {                                                                            \
+    int n;                                                                     \
+                                                                               \
+    for (n = 0; n < len; n += 1) {                                             \
+      if (MASK(mask, n)) {                                                     \
+        output[n] = input[n];                                                  \
+      } else {                                                                 \
+        output[n] = def;                                                       \
+      }                                                                        \
+    }                                                                          \
+  }
+
+#define DO_REDUCE_8(res, mask, input, dtype, oper, initval)                    \
+  {                                                                            \
+    dtype dtype##tmp[4];                                                       \
+    V512 vtmp;                                                                 \
+    DO_MASK_COPY(8, vtmp.dtype, mask, input, initval);                         \
+                                                                               \
+    dtype##tmp[0] = oper(vtmp.dtype[0], vtmp.dtype[4]);                        \
+    dtype##tmp[1] = oper(vtmp.dtype[1], vtmp.dtype[5]);                        \
+    dtype##tmp[2] = oper(vtmp.dtype[2], vtmp.dtype[6]);                        \
+    dtype##tmp[3] = oper(vtmp.dtype[3], vtmp.dtype[7]);                        \
+                                                                               \
+    dtype##tmp[0] = oper(dtype##tmp[0], dtype##tmp[1]);                        \
+    dtype##tmp[2] = oper(dtype##tmp[2], dtype##tmp[3]);                        \
+                                                                               \
+    res = oper(dtype##tmp[0], dtype##tmp[2]);                                  \
+  }
+
+static __int64 NOINLINE mask_s64_reduce_op(OPER op, __mmask16 mask,
+                                           __int64 s64op1[8]) {
+  int handled = 0;
+  __int64 res;
+
+  switch (op) {
+
+  case REDUCE_ADD:
+    handled = 1;
+    DO_REDUCE_8(res, mask, s64op1, s64, ADDOP, 0);
+    break;
+
+  case REDUCE_MUL:
+    handled = 1;
+    DO_REDUCE_8(res, mask, s64op1, s64, MULOP, 1);
+    break;
+
+  case REDUCE_MIN:
+    handled = 1;
+    DO_REDUCE_8(res, mask, s64op1, s64, IMin, 0x7fffffffffffffff);
+    break;
+
+  case REDUCE_MAX:
+    handled = 1;
+    DO_REDUCE_8(res, mask, s64op1, s64, IMax, 0x8000000000000000);
+    break;
+
+  case REDUCE_OR:
+    handled = 1;
+    DO_REDUCE_8(res, mask, s64op1, s64, OROP, 0);
+    break;
+
+  case REDUCE_AND:
+    handled = 1;
+    DO_REDUCE_8(res, mask, s64op1, s64, ANDOP, 0xffffffffffffffff);
+    break;
+
+  default:
+    printf("FAIL: mask_s64_reduce_op: bad op\n");
+    exit(1);
+    break;
+  }
+  if (!handled) {
+    printf("FAIL: mask_s64_reduce_op: unsupported op\n");
+  }
+  return (res);
+}
+
+static unsigned __int64 NOINLINE mask_u64_reduce_op(OPER op, __mmask16 mask,
+                                                    unsigned __int64 u64op1[8]) {
+  int handled = 0;
+  unsigned __int64 res;
+
+  switch (op) {
+
+  case REDUCE_MIN:
+    handled = 1;
+    DO_REDUCE_8(res, mask, u64op1, u64, IMin, 0xffffffffffffffff);
+    break;
+
+  case REDUCE_MAX:
+    handled = 1;
+    DO_REDUCE_8(res, mask, u64op1, u64, IMax, 0x0000000000000000);
+    break;
+
+  default:
+    printf("FAIL: mask_u64_reduce_op: bad op\n");
+    exit(1);
+    break;
+  }
+  if (!handled) {
+    printf("FAIL: mask_u64_reduce_op: unsupported op\n");
+  }
+  return (res);
+}
+
+static void NOINLINE init_s64(__int64 s64out[8], __int64 s64op1[8]) {
+  int i = 0;
+  for (i = 0; i < 8; i += 1) {
+    s64out[i] = s64op1[i];
+  }
+}
+
+static void NOINLINE print_s64(char *pfx, __int64 var) {
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  printf("%15lld", var);
+  printf("\n");
+}
+
+static void NOINLINE print_u64(char *pfx, u64 var) {
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  printf("%15llu", var);
+  printf("\n");
+}
+
+static void NOINLINE print_ivec(char *pfx, __int64 ivec[]) {
+  char *fmt = "%5ld %5ld %5ld %5ld ";
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  printf(fmt, ivec[7], ivec[6], ivec[5], ivec[4]);
+  printf(fmt, ivec[3], ivec[2], ivec[1], ivec[0]);
+  printf("\n");
+}
+
+static void NOINLINE print_uvec(char *pfx, unsigned __int64 ivec[]) {
+  char *fmt = "%5lu %5lu %5lu %5lu ";
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  printf(fmt, ivec[7], ivec[6], ivec[5], ivec[4]);
+  printf(fmt, ivec[3], ivec[2], ivec[1], ivec[0]);
+  printf("\n");
+}
+
+#define PRINT_MASK(bits, width, pfx, var)                                      \
+  print_mask(bits, "%" #width "d ", pfx, var)
+
+static void NOINLINE print_mask(int bits, char *fmt, char *pfx,
+                                __mmask16 mask) {
+  int i;
+  if (pfx) {
+    printf("%s: ", pfx);
+  }
+  for (i = bits; i >= 1; i -= 1) {
+    printf(fmt, MASK(mask, (i - 1)));
+  }
+  printf("\n");
+}
+
+#define CHECK_PRINT(STATUS, FUNC)                                              \
+  if (!(STATUS)) {                                                             \
+    printf("FAIL " #FUNC "\n");                                                \
+    err += 1;                                                                  \
+  } else if (VERBOSE) {                                                        \
+    printf("PASS " #FUNC "\n");                                                \
+  }
+
+#define CHECK_REDUCE_S64(FUNC)                                                 \
+  {                                                                            \
+    int passed = (result == mresult);                                          \
+    CHECK_PRINT(passed, FUNC);                                                 \
+    if (!passed || SHOW_OP) {                                                  \
+      print_ivec("Opand1", v1.s64);                                            \
+      print_s64("Scalar", result);                                             \
+      print_s64("Vector", mresult);                                            \
+    }                                                                          \
+  }
+
+#define CHECK_REDUCE_U64(FUNC)                                                 \
+  {                                                                            \
+    int passed = (result == mresult);                                          \
+    CHECK_PRINT(passed, FUNC);                                                 \
+    if (!passed || SHOW_OP) {                                                  \
+      print_uvec("Opand1", v1.u64);                                            \
+      print_u64("Scalar", result);                                             \
+      print_u64("Vector", mresult);                                            \
+    }                                                                          \
+  }
+
+#define CHECK_MASK_REDUCE_S64(FUNC)                                            \
+  {                                                                            \
+    int passed = (result == mresult);                                          \
+    CHECK_PRINT(passed, FUNC);                                                 \
+    if (!passed || SHOW_OP) {                                                  \
+      print_ivec("Opand1", v1.s64);                                            \
+      PRINT_MASK(8, 5, "  Mask", mask);                                        \
+      print_s64("Scalar", result);                                             \
+      print_s64("Vector", mresult);                                            \
+    }                                                                          \
+  }
+
+#define CHECK_MASK_REDUCE_U64(FUNC)                                            \
+  {                                                                            \
+    int passed = (result == mresult);                                          \
+    CHECK_PRINT(passed, FUNC);                                                 \
+    if (!passed || SHOW_OP) {                                                  \
+      print_uvec("Opand1", v1.u64);                                            \
+      PRINT_MASK(8, 5, "  Mask", mask);                                        \
+      print_u64("Scalar", result);                                             \
+      print_u64("Vector", mresult);                                            \
+    }                                                                          \
+  }
+
+#define DOONE_REDUCE_S64(OP, FUNC)                                             \
+  {                                                                            \
+    __int64 result;                                                            \
+    __int64 mresult;                                                           \
+    result = mask_s64_reduce_op(OP, mask_true, v1.s64);                        \
+    mresult = FUNC(v1.zmmi);                                                   \
+    CHECK_REDUCE_S64(FUNC);                                                    \
+  }
+
+#define DOONE_MASK_REDUCE_S64(OP, mask, FUNC)                                  \
+  {                                                                            \
+    __int64 result;                                                            \
+    __int64 mresult;                                                           \
+    result = mask_s64_reduce_op(OP, mask, v1.s64);                             \
+    mresult = FUNC(mask, v1.zmmi);                                             \
+    CHECK_MASK_REDUCE_S64(FUNC);                                               \
+  }
+
+#define DOONE_REDUCE_U64(OP, FUNC)                                             \
+  {                                                                            \
+    unsigned __int64 result;                                                   \
+    unsigned __int64 mresult;                                                  \
+    result = mask_u64_reduce_op(OP, mask_true, v1.u64);                        \
+    mresult = FUNC(v1.zmmi);                                                   \
+    CHECK_REDUCE_U64(FUNC);                                                    \
+  }
+
+#define DOONE_MASK_REDUCE_U64(OP, mask, FUNC)                                  \
+  {                                                                            \
+    unsigned __int64 result;                                                   \
+    unsigned __int64 mresult;                                                  \
+    result = mask_u64_reduce_op(OP, mask, v1.u64);                             \
+    mresult = FUNC(mask, v1.zmmi);                                             \
+    CHECK_MASK_REDUCE_U64(FUNC);                                               \
+  }
+
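+/* Masks under test: 0 (all lanes masked off, so the result is the identity
+   value) and 0x82a5 (a mixed mask; only the low 8 bits apply to the 8-lane
+   reductions). */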
+__mmask16 mvals[] = {0, 0x82a5};
+__int64 init1[8] = {7, 1, 11, 3, 1, 1, 2, 3};
+
+void NOINLINE init() {
+  volatile int i;
+
+  for (i = 0; i < 8; i++) {
+    init1[i] = init1[i]; /* No change, but compiler does not know this. */
+  }
+}
+
+int main(int argc, char *argv[]) {
+  int i;
+  int err = 0;
+
+  V512 v1;
+  __mmask16 mask = 0;
+
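+  /* Any extra command-line argument bumps argc and enables verbose output. */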
+  verbose = argc;
+
+  init();
+
+  /* zmmi/s64 tests ---------------------------------------- */
+  /* _mm512_reduce_add_epi64 */
+  init_s64(v1.s64, init1);
+  DOONE_REDUCE_S64(REDUCE_ADD, _mm512_reduce_add_epi64);
+
+  /* _mm512_reduce_mul_epi64 */
+  init_s64(v1.s64, init1);
+  DOONE_REDUCE_S64(REDUCE_MUL, _mm512_reduce_mul_epi64);
+
+  /* _mm512_reduce_min_epi64 */
+  init_s64(v1.s64, init1);
+  DOONE_REDUCE_S64(REDUCE_MIN, _mm512_reduce_min_epi64);
+
+  /* _mm512_reduce_max_epi64 */
+  init_s64(v1.s64, init1);
+  DOONE_REDUCE_S64(REDUCE_MAX, _mm512_reduce_max_epi64);
+
+  /* _mm512_reduce_and_epi64 */
+  init_s64(v1.s64, init1);
+  DOONE_REDUCE_S64(REDUCE_AND, _mm512_reduce_and_epi64);
+
+  /* _mm512_reduce_or_epi64 */
+  init_s64(v1.s64, init1);
+  DOONE_REDUCE_S64(REDUCE_OR, _mm512_reduce_or_epi64);
+
+  /* _mm512_reduce_min_epu64 */
+  init_s64(v1.s64, init1);
+  DOONE_REDUCE_U64(REDUCE_MIN, _mm512_reduce_min_epu64);
+
+  /* _mm512_reduce_max_epu64 */
+  init_s64(v1.s64, init1);
+  DOONE_REDUCE_U64(REDUCE_MAX, _mm512_reduce_max_epu64);
+
+  for (i = 0; i < 2; i += 1) {
+    mask = mvals[i];
+    /* _mm512_mask_reduce_min_epu64 */
+    init_s64(v1.s64, init1);
+    DOONE_MASK_REDUCE_U64(REDUCE_MIN, mask, _mm512_mask_reduce_min_epu64);
+
+    /* _mm512_mask_reduce_max_epu64 */
+    init_s64(v1.s64, init1);
+    DOONE_MASK_REDUCE_U64(REDUCE_MAX, mask, _mm512_mask_reduce_max_epu64);
+  }
+
+  for (i = 0; i < 2; i += 1) {
+    mask = mvals[i];
+    /* _mm512_mask_reduce_add_epi64 */
+    init_s64(v1.s64, init1);
+    DOONE_MASK_REDUCE_S64(REDUCE_ADD, mask, _mm512_mask_reduce_add_epi64);
+
+    /* _mm512_mask_reduce_mul_epi64 */
+    init_s64(v1.s64, init1);
+    DOONE_MASK_REDUCE_S64(REDUCE_MUL, mask, _mm512_mask_reduce_mul_epi64);
+
+    /* _mm512_mask_reduce_min_epi64 */
+    init_s64(v1.s64, init1);
+    DOONE_MASK_REDUCE_S64(REDUCE_MIN, mask, _mm512_mask_reduce_min_epi64);
+
+    /* _mm512_mask_reduce_max_epi64 */
+    init_s64(v1.s64, init1);
+    DOONE_MASK_REDUCE_S64(REDUCE_MAX, mask, _mm512_mask_reduce_max_epi64);
+
+    /* _mm512_mask_reduce_and_epi64 */
+    init_s64(v1.s64, init1);
+    DOONE_MASK_REDUCE_S64(REDUCE_AND, mask, _mm512_mask_reduce_and_epi64);
+
+    /* _mm512_mask_reduce_or_epi64 */
+    init_s64(v1.s64, init1);
+    DOONE_MASK_REDUCE_S64(REDUCE_OR, mask, _mm512_mask_reduce_or_epi64);
+  }
+
+  if (err) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/reduce_int64.reference_output b/SingleSource/UnitTests/Vector/AVX512/reduce_int64.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/reduce_int64.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/roundscale_m512.c b/SingleSource/UnitTests/Vector/AVX512/roundscale_m512.c
new file mode 100644
index 0000000..4da406c
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/roundscale_m512.c
@@ -0,0 +1,133 @@
+#include "m512_test_util.h"
+#include <stdio.h>
+
+/*
+ * Here we check for _mm512_[mask_|maskz_]roundscale[_round]_[ps|pd] intrinsics.
+ */
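+/*
+ * Roundscale immediates used below: bits [1:0] select the rounding mode
+ * (0 = nearest, 1 = down, 2 = up, 3 = truncate), bit 3 suppresses precision
+ * exceptions and bits [7:4] = 0 request no fraction bits, so for the 1.6
+ * input 0x8 gives 2.0, 0x9 gives 1.0, 0xA gives 2.0 and 0xB gives 1.0.
+ */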
+
+void __declspec(noinline) do_roundscale_pd() {
+  __mmask8 k8;
+  volatile __m512d v1 = _mm512_set1_pd((1.6));
+  volatile __m512d undef = _mm512_set1_pd(3.0);
+  __m512d exp = _mm512_set1_pd((2.0));
+  __m512d exp1 = _mm512_set1_pd((1.0));
+  __m512d expm = _mm512_set_pd((2.0), 3.0, (2.0), 3.0, (2.0), 3.0, (2.0), 3.0);
+  __m512d expzm = _mm512_set_pd((1.0), 0, (1.0), 0, (1.0), 0, (1.0), 0);
+
+  {
+    volatile __m512d r = _mm512_roundscale_pd(v1, 0x8);
+    check_equal_ndf(&r, &exp, 8, "_mm512_roundscale_pd{0x8}", __LINE__);
+    r = _mm512_roundscale_pd(v1, 0x9);
+    check_equal_ndf(&r, &exp1, 8, "_mm512_roundscale_pd{0x9}", __LINE__);
+    k8 = 0xAA;
+    r = _mm512_mask_roundscale_pd(undef, k8, v1, 0xA);
+    check_equal_ndf(&r, &expm, 8, "_mm512_mask_roundscale_pd{1}{0xA}",
+                    __LINE__);
+    r = _mm512_maskz_roundscale_pd(k8, v1, 0xB);
+    check_equal_ndf(&r, &expzm, 8, "_mm512_maskz_roundscale_pd{0}{0xB}",
+                    __LINE__);
+  }
+
+  {
+    volatile __m512d r =
+        _mm512_roundscale_round_pd(v1, 0x8, (_MM_FROUND_CUR_DIRECTION));
+    check_equal_ndf(&r, &exp, 8, "_mm512_roundscale_round_pd{0x8}", __LINE__);
+    r = _mm512_roundscale_round_pd(v1, 0x9, (_MM_FROUND_CUR_DIRECTION));
+    check_equal_ndf(&r, &exp1, 8, "_mm512_roundscale_round_pd{0x9}", __LINE__);
+    k8 = 0xAA;
+    r = _mm512_mask_roundscale_round_pd(undef, k8, v1, 0xA,
+                                        (_MM_FROUND_CUR_DIRECTION));
+    check_equal_ndf(&r, &expm, 8, "_mm512_mask_roundscale_round_pd{1}{0xA}",
+                    __LINE__);
+    r = _mm512_maskz_roundscale_round_pd(k8, v1, 0xB,
+                                         (_MM_FROUND_CUR_DIRECTION));
+    check_equal_ndf(&r, &expzm, 8, "_mm512_maskz_roundscale_round_pd{0}{0xB}",
+                    __LINE__);
+  }
+
+  {
+    volatile __m512d r =
+        _mm512_roundscale_round_pd(v1, 0x8, ((_MM_FROUND_NO_EXC)));
+    check_equal_ndf(&r, &exp, 8, "_mm512_roundscale_round_pd{0x8}", __LINE__);
+    r = _mm512_roundscale_round_pd(v1, 0x9, ((_MM_FROUND_NO_EXC)));
+    check_equal_ndf(&r, &exp1, 8, "_mm512_roundscale_round_pd{0x9}", __LINE__);
+    k8 = 0xAA;
+    r = _mm512_mask_roundscale_round_pd(undef, k8, v1, 0xA,
+                                        ((_MM_FROUND_NO_EXC)));
+    check_equal_ndf(&r, &expm, 8, "_mm512_mask_roundscale_round_pd{1}{0xA}",
+                    __LINE__);
+    r = _mm512_maskz_roundscale_round_pd(k8, v1, 0xB, ((_MM_FROUND_NO_EXC)));
+    check_equal_ndf(&r, &expzm, 8, "_mm512_maskz_roundscale_round_pd{0}{0xB}",
+                    __LINE__);
+  }
+}
+void __declspec(noinline) do_roundscale_ps() {
+  __mmask16 k8;
+  volatile __m512 v1 = _mm512_set1_ps((-1.6f));
+  volatile __m512 undef = _mm512_set1_ps(3.0);
+  __m512 exp = _mm512_set1_ps((-2.0f));
+  __m512 exp1 = _mm512_set1_ps((-2.0f));
+  __m512 expm =
+      _mm512_set_ps((-1.0f), 3.0, (-1.0f), 3.0, (-1.0f), 3.0, (-1.0f), 3.0,
+                    (-1.0f), 3.0, (-1.0f), 3.0, (-1.0f), 3.0, (-1.0f), 3.0);
+  __m512 expzm = _mm512_set_ps((-1.0f), 0, (-1.0f), 0, (-1.0f), 0, (-1.0f), 0,
+                               (-1.0f), 0, (-1.0f), 0, (-1.0f), 0, (-1.0f), 0);
+
+  {
+    volatile __m512 r = _mm512_roundscale_ps(v1, 0x8);
+    check_equal_nsf(&r, &exp, 16, "_mm512_roundscale_{0x8}", __LINE__);
+    r = _mm512_roundscale_ps(v1, 0x9);
+    check_equal_nsf(&r, &exp, 16, "_mm512_roundscale_{0x9}", __LINE__);
+    k8 = 0xAAAA;
+    r = _mm512_mask_roundscale_ps(undef, k8, v1, 0xA);
+    check_equal_nsf(&r, &expm, 16, "_mm512_mask_roundscale_{1}{A}", __LINE__);
+    r = _mm512_maskz_roundscale_ps(k8, v1, 0xB);
+    check_equal_nsf(&r, &expzm, 16, "_mm512_maskz_roundscale_{0}{B}", __LINE__);
+  }
+
+  {
+    volatile __m512 r =
+        _mm512_roundscale_round_ps(v1, 0x8, (_MM_FROUND_CUR_DIRECTION));
+    check_equal_nsf(&r, &exp, 16, "_mm512_roundscale_round_ps", __LINE__);
+    r = _mm512_roundscale_round_ps(v1, 0x9, (_MM_FROUND_CUR_DIRECTION));
+    check_equal_nsf(&r, &exp1, 16, "_mm512_roundscale_round_ps", __LINE__);
+    k8 = 0xAAAA;
+    r = _mm512_mask_roundscale_round_ps(undef, k8, v1, 0xA,
+                                        (_MM_FROUND_CUR_DIRECTION));
+    check_equal_nsf(&r, &expm, 16, "_mm512_mask_roundscale_round_{1}{A}",
+                    __LINE__);
+    r = _mm512_maskz_roundscale_round_ps(k8, v1, 0xB,
+                                         (_MM_FROUND_CUR_DIRECTION));
+    check_equal_nsf(&r, &expzm, 16, "_mm512_maskz_roundscale_round_{0}{B}",
+                    __LINE__);
+  }
+
+  {
+    volatile __m512 r =
+        _mm512_roundscale_round_ps(v1, 0x8, ((_MM_FROUND_NO_EXC)));
+    check_equal_nsf(&r, &exp, 16, "_mm512_roundscale_round_ps", __LINE__);
+    r = _mm512_roundscale_round_ps(v1, 0x9, ((_MM_FROUND_NO_EXC)));
+    check_equal_nsf(&r, &exp1, 16, "_mm512_roundscale_round_ps", __LINE__);
+    k8 = 0xAAAA;
+    r = _mm512_mask_roundscale_round_ps(undef, k8, v1, 0xA,
+                                        ((_MM_FROUND_NO_EXC)));
+    check_equal_nsf(&r, &expm, 16, "_mm512_mask_roundscale_round_{1}{A}",
+                    __LINE__);
+    r = _mm512_maskz_roundscale_round_ps(k8, v1, 0xB, ((_MM_FROUND_NO_EXC)));
+    check_equal_nsf(&r, &expzm, 16, "_mm512_maskz_roundscale_round_{0}{B}",
+                    __LINE__);
+  }
+}
+
+int main(int argc, char *argv[]) {
+  do_roundscale_pd();
+  do_roundscale_ps();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/roundscale_m512.reference_output b/SingleSource/UnitTests/Vector/AVX512/roundscale_m512.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/roundscale_m512.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/roundscale_scalar.c b/SingleSource/UnitTests/Vector/AVX512/roundscale_scalar.c
new file mode 100644
index 0000000..de2a75a
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/roundscale_scalar.c
@@ -0,0 +1,98 @@
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Here we check for _mm_[mask|maskz]roundscale_[ss|sd] intrinsics.
+ */
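+/*
+ * The scalar forms round only the low element of the second operand
+ * (1.6 or -1.6 below) and copy the upper element(s) from the first operand;
+ * masked-off results take the src value (mask form) or zero (maskz form).
+ */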
+
+void __declspec(noinline) do_roundscale_sd() {
+  __mmask8 k8;
+  volatile __m128d v1 = _mm_set_pd(2.0, (15.0));
+  volatile __m128d v2 = _mm_set_pd(4.0, (1.6));
+  volatile __m128d undef = _mm_set_pd(333.0, 111.0);
+  __m128d exp = _mm_set_pd(2.0, (2.0));
+  __m128d exp1 = _mm_set_pd(2.0, (1.0));
+  __m128d expm1 = _mm_set_pd(2.0, (2.0));
+  __m128d expm0 = _mm_set_pd(2.0, 111.0);
+  __m128d expzm1 = _mm_set_pd(2.0, (1.0));
+  __m128d expzm0 = _mm_set_pd(2.0, 0.0);
+
+  {
+    volatile __m128d r = _mm_roundscale_sd(v1, v2, 0x8);
+    check_equal_ndf(&r, &exp, 2,
+                    "_mm_"
+                    "roundscale_sd{imm=0x8}",
+                    __LINE__);
+    r = _mm_roundscale_sd(v1, v2, 0x9);
+    check_equal_ndf(&r, &exp1, 2,
+                    "_mm_"
+                    "roundscale_sd{imm=0x9}",
+                    __LINE__);
+    k8 = 1;
+    r = _mm_mask_roundscale_sd(undef, k8, v1, v2, 0xA);
+    check_equal_ndf(&r, &expm1, 2, "_mm_mask_roundscale_sd{1}{imm=0xA}",
+                    __LINE__);
+    k8 = 0;
+    r = _mm_mask_roundscale_sd(undef, k8, v1, v2, 0x8);
+    check_equal_ndf(&r, &expm0, 2, "_mm_mask_roundscale_sd{0}", __LINE__);
+    k8 = 1;
+    r = _mm_maskz_roundscale_sd(k8, v1, v2, 0xB);
+    check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_roundscale_sd{1}{imm=0xB}",
+                    __LINE__);
+    k8 = 0;
+    r = _mm_maskz_roundscale_sd(k8, v1, v2, 0x8);
+    check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_roundscale_sd{0}", __LINE__);
+  }
+}
+void __declspec(noinline) do_roundscale_ss() {
+  __mmask8 k8;
+  volatile __m128 v1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (17.0f));
+  volatile __m128 v2 = _mm_set_ps(8.0f, 7.0f, 6.0f, (-1.6f));
+  volatile __m128 undef = _mm_set_ps(777.0f, 555.0f, 333.0f, 111.0f);
+  __m128 exp = _mm_set_ps(4.0f, 3.0f, 2.0f, (-2.0f));
+  __m128 exp1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (-2.0f));
+  __m128 expm1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (-1.0f));
+  __m128 expm0 = _mm_set_ps(4.0f, 3.0f, 2.0f, 111.0f);
+  __m128 expzm1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (-1.0f));
+  __m128 expzm0 = _mm_set_ps(4.0f, 3.0f, 2.0f, 0.0f);
+
+  {
+    volatile __m128 r = _mm_roundscale_ss(v1, v2, 0x8);
+    check_equal_nsf(&r, &exp, 4,
+                    "_mm_"
+                    "roundscale_ss{imm=0x8}",
+                    __LINE__);
+    r = _mm_roundscale_ss(v1, v2, 0x9);
+    check_equal_nsf(&r, &exp1, 4,
+                    "_mm_"
+                    "roundscale_ss{imm=0x9}",
+                    __LINE__);
+    k8 = 1;
+    r = _mm_mask_roundscale_ss(undef, k8, v1, v2, 0xA);
+    check_equal_nsf(&r, &expm1, 4, "_mm_mask_roundscale_ss{imm=0xA}", __LINE__);
+    k8 = 0;
+    r = _mm_mask_roundscale_ss(undef, k8, v1, v2, 0x8);
+    check_equal_nsf(&r, &expm0, 4, "_mm_mask_roundscale_ss", __LINE__);
+    k8 = 1;
+    r = _mm_maskz_roundscale_ss(k8, v1, v2, 0xB);
+    check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_roundscale_ss{imm=0xB}",
+                    __LINE__);
+    k8 = 0;
+    r = _mm_maskz_roundscale_ss(k8, v1, v2, 0x8);
+    check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_roundscale_ss", __LINE__);
+  }
+}
+
+int main(int argc, char *argv[]) {
+  do_roundscale_sd();
+  do_roundscale_ss();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/roundscale_scalar.reference_output b/SingleSource/UnitTests/Vector/AVX512/roundscale_scalar.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/roundscale_scalar.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/scalef.c b/SingleSource/UnitTests/Vector/AVX512/scalef.c
new file mode 100644
index 0000000..5e7b75a
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/scalef.c
@@ -0,0 +1,311 @@
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Here we check for _mm_[mask|maskz]scalef_[round]_[ss|sd] intrinsics.
+ */
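+/*
+ * scalef computes a * 2^floor(b) on the low element, so with a = 2.0 and
+ * b = 2.5 the expected low result is 2.0 * 2^2 = 8.0; upper elements are
+ * copied from the first operand.
+ */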
+
+void __declspec(noinline) do_scalef_sd() {
+  __mmask8 k8;
+  volatile __m128d v1 = _mm_set_pd(2.0, (2.0));
+  volatile __m128d v2 = _mm_set_pd(4.0, (2.5));
+  volatile __m128d undef = _mm_set_pd(333.0, 111.0);
+  __m128d exp = _mm_set_pd(2.0, (8.0));
+  __m128d expm1 = _mm_set_pd(2.0, (8.0));
+  __m128d expm0 = _mm_set_pd(2.0, 111.0);
+  __m128d expzm1 = _mm_set_pd(2.0, (8.0));
+  __m128d expzm0 = _mm_set_pd(2.0, 0.0);
+
+  {
+    volatile __m128d r = _mm_scalef_sd(v1, v2);
+    check_equal_ndf(&r, &exp, 2,
+                    "_mm_"
+                    "scalef_sd",
+                    __LINE__);
+    k8 = 1;
+    r = _mm_mask_scalef_sd(undef, k8, v1, v2);
+    check_equal_ndf(&r, &expm1, 2, "_mm_mask_scalef_sd{1}", __LINE__);
+    k8 = 0;
+    r = _mm_mask_scalef_sd(undef, k8, v1, v2);
+    check_equal_ndf(&r, &expm0, 2, "_mm_mask_scalef_sd{0}", __LINE__);
+    k8 = 1;
+    r = _mm_maskz_scalef_sd(k8, v1, v2);
+    check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_scalef_sd{1}", __LINE__);
+    k8 = 0;
+    r = _mm_maskz_scalef_sd(k8, v1, v2);
+    check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_scalef_sd{0}", __LINE__);
+  }
+
+  {
+    volatile __m128d r =
+        _mm_scalef_round_sd(v1, v2, (_MM_FROUND_CUR_DIRECTION));
+    check_equal_ndf(&r, &exp, 2,
+                    "_mm_"
+                    "scalef_round_sd",
+                    __LINE__);
+    k8 = 1;
+    r = _mm_mask_scalef_round_sd(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+    check_equal_ndf(&r, &expm1, 2, "_mm_mask_scalef_round_sd{1}", __LINE__);
+    k8 = 0;
+    r = _mm_mask_scalef_round_sd(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+    check_equal_ndf(&r, &expm0, 2, "_mm_mask_scalef_round_sd{0}", __LINE__);
+    k8 = 1;
+    r = _mm_maskz_scalef_round_sd(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+    check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_scalef_round_sd{1}", __LINE__);
+    k8 = 0;
+    r = _mm_maskz_scalef_round_sd(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+    check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_scalef_round_sd{0}", __LINE__);
+  }
+
+  {
+    volatile __m128d r = _mm_scalef_round_sd(v1, v2, ((_MM_FROUND_NO_EXC)));
+    check_equal_ndf(&r, &exp, 2, "_mm_scalef_round_sd", __LINE__);
+    k8 = 1;
+    r = _mm_mask_scalef_round_sd(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+    check_equal_ndf(&r, &expm1, 2, "_mm_mask_scalef_round_sd{1}", __LINE__);
+    k8 = 0;
+    r = _mm_mask_scalef_round_sd(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+    check_equal_ndf(&r, &expm0, 2, "_mm_mask_scalef_round_sd{0}", __LINE__);
+    k8 = 1;
+    r = _mm_maskz_scalef_round_sd(k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+    check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_scalef_round_sd{1}", __LINE__);
+    k8 = 0;
+    r = _mm_maskz_scalef_round_sd(k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+    check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_scalef_round_sd{0}", __LINE__);
+  }
+
+  {
+    volatile __m128d r =
+        _mm_scalef_round_sd(v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+    check_equal_ndf(&r, &exp, 2,
+                    "_mm_"
+                    "scalef_round_sd",
+                    __LINE__);
+    k8 = 1;
+    r = _mm_mask_scalef_round_sd(undef, k8, v1, v2,
+                                 ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+    check_equal_ndf(&r, &expm1, 2, "_mm_mask_scalef_round_sd{1}", __LINE__);
+    k8 = 0;
+    r = _mm_mask_scalef_round_sd(undef, k8, v1, v2,
+                                 ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+    check_equal_ndf(&r, &expm0, 2, "_mm_mask_scalef_round_sd{0}", __LINE__);
+    k8 = 1;
+    r = _mm_maskz_scalef_round_sd(k8, v1, v2,
+                                  ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+    check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_scalef_round_sd{1}", __LINE__);
+    k8 = 0;
+    r = _mm_maskz_scalef_round_sd(k8, v1, v2,
+                                  ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+    check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_scalef_round_sd{0}", __LINE__);
+  }
+
+  {
+    volatile __m128d r = _mm_scalef_round_sd(
+        v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+    check_equal_ndf(&r, &exp, 2,
+                    "_mm_"
+                    "scalef_round_sd",
+                    __LINE__);
+    k8 = 1;
+    r = _mm_mask_scalef_round_sd(
+        undef, k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+    check_equal_ndf(&r, &expm1, 2, "_mm_mask_scalef_round_sd{1}", __LINE__);
+    k8 = 0;
+    r = _mm_mask_scalef_round_sd(
+        undef, k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+    check_equal_ndf(&r, &expm0, 2, "_mm_mask_scalef_round_sd{0}", __LINE__);
+    k8 = 1;
+    r = _mm_maskz_scalef_round_sd(
+        k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+    check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_scalef_round_sd{1}", __LINE__);
+    k8 = 0;
+    r = _mm_maskz_scalef_round_sd(
+        k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+    check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_scalef_round_sd{0}", __LINE__);
+  }
+
+  {
+    volatile __m128d r = _mm_scalef_round_sd(
+        v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+    check_equal_ndf(&r, &exp, 2,
+                    "_mm_"
+                    "scalef_round_sd",
+                    __LINE__);
+    k8 = 1;
+    r = _mm_mask_scalef_round_sd(undef, k8, v1, v2,
+                                 ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+    check_equal_ndf(&r, &expm1, 2, "_mm_mask_scalef_round_sd{1}", __LINE__);
+    k8 = 0;
+    r = _mm_mask_scalef_round_sd(undef, k8, v1, v2,
+                                 ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+    check_equal_ndf(&r, &expm0, 2, "_mm_mask_scalef_round_sd{0}", __LINE__);
+    k8 = 1;
+    r = _mm_maskz_scalef_round_sd(
+        k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+    check_equal_ndf(&r, &expzm1, 2, "_mm_maskz_scalef_round_sd{1}", __LINE__);
+    k8 = 0;
+    r = _mm_maskz_scalef_round_sd(
+        k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+    check_equal_ndf(&r, &expzm0, 2, "_mm_maskz_scalef_round_sd{0}", __LINE__);
+  }
+}
+void __declspec(noinline) do_scalef_ss() {
+  __mmask8 k8;
+  volatile __m128 v1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (2.0f));
+  volatile __m128 v2 = _mm_set_ps(8.0f, 7.0f, 6.0f, (2.5f));
+  volatile __m128 undef = _mm_set_ps(777.0f, 555.0f, 333.0f, 111.0f);
+  __m128 exp = _mm_set_ps(4.0f, 3.0f, 2.0f, (8.0f));
+  __m128 expm1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (8.0f));
+  __m128 expm0 = _mm_set_ps(4.0f, 3.0f, 2.0f, 111.0f);
+  __m128 expzm1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (8.0f));
+  __m128 expzm0 = _mm_set_ps(4.0f, 3.0f, 2.0f, 0.0f);
+
+  {
+    volatile __m128 r = _mm_scalef_ss(v1, v2);
+    check_equal_nsf(&r, &exp, 4,
+                    "_mm_"
+                    "scalef_ss",
+                    __LINE__);
+    k8 = 1;
+    r = _mm_mask_scalef_ss(undef, k8, v1, v2);
+    check_equal_nsf(&r, &expm1, 4, "_mm_mask_scalef_ss", __LINE__);
+    k8 = 0;
+    r = _mm_mask_scalef_ss(undef, k8, v1, v2);
+    check_equal_nsf(&r, &expm0, 4, "_mm_mask_scalef_ss", __LINE__);
+    k8 = 1;
+    r = _mm_maskz_scalef_ss(k8, v1, v2);
+    check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_scalef_ss", __LINE__);
+    k8 = 0;
+    r = _mm_maskz_scalef_ss(k8, v1, v2);
+    check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_scalef_ss", __LINE__);
+  }
+
+  {
+    volatile __m128 r = _mm_scalef_round_ss(v1, v2, (_MM_FROUND_CUR_DIRECTION));
+    check_equal_nsf(&r, &exp, 4,
+                    "_mm_"
+                    "scalef_round_ss",
+                    __LINE__);
+    k8 = 1;
+    r = _mm_mask_scalef_round_ss(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+    check_equal_nsf(&r, &expm1, 4, "_mm_mask_scalef_round_ss", __LINE__);
+    k8 = 0;
+    r = _mm_mask_scalef_round_ss(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+    check_equal_nsf(&r, &expm0, 4, "_mm_mask_scalef_round_ss", __LINE__);
+    k8 = 1;
+    r = _mm_maskz_scalef_round_ss(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+    check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_scalef_round_ss", __LINE__);
+    k8 = 0;
+    r = _mm_maskz_scalef_round_ss(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+    check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_scalef_round_ss", __LINE__);
+  }
+
+  {
+    volatile __m128 r = _mm_scalef_round_ss(v1, v2, ((_MM_FROUND_NO_EXC)));
+    check_equal_nsf(&r, &exp, 4,
+                    "_mm_"
+                    "scalef_round_ss",
+                    __LINE__);
+    k8 = 1;
+    r = _mm_mask_scalef_round_ss(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+    check_equal_nsf(&r, &expm1, 4, "_mm_mask_scalef_round_ss", __LINE__);
+    k8 = 0;
+    r = _mm_mask_scalef_round_ss(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+    check_equal_nsf(&r, &expm0, 4, "_mm_mask_scalef_round_ss", __LINE__);
+    k8 = 1;
+    r = _mm_maskz_scalef_round_ss(k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+    check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_scalef_round_ss", __LINE__);
+    k8 = 0;
+    r = _mm_maskz_scalef_round_ss(k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+    check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_scalef_round_ss", __LINE__);
+  }
+
+  {
+    volatile __m128 r =
+        _mm_scalef_round_ss(v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+    check_equal_nsf(&r, &exp, 4,
+                    "_mm_"
+                    "scalef_round_ss",
+                    __LINE__);
+    k8 = 1;
+    r = _mm_mask_scalef_round_ss(undef, k8, v1, v2,
+                                 ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+    check_equal_nsf(&r, &expm1, 4, "_mm_mask_scalef_round_ss", __LINE__);
+    k8 = 0;
+    r = _mm_mask_scalef_round_ss(undef, k8, v1, v2,
+                                 ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+    check_equal_nsf(&r, &expm0, 4, "_mm_mask_scalef_round_ss", __LINE__);
+    k8 = 1;
+    r = _mm_maskz_scalef_round_ss(k8, v1, v2,
+                                  ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+    check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_scalef_round_ss", __LINE__);
+    k8 = 0;
+    r = _mm_maskz_scalef_round_ss(k8, v1, v2,
+                                  ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+    check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_scalef_round_ss", __LINE__);
+  }
+
+  {
+    volatile __m128 r = _mm_scalef_round_ss(
+        v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+    check_equal_nsf(&r, &exp, 4,
+                    "_mm_"
+                    "scalef_round_ss",
+                    __LINE__);
+    k8 = 1;
+    r = _mm_mask_scalef_round_ss(
+        undef, k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+    check_equal_nsf(&r, &expm1, 4, "_mm_mask_scalef_round_ss", __LINE__);
+    k8 = 0;
+    r = _mm_mask_scalef_round_ss(
+        undef, k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+    check_equal_nsf(&r, &expm0, 4, "_mm_mask_scalef_round_ss", __LINE__);
+    k8 = 1;
+    r = _mm_maskz_scalef_round_ss(
+        k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+    check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_scalef_round_ss", __LINE__);
+    k8 = 0;
+    r = _mm_maskz_scalef_round_ss(
+        k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+    check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_scalef_round_ss", __LINE__);
+  }
+
+  {
+    volatile __m128 r = _mm_scalef_round_ss(
+        v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+    check_equal_nsf(&r, &exp, 4,
+                    "_mm_"
+                    "scalef_round_ss",
+                    __LINE__);
+    k8 = 1;
+    r = _mm_mask_scalef_round_ss(undef, k8, v1, v2,
+                                 ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+    check_equal_nsf(&r, &expm1, 4, "_mm_mask_scalef_round_ss", __LINE__);
+    k8 = 0;
+    r = _mm_mask_scalef_round_ss(undef, k8, v1, v2,
+                                 ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+    check_equal_nsf(&r, &expm0, 4, "_mm_mask_scalef_round_ss", __LINE__);
+    k8 = 1;
+    r = _mm_maskz_scalef_round_ss(
+        k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+    check_equal_nsf(&r, &expzm1, 4, "_mm_maskz_scalef_round_ss", __LINE__);
+    k8 = 0;
+    r = _mm_maskz_scalef_round_ss(
+        k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+    check_equal_nsf(&r, &expzm0, 4, "_mm_maskz_scalef_round_ss", __LINE__);
+  }
+}
+
+int main(int argc, char *argv[]) {
+  do_scalef_sd();
+  do_scalef_ss();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/scalef.reference_output b/SingleSource/UnitTests/Vector/AVX512/scalef.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/scalef.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/sqrt_scalar.c b/SingleSource/UnitTests/Vector/AVX512/sqrt_scalar.c
new file mode 100644
index 0000000..bd83818
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/sqrt_scalar.c
@@ -0,0 +1,478 @@
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Here we check for _mm_[mask_|maskz_]sqrt_[round]_[ss|sd] intrinsics,
+ * excluding _mm_sqrt_ss and _mm_sqrt_sd, which belong to
+ * earlier ISA extensions.
+ */
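+/*
+ * Each variant takes the square root of the low element of the second
+ * operand (the tests expect sqrt(0.25) = 0.5 and sqrt(0.16f) = 0.4f) and
+ * copies the upper element(s) from the first operand; masked-off lanes
+ * take the src value (mask form) or zero (maskz form).
+ */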
+
+void __declspec(noinline) do_sqrt_sd() {
+  __mmask8 k8;
+  volatile __m128d v1 = _mm_set_pd(2.0, (15.0));
+  volatile __m128d v2 = _mm_set_pd(4.0, (0.25));
+  volatile __m128d undef = _mm_set_pd(333.0, 111.0);
+  __m128d exp = _mm_set_pd(2.0, (0.5));
+  __m128d expm1 = _mm_set_pd(2.0, (0.5));
+  __m128d expm0 = _mm_set_pd(2.0, 111.0);
+  __m128d expzm1 = _mm_set_pd(2.0, (0.5));
+  __m128d expzm0 = _mm_set_pd(2.0, 0.0);
+  volatile __m128d r;
+  k8 = 1;
+  r = _mm_mask_sqrt_sd(undef, k8, v1, v2);
+  check_equal_ndf(&r, &expm1, 2,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_sd{1}",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_mask_sqrt_sd(undef, k8, v1, v2);
+  check_equal_ndf(&r, &expm0, 2,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_sd{0}",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_maskz_sqrt_sd(k8, v1, v2);
+  check_equal_ndf(&r, &expzm1, 2,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_sd{1}",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_maskz_sqrt_sd(k8, v1, v2);
+  check_equal_ndf(&r, &expzm0, 2,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_sd{0}",
+                  __LINE__);
+
+  r = _mm_sqrt_round_sd(v1, v2, (_MM_FROUND_CUR_DIRECTION));
+  check_equal_ndf(&r, &exp, 2,
+                  "_mm_"
+                  "sqrt"
+                  "_round_sd",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+  check_equal_ndf(&r, &expm1, 2,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_sd{1}",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+  check_equal_ndf(&r, &expm0, 2,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_sd{0}",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_maskz_sqrt_round_sd(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+  check_equal_ndf(&r, &expzm1, 2,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_sd{1}",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_maskz_sqrt_round_sd(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+  check_equal_ndf(&r, &expzm0, 2,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_sd{0}",
+                  __LINE__);
+
+  r = _mm_sqrt_round_sd(v1, v2, ((_MM_FROUND_NO_EXC)));
+  check_equal_ndf(&r, &exp, 2,
+                  "_mm_"
+                  "sqrt"
+                  "_round_sd",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+  check_equal_ndf(&r, &expm1, 2,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_sd{1}",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+  check_equal_ndf(&r, &expm0, 2,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_sd{0}",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_maskz_sqrt_round_sd(k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+  check_equal_ndf(&r, &expzm1, 2,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_sd{1}",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_maskz_sqrt_round_sd(k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+  check_equal_ndf(&r, &expzm0, 2,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_sd{0}",
+                  __LINE__);
+
+  r = _mm_sqrt_round_sd(v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+  check_equal_ndf(&r, &exp, 2,
+                  "_mm_"
+                  "sqrt"
+                  "_round_sd",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2,
+                             ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+  check_equal_ndf(&r, &expm1, 2,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_sd{1}",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2,
+                             ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+  check_equal_ndf(&r, &expm0, 2,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_sd{0}",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_maskz_sqrt_round_sd(k8, v1, v2,
+                              ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+  check_equal_ndf(&r, &expzm1, 2,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_sd{1}",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_maskz_sqrt_round_sd(k8, v1, v2,
+                              ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+  check_equal_ndf(&r, &expzm0, 2,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_sd{0}",
+                  __LINE__);
+
+  r = _mm_sqrt_round_sd(v1, v2,
+                        ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+  check_equal_ndf(&r, &exp, 2,
+                  "_mm_"
+                  "sqrt"
+                  "_round_sd",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2,
+                             ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+  check_equal_ndf(&r, &expm1, 2,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_sd{1}",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2,
+                             ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+  check_equal_ndf(&r, &expm0, 2,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_sd{0}",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_maskz_sqrt_round_sd(
+      k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+  check_equal_ndf(&r, &expzm1, 2,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_sd{1}",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_maskz_sqrt_round_sd(
+      k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+  check_equal_ndf(&r, &expzm0, 2,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_sd{0}",
+                  __LINE__);
+
+  r = _mm_sqrt_round_sd(v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+  check_equal_ndf(&r, &exp, 2,
+                  "_mm_"
+                  "sqrt"
+                  "_round_sd",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2,
+                             ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+  check_equal_ndf(&r, &expm1, 2,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_sd{1}",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_mask_sqrt_round_sd(undef, k8, v1, v2,
+                             ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+  check_equal_ndf(&r, &expm0, 2,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_sd{0}",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_maskz_sqrt_round_sd(k8, v1, v2,
+                              ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+  check_equal_ndf(&r, &expzm1, 2,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_sd{1}",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_maskz_sqrt_round_sd(k8, v1, v2,
+                              ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+  check_equal_ndf(&r, &expzm0, 2,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_sd{0}",
+                  __LINE__);
+}
+void __declspec(noinline) do_sqrt_ss() {
+  __mmask8 k8;
+  volatile __m128 v1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (17.0f));
+  volatile __m128 v2 = _mm_set_ps(8.0f, 7.0f, 6.0f, (0.16f));
+  volatile __m128 undef = _mm_set_ps(777.0f, 555.0f, 333.0f, 111.0f);
+  __m128 exp = _mm_set_ps(4.0f, 3.0f, 2.0f, (0.4f));
+  __m128 expm1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (0.4f));
+  __m128 expm0 = _mm_set_ps(4.0f, 3.0f, 2.0f, 111.0f);
+  __m128 expzm1 = _mm_set_ps(4.0f, 3.0f, 2.0f, (0.4f));
+  __m128 expzm0 = _mm_set_ps(4.0f, 3.0f, 2.0f, 0.0f);
+  volatile __m128 r;
+  k8 = 1;
+  r = _mm_mask_sqrt_ss(undef, k8, v1, v2);
+  check_equal_nsf(&r, &expm1, 4,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_ss",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_mask_sqrt_ss(undef, k8, v1, v2);
+  check_equal_nsf(&r, &expm0, 4,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_ss",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_maskz_sqrt_ss(k8, v1, v2);
+  check_equal_nsf(&r, &expzm1, 4,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_ss",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_maskz_sqrt_ss(k8, v1, v2);
+  check_equal_nsf(&r, &expzm0, 4,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_ss",
+                  __LINE__);
+
+  r = _mm_sqrt_round_ss(v1, v2, (_MM_FROUND_CUR_DIRECTION));
+  check_equal_nsf(&r, &exp, 4,
+                  "_mm_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+  check_equal_nsf(&r, &expm1, 4,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+  check_equal_nsf(&r, &expm0, 4,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_maskz_sqrt_round_ss(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+  check_equal_nsf(&r, &expzm1, 4,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_maskz_sqrt_round_ss(k8, v1, v2, (_MM_FROUND_CUR_DIRECTION));
+  check_equal_nsf(&r, &expzm0, 4,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+
+  r = _mm_sqrt_round_ss(v1, v2, ((_MM_FROUND_NO_EXC)));
+  check_equal_nsf(&r, &exp, 4,
+                  "_mm_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+  check_equal_nsf(&r, &expm1, 4,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+  check_equal_nsf(&r, &expm0, 4,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_maskz_sqrt_round_ss(k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+  check_equal_nsf(&r, &expzm1, 4,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_maskz_sqrt_round_ss(k8, v1, v2, ((_MM_FROUND_NO_EXC)));
+  check_equal_nsf(&r, &expzm0, 4,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+
+  r = _mm_sqrt_round_ss(v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+  check_equal_nsf(&r, &exp, 4,
+                  "_mm_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2,
+                             ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+  check_equal_nsf(&r, &expm1, 4,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2,
+                             ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+  check_equal_nsf(&r, &expm0, 4,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_maskz_sqrt_round_ss(k8, v1, v2,
+                              ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+  check_equal_nsf(&r, &expzm1, 4,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_maskz_sqrt_round_ss(k8, v1, v2,
+                              ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO)));
+  check_equal_nsf(&r, &expzm0, 4,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+
+  r = _mm_sqrt_round_ss(v1, v2,
+                        ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+  check_equal_nsf(&r, &exp, 4,
+                  "_mm_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2,
+                             ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+  check_equal_nsf(&r, &expm1, 4,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2,
+                             ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+  check_equal_nsf(&r, &expm0, 4,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_maskz_sqrt_round_ss(
+      k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+  check_equal_nsf(&r, &expzm1, 4,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_maskz_sqrt_round_ss(
+      k8, v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT)));
+  check_equal_nsf(&r, &expzm0, 4,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+
+  r = _mm_sqrt_round_ss(v1, v2, ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+  check_equal_nsf(&r, &exp, 4,
+                  "_mm_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2,
+                             ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+  check_equal_nsf(&r, &expm1, 4,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_mask_sqrt_round_ss(undef, k8, v1, v2,
+                             ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+  check_equal_nsf(&r, &expm0, 4,
+                  "_mm_mask_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 1;
+  r = _mm_maskz_sqrt_round_ss(k8, v1, v2,
+                              ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+  check_equal_nsf(&r, &expzm1, 4,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+  k8 = 0;
+  r = _mm_maskz_sqrt_round_ss(k8, v1, v2,
+                              ((_MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF)));
+  check_equal_nsf(&r, &expzm0, 4,
+                  "_mm_maskz_"
+                  "sqrt"
+                  "_round_ss",
+                  __LINE__);
+}
+
+int main(int argc, char *argv[]) {
+  do_sqrt_sd();
+  do_sqrt_ss();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/sqrt_scalar.reference_output b/SingleSource/UnitTests/Vector/AVX512/sqrt_scalar.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/sqrt_scalar.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/xor.c b/SingleSource/UnitTests/Vector/AVX512/xor.c
new file mode 100644
index 0000000..01fbeb7
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/xor.c
@@ -0,0 +1,93 @@
+/*
+ * Test xor intrinsics.
+ * This test was created to check the correctness
+ * of the following intrinsics support:
+ * _mm512_xor_epi32()
+ * _mm512_mask_xor_epi32()
+ * _mm512_xor_epi64()
+ * _mm512_mask_xor_epi64()
+ */
+
+#include "m512_test_util.h"
+#include <stdio.h>
+
+volatile int vol = 0; /* Inhibit optimization */
+
+__m512 f1, f2, f3, f3_orig;
+__m512d d1, d2, d3, d3_orig;
+__m512i i1, i2, i3, i3_orig;
+
+void NOINLINE set_nonzero(void *vp, int c) {
+  int i;
+  V512 *v = (V512 *)vp;
+
+  for (i = 0; i < 16; i++) {
+    v->u32[i] = 10 * i * i - 3 * i + c;
+    if (v->u32[i] == 0) {
+      v->u32[i] = 1234;
+    }
+  }
+}
+
+void NOINLINE check_xor(void *vp1, void *vp2, void *vp3, void *vp_orig,
+                        int mask, char *banner) {
+  int i;
+  V512 *v1 = (V512 *)vp1;
+  V512 *v2 = (V512 *)vp2;
+  V512 *v3 = (V512 *)vp3;
+  V512 *v_orig = (V512 *)vp_orig;
+
+  for (i = 0; i < 16; i++) {
+    int actual = v3->u32[i];
+    int expected = v_orig->u32[i];
+    if (mask & (1 << i)) {
+      expected = v1->u32[i] ^ v2->u32[i];
+    }
+    if (actual + vol != expected - vol) {
+      printf("ERROR: %s failed\n", banner ? banner : "");
+      n_errs++;
+      break;
+    }
+  }
+}
+
+void NOINLINE do_xor() {
+  set_nonzero(&i1, 99);
+  set_nonzero(&i2, 100);
+  set_nonzero(&f1, 33);
+  set_nonzero(&f2, -35);
+  set_nonzero(&d1, -11);
+  set_nonzero(&d2, 14);
+
+  set_nonzero(&i3, 1000);
+  i3_orig = i3;
+  i3 = _mm512_xor_epi32(i1, i2);
+  check_xor(&i1, &i2, &i3, &i3_orig, 0xffff, "_mm512_xor_epi32");
+
+  set_nonzero(&i3, 1500);
+  i3_orig = i3;
+  i3 = _mm512_mask_xor_epi32(i3_orig, 0x5555, i1, i2);
+  check_xor(&i1, &i2, &i3, &i3_orig, 0x5555, "_mm512_mask_xor_epi32");
+
+  set_nonzero(&i3, 2000);
+  i3_orig = i3;
+  i3 = _mm512_xor_epi64(i1, i2);
+  check_xor(&i1, &i2, &i3, &i3_orig, 0xffff, "_mm512_xor_epi64");
+
+  set_nonzero(&i3, 2500);
+  i3_orig = i3;
+  i3 = _mm512_mask_xor_epi64(i3_orig, 0x55, i1, i2);
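+  /* check_xor compares 32-bit lanes, so the 8-bit qword mask 0x55 maps to
+     the dword mask 0x3333 used for verification below. */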
+  check_xor(&i1, &i2, &i3, &i3_orig, 0x3333, "_mm512_mask_xor_epi64");
+}
+
+int main() {
+  do_xor();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/xor.reference_output b/SingleSource/UnitTests/Vector/AVX512/xor.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/xor.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi32.c b/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi32.c
new file mode 100644
index 0000000..50986e7
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi32.c
@@ -0,0 +1,70 @@
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+/* This test was created to check the correctness
+ * of the following intrinsics support:
+ * _mm512_or_epi32()
+ * _mm512_mask_or_epi32()
+ * _mm512_xor_epi32()
+ * _mm512_mask_xor_epi32()
+ */
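+/* With v1 = 10 and v2 = 3 per lane: 10 | 3 = 11 and 10 ^ 3 = 9; the mask
+   0xAAAA selects the odd lanes, so even lanes keep the src value 3. */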
+void __declspec(noinline)
+    check_equal_epi32(__m512i vres, __m512i vexp, char *banner, int line) {
+  int i;
+
+  __declspec(align(64)) int res[16];
+  __declspec(align(64)) int exp[16];
+
+  _mm512_store_epi32(res, vres);
+  _mm512_store_epi32(exp, vexp);
+
+  for (i = 0; i < 16; i++) {
+    if (res[i] != exp[i]) {
+      printf("ERROR: %s failed at line %d with result (%d) != "
+             "(%d)  element %d\n",
+             banner, line, res[i], exp[i], i);
+      ++n_errs;
+    }
+  }
+}
+
+void __declspec(noinline) do_or_() {
+  __mmask16 k8 = 0xAAAA;
+  volatile __m512i undef = _mm512_set1_epi32(3);
+  volatile __m512i v1 = _mm512_set1_epi32((10));
+  volatile __m512i v2 = _mm512_set1_epi32((3));
+  volatile __m512i exp1 = _mm512_set1_epi32((11));
+  volatile __m512i exp2 = _mm512_set_epi32((11), 3, (11), 3, (11), 3, (11), 3,
+                                           (11), 3, (11), 3, (11), 3, (11), 3);
+  volatile __m512i r = _mm512_or_epi32(v1, v2);
+  check_equal_nd(&r, &exp1, 16, "_mm512_or_epi32", __LINE__);
+  r = _mm512_mask_or_epi32(undef, k8, v1, v2);
+  check_equal_nd(&r, &exp2, 16, "_mm512_mask_or_epi32", __LINE__);
+}
+
+void __declspec(noinline) do_xor_() {
+  __mmask16 k8 = 0xAAAA;
+  volatile __m512i undef = _mm512_set1_epi32(3);
+  volatile __m512i v1 = _mm512_set1_epi32(10);
+  volatile __m512i v2 = _mm512_set1_epi32(3);
+  volatile __m512i exp1 = _mm512_set1_epi32(9);
+  volatile __m512i exp2 =
+      _mm512_set_epi32(9, 3, 9, 3, 9, 3, 9, 3, 9, 3, 9, 3, 9, 3, 9, 3);
+  volatile __m512i r = _mm512_xor_epi32(v1, v2);
+  check_equal_nd(&r, &exp1, 16, "_mm512_xor_epi32", __LINE__);
+  r = _mm512_mask_xor_epi32(undef, k8, v1, v2);
+  check_equal_nd(&r, &exp2, 16, "_mm512_mask_xor_epi32", __LINE__);
+}
+
+int main(int argc, char *argv[]) {
+  do_or_();
+  do_xor_();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi32.reference_output b/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi32.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi32.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi64.c b/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi64.c
new file mode 100644
index 0000000..bfb4f83
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi64.c
@@ -0,0 +1,50 @@
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+/* This test was created to check correct support
+ * for the following intrinsics:
+ * _mm512_or_epi64()
+ * _mm512_mask_or_epi64()
+ * _mm512_xor_epi64()
+ * _mm512_mask_xor_epi64()
+ */
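+/* Note that the 64-bit mask variants take an 8-bit mask (one bit per qword),
+ * so only the low 8 bits of the 0xAAAA mask used below select lanes. */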
+void __declspec(noinline) do_or_() {
+  __mmask16 k8 = 0xAAAA;
+  volatile __m512i undef = _mm512_set1_epi64(3);
+  volatile __m512i v1 = _mm512_set1_epi64(10);
+  volatile __m512i v2 = _mm512_set1_epi64(3);
+  volatile __m512i exp1 = _mm512_set1_epi64(11);
+  volatile __m512i exp2 = _mm512_set_epi64(11, 3, 11, 3, 11, 3, 11, 3);
+  volatile __m512i r = _mm512_or_epi64(v1, v2);
+  check_equal_nq(&r, &exp1, 8, "_mm512_or_epi64", __LINE__);
+  r = _mm512_mask_or_epi64(undef, k8, v1, v2);
+  check_equal_nq(&r, &exp2, 8, "_mm512_mask_or_epi64", __LINE__);
+}
+
+void __declspec(noinline) do_xor_() {
+  __mmask16 k8 = 0xAAAA;
+  volatile __m512i undef = _mm512_set1_epi64(3);
+  volatile __m512i v1 = _mm512_set1_epi64(10);
+  volatile __m512i v2 = _mm512_set1_epi64(3);
+  volatile __m512i exp1 = _mm512_set1_epi64(9);
+  volatile __m512i exp2 = _mm512_set_epi64(9, 3, 9, 3, 9, 3, 9, 3);
+  volatile __m512i r = _mm512_xor_epi64(v1, v2);
+  check_equal_nq(&r, &exp1, 8, "_mm512_xor_epi64", __LINE__);
+  r = _mm512_mask_xor_epi64(undef, k8, v1, v2);
+  check_equal_nq(&r, &exp2, 8, "_mm512_mask_xor_epi64", __LINE__);
+}
+
+int main(int argc, char *argv[]) {
+  do_or_();
+  do_xor_();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi64.reference_output b/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi64.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_epi64.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_si512.c b/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_si512.c
new file mode 100644
index 0000000..e54976b
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_si512.c
@@ -0,0 +1,38 @@
+#include "m512_test_util.h"
+#include <stdio.h>
+#include <string.h>
+/* This test was created to check correct support
+ * for the following intrinsics:
+ * _mm512_or_si512()
+ * _mm512_xor_si512()
+ */
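+/* These intrinsics are plain bitwise operations on the full 512-bit value,
+ * so checking 16 dword lanes with check_equal_nd covers the whole register. */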
+void __declspec(noinline) do_or_() {
+  volatile __m512i v1 = _mm512_set1_epi32(10);
+  volatile __m512i v2 = _mm512_set1_epi32(3);
+  volatile __m512i exp1 = _mm512_set1_epi32(11);
+  volatile __m512i r = _mm512_or_si512(v1, v2);
+  check_equal_nd(&r, &exp1, 16, "_mm512_or_si512", __LINE__);
+}
+
+void __declspec(noinline) do_xor_() {
+  volatile __m512i v1 = _mm512_set1_epi32(10);
+  volatile __m512i v2 = _mm512_set1_epi32(3);
+  volatile __m512i exp1 = _mm512_set1_epi32(9);
+  volatile __m512i r = _mm512_xor_si512(v1, v2);
+  check_equal_nd(&r, &exp1, 16, "_mm512_xor_si512", __LINE__);
+}
+
+int main(int argc, char *argv[]) {
+  do_or_();
+  do_xor_();
+
+  if (n_errs != 0) {
+    printf("FAILED\n");
+    return 1;
+  }
+
+  printf("PASSED\n");
+  return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_si512.reference_output b/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_si512.reference_output
new file mode 100644
index 0000000..bfae62d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512/xor_or_m512_si512.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/CMakeLists.txt b/SingleSource/UnitTests/Vector/CMakeLists.txt
index 36c7ef1..839bd69 100644
--- a/SingleSource/UnitTests/Vector/CMakeLists.txt
+++ b/SingleSource/UnitTests/Vector/CMakeLists.txt
@@ -12,4 +12,10 @@
   add_subdirectory(AArch64)
 endif()
 
+if(CMAKE_C_COMPILER_ID STREQUAL "Clang")
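+  # Only add the AVX512 tests when Clang is targeting skylake-avx512.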
+  if(ARCH STREQUAL "x86" AND X86CPU_ARCH STREQUAL "skylake-avx512")
+    add_subdirectory(AVX512)
+  endif()
+endif()
 llvm_singlesource(PREFIX "Vector-")
diff --git a/SingleSource/UnitTests/Vector/Makefile b/SingleSource/UnitTests/Vector/Makefile
index 3913cf1..302b803 100644
--- a/SingleSource/UnitTests/Vector/Makefile
+++ b/SingleSource/UnitTests/Vector/Makefile
@@ -16,6 +16,13 @@
 DIRS += SSE
 endif
 
+ifeq ($(CC_UNDER_TEST_IS_CLANG), 1)
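+# The AVX512 tests are built only when AVX-512F support is detected.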
+ifeq ($(HAVE_X86_AVX512F_INSTRUCTIONS), 1)
+DIRS += AVX512
+endif
+endif
+
 # Assume ARMv7 implies NEON.
 ifneq ($(CC_UNDER_TEST_TARGET_IS_THUMBV7),)
 DIRS += NEON