New subgroups - full changes set (#1074)

* Extended subgroups - extended types types

* Extended subgroups - non uniform vote tests

* Extended subgroups - non uniform arithmetic tests

* Extended subgroups - ballot tests

* Extended subgroups - clustered reduce tests

* Extended subgroups - shuffle tests

* Extended subgroups - formating issues

* Extended subgroups - review fixes

* Extended subgroups - review fixes

Fixed: removed additional brakes, kernel_sstr

* Extended subgroups - fix macos build error

* Extended subgroups - review fixes

Fixed: mac os build error

* Extended subgroups - data type verification example

* Extended subgroups - error unification

* Extended subgroups - fix header years

* Extended subgroups - use is_half_nan

* Extended subgroups - compare half as float

* Review fixes mostly for ballot functions.

- Modify kernels for better handling active/inactive workitems
- Modify gen/chk functions for handling non uniform workgroup sizes
- Introduce new variables naming convention
- minor fixes

* Extended subgroups - simplification data generation for ballot lsb/msb functions

* Extended subgroups - minor fixes

* Extended subgroups - move common code to function

* Extended subgroups - formatting errors fix

* Extended subgroups - fix build error

* Extended subgroups - sub_group_elect more sophisticated

Define mask which is 4bytes pattern where bit 1 means work item is active.
If workitem in subgroup matches pattern then run sub_group_elect()

* Extended subgroups - fix Ubuntu build error

* Extended subgroups - voting function review fixes

* adjust all function for using masks
* remove calculate templates
* merge code to one common template
* check results only in active workitems
* normalize values on host side
* minor fixes

* Extended subgroups - fix typos

* Set of fixes and improvements after review

* define WorkGroupParams to stop extended parameters list in function
* better workitems mask handing (WorkGroupParams)
* narrow values of data input generation to avoid overflows (arithmetic func)
* implement work item masks for arithmetic functions
* enable half type testing for reduction/scan/broadcast
* minor fixes

* Extended subgroups - fix Linux issues

* Extended subgroups - fix sub_group_local_id data type

* Extended subgroups - use vector instead of array.

* Extended subgroups - change names to subgroup

* Extended subgroups - uncomment code, fix build

* Extended subgroups - build fix, use cl_half_from_float func

* Extended subgroups - remove is_half_nan

* Extended subgroups - do no use undef min/max

* Extended subgroups - use parenthesis, fix formatting
diff --git a/test_conformance/subgroups/CMakeLists.txt b/test_conformance/subgroups/CMakeLists.txt
index eb6a607..d48af9c 100644
--- a/test_conformance/subgroups/CMakeLists.txt
+++ b/test_conformance/subgroups/CMakeLists.txt
@@ -5,8 +5,16 @@
     test_barrier.cpp
     test_queries.cpp
     test_workitem.cpp
-    test_workgroup.cpp
+    test_subgroup.cpp
     test_ifp.cpp
+    test_subgroup_extended_types.cpp
+    subgroup_common_kernels.cpp
+    test_subgroup_non_uniform_vote.cpp
+    test_subgroup_non_uniform_arithmetic.cpp
+    test_subgroup_ballot.cpp
+    test_subgroup_clustered_reduce.cpp
+    test_subgroup_shuffle.cpp
+    test_subgroup_shuffle_relative.cpp
 )
 
 include(../CMakeCommon.txt)
diff --git a/test_conformance/subgroups/main.cpp b/test_conformance/subgroups/main.cpp
index f9a9a9d..44416dd 100644
--- a/test_conformance/subgroups/main.cpp
+++ b/test_conformance/subgroups/main.cpp
@@ -27,12 +27,19 @@
     ADD_TEST_VERSION(sub_group_info_core, Version(2, 1)),
     ADD_TEST_VERSION(work_item_functions_ext, Version(2, 0)),
     ADD_TEST_VERSION(work_item_functions_core, Version(2, 1)),
-    ADD_TEST_VERSION(work_group_functions_ext, Version(2, 0)),
-    ADD_TEST_VERSION(work_group_functions_core, Version(2, 1)),
+    ADD_TEST_VERSION(subgroup_functions_ext, Version(2, 0)),
+    ADD_TEST_VERSION(subgroup_functions_core, Version(2, 1)),
     ADD_TEST_VERSION(barrier_functions_ext, Version(2, 0)),
     ADD_TEST_VERSION(barrier_functions_core, Version(2, 1)),
     ADD_TEST_VERSION(ifp_ext, Version(2, 0)),
-    ADD_TEST_VERSION(ifp_core, Version(2, 1))
+    ADD_TEST_VERSION(ifp_core, Version(2, 1)),
+    ADD_TEST(subgroup_functions_extended_types),
+    ADD_TEST(subgroup_functions_non_uniform_vote),
+    ADD_TEST(subgroup_functions_non_uniform_arithmetic),
+    ADD_TEST(subgroup_functions_ballot),
+    ADD_TEST(subgroup_functions_clustered_reduce),
+    ADD_TEST(subgroup_functions_shuffle),
+    ADD_TEST(subgroup_functions_shuffle_relative)
 };
 
 const int test_num = ARRAY_SIZE(test_list);
diff --git a/test_conformance/subgroups/procs.h b/test_conformance/subgroups/procs.h
index 3ebb13b..d09e824 100644
--- a/test_conformance/subgroups/procs.h
+++ b/test_conformance/subgroups/procs.h
@@ -37,14 +37,12 @@
                                          cl_context context,
                                          cl_command_queue queue,
                                          int num_elements);
-extern int test_work_group_functions_ext(cl_device_id device,
-                                         cl_context context,
-                                         cl_command_queue queue,
-                                         int num_elements);
-extern int test_work_group_functions_core(cl_device_id device,
-                                          cl_context context,
-                                          cl_command_queue queue,
-                                          int num_elements);
+extern int test_subgroup_functions_ext(cl_device_id device, cl_context context,
+                                       cl_command_queue queue,
+                                       int num_elements);
+extern int test_subgroup_functions_core(cl_device_id device, cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements);
 extern int test_barrier_functions_ext(cl_device_id device, cl_context context,
                                       cl_command_queue queue, int num_elements);
 extern int test_barrier_functions_core(cl_device_id device, cl_context context,
@@ -56,5 +54,31 @@
                         cl_command_queue queue, int num_elements);
 extern int test_ifp_core(cl_device_id device, cl_context context,
                          cl_command_queue queue, int num_elements);
-
+extern int test_subgroup_functions_extended_types(cl_device_id device,
+                                                  cl_context context,
+                                                  cl_command_queue queue,
+                                                  int num_elements);
+extern int test_subgroup_functions_non_uniform_vote(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements);
+extern int test_subgroup_functions_non_uniform_arithmetic(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_subgroup_functions_ballot(cl_device_id device,
+                                          cl_context context,
+                                          cl_command_queue queue,
+                                          int num_elements);
+extern int test_subgroup_functions_clustered_reduce(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements);
+extern int test_subgroup_functions_shuffle(cl_device_id device,
+                                           cl_context context,
+                                           cl_command_queue queue,
+                                           int num_elements);
+extern int test_subgroup_functions_shuffle_relative(cl_device_id device,
+                                                    cl_context context,
+                                                    cl_command_queue queue,
+                                                    int num_elements);
 #endif /*_procs_h*/
diff --git a/test_conformance/subgroups/subgroup_common_kernels.cpp b/test_conformance/subgroups/subgroup_common_kernels.cpp
new file mode 100644
index 0000000..f8b2445
--- /dev/null
+++ b/test_conformance/subgroups/subgroup_common_kernels.cpp
@@ -0,0 +1,106 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "subgroup_common_kernels.h"
+
+const char* bcast_source =
+    "__kernel void test_bcast(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint which_sub_group_local_id = xy[gid].z;\n"
+    "    out[gid] = sub_group_broadcast(x, which_sub_group_local_id);\n"
+
+    "}\n";
+
+const char* redadd_source = "__kernel void test_redadd(const __global Type "
+                            "*in, __global int4 *xy, __global Type *out)\n"
+                            "{\n"
+                            "    int gid = get_global_id(0);\n"
+                            "    XY(xy,gid);\n"
+                            "    out[gid] = sub_group_reduce_add(in[gid]);\n"
+                            "}\n";
+
+const char* redmax_source = "__kernel void test_redmax(const __global Type "
+                            "*in, __global int4 *xy, __global Type *out)\n"
+                            "{\n"
+                            "    int gid = get_global_id(0);\n"
+                            "    XY(xy,gid);\n"
+                            "    out[gid] = sub_group_reduce_max(in[gid]);\n"
+                            "}\n";
+
+const char* redmin_source = "__kernel void test_redmin(const __global Type "
+                            "*in, __global int4 *xy, __global Type *out)\n"
+                            "{\n"
+                            "    int gid = get_global_id(0);\n"
+                            "    XY(xy,gid);\n"
+                            "    out[gid] = sub_group_reduce_min(in[gid]);\n"
+                            "}\n";
+
+const char* scinadd_source =
+    "__kernel void test_scinadd(const __global Type *in, __global int4 *xy, "
+    "__global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
+    "}\n";
+
+const char* scinmax_source =
+    "__kernel void test_scinmax(const __global Type *in, __global int4 *xy, "
+    "__global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
+    "}\n";
+
+const char* scinmin_source =
+    "__kernel void test_scinmin(const __global Type *in, __global int4 *xy, "
+    "__global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
+    "}\n";
+
+const char* scexadd_source =
+    "__kernel void test_scexadd(const __global Type *in, __global int4 *xy, "
+    "__global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
+    "}\n";
+
+const char* scexmax_source =
+    "__kernel void test_scexmax(const __global Type *in, __global int4 *xy, "
+    "__global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
+    "}\n";
+
+const char* scexmin_source =
+    "__kernel void test_scexmin(const __global Type *in, __global int4 *xy, "
+    "__global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
+    "}\n";
diff --git a/test_conformance/subgroups/subgroup_common_kernels.h b/test_conformance/subgroups/subgroup_common_kernels.h
new file mode 100644
index 0000000..8ae97d9
--- /dev/null
+++ b/test_conformance/subgroups/subgroup_common_kernels.h
@@ -0,0 +1,32 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef SUBGROUPKERNELSOURCES_H
+#define SUBGROUPKERNELSOURCES_H
+#include "subhelpers.h"
+
+
+extern const char* bcast_source;
+extern const char* redadd_source;
+extern const char* redmax_source;
+extern const char* redmin_source;
+extern const char* scinadd_source;
+extern const char* scinmax_source;
+extern const char* scinmin_source;
+extern const char* scexadd_source;
+extern const char* scexmax_source;
+extern const char* scexmin_source;
+
+#endif
diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h
new file mode 100644
index 0000000..b30c416
--- /dev/null
+++ b/test_conformance/subgroups/subgroup_common_templates.h
@@ -0,0 +1,911 @@
+//
+// Copyright (c) 2020 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef SUBGROUPCOMMONTEMPLATES_H
+#define SUBGROUPCOMMONTEMPLATES_H
+
+#include "typeWrappers.h"
+#include <bitset>
+#include "CL/cl_half.h"
+#include "subhelpers.h"
+
+#include <set>
+
+typedef std::bitset<128> bs128;
+static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id,
+                                  const std::string &mask_type,
+                                  cl_uint max_sub_group_size)
+{
+    bs128 mask128;
+    cl_uint4 mask;
+    cl_uint pos = subgroup_local_id;
+    if (mask_type == "eq") mask128.set(pos);
+    if (mask_type == "le" || mask_type == "lt")
+    {
+        for (cl_uint i = 0; i <= pos; i++) mask128.set(i);
+        if (mask_type == "lt") mask128.reset(pos);
+    }
+    if (mask_type == "ge" || mask_type == "gt")
+    {
+        for (cl_uint i = pos; i < max_sub_group_size; i++) mask128.set(i);
+        if (mask_type == "gt") mask128.reset(pos);
+    }
+
+    // convert std::bitset<128> to uint4
+    auto const uint_mask = bs128{ static_cast<unsigned long>(-1) };
+    mask.s0 = (mask128 & uint_mask).to_ulong();
+    mask128 >>= 32;
+    mask.s1 = (mask128 & uint_mask).to_ulong();
+    mask128 >>= 32;
+    mask.s2 = (mask128 & uint_mask).to_ulong();
+    mask128 >>= 32;
+    mask.s3 = (mask128 & uint_mask).to_ulong();
+
+    return mask;
+}
+
+// DESCRIPTION :
+// sub_group_broadcast - each work_item registers it's own value.
+// All work_items in subgroup takes one value from only one (any) work_item
+// sub_group_broadcast_first - same as type 0. All work_items in
+// subgroup takes only one value from only one chosen (the smallest subgroup ID)
+// work_item
+// sub_group_non_uniform_broadcast - same as type 0 but
+// only 4 work_items from subgroup enter the code (are active)
+template <typename Ty, SubgroupsBroadcastOp operation> struct BC
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int i, ii, j, k, n;
+        int ng = test_params.global_workgroup_size;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int nj = (nw + ns - 1) / ns;
+        int d = ns > 100 ? 100 : ns;
+        int non_uniform_size = ng % nw;
+        ng = ng / nw;
+        int last_subgroup_size = 0;
+        ii = 0;
+
+        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
+                 TypeManager<Ty>::name());
+        if (non_uniform_size)
+        {
+            log_info("  non uniform work group size mode ON\n");
+            ng++;
+        }
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            if (non_uniform_size && k == ng - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, nj, ns, nw,
+                                          last_subgroup_size);
+            }
+            for (j = 0; j < nj; ++j)
+            { // for each subgroup
+                ii = j * ns;
+                if (last_subgroup_size && j == nj - 1)
+                {
+                    n = last_subgroup_size;
+                }
+                else
+                {
+                    n = ii + ns > nw ? nw - ii : ns;
+                }
+                int bcast_if = 0;
+                int bcast_elseif = 0;
+                int bcast_index = (int)(genrand_int32(gMTdata) & 0x7fffffff)
+                    % (d > n ? n : d);
+                // l - calculate subgroup local id from which value will be
+                // broadcasted (one the same value for whole subgroup)
+                if (operation != SubgroupsBroadcastOp::broadcast)
+                {
+                    // reduce brodcasting index in case of non_uniform and
+                    // last workgroup last subgroup
+                    if (last_subgroup_size && j == nj - 1
+                        && last_subgroup_size < NR_OF_ACTIVE_WORK_ITEMS)
+                    {
+                        bcast_if = bcast_index % last_subgroup_size;
+                        bcast_elseif = bcast_if;
+                    }
+                    else
+                    {
+                        bcast_if = bcast_index % NR_OF_ACTIVE_WORK_ITEMS;
+                        bcast_elseif = NR_OF_ACTIVE_WORK_ITEMS
+                            + bcast_index % (n - NR_OF_ACTIVE_WORK_ITEMS);
+                    }
+                }
+
+                for (i = 0; i < n; ++i)
+                {
+                    if (operation == SubgroupsBroadcastOp::broadcast)
+                    {
+                        int midx = 4 * ii + 4 * i + 2;
+                        m[midx] = (cl_int)bcast_index;
+                    }
+                    else
+                    {
+                        if (i < NR_OF_ACTIVE_WORK_ITEMS)
+                        {
+                            // index of the third
+                            // element int the vector.
+                            int midx = 4 * ii + 4 * i + 2;
+                            // storing information about
+                            // broadcasting index -
+                            // earlier calculated
+                            m[midx] = (cl_int)bcast_if;
+                        }
+                        else
+                        { // index of the third
+                          // element int the vector.
+                            int midx = 4 * ii + 4 * i + 3;
+                            m[midx] = (cl_int)bcast_elseif;
+                        }
+                    }
+
+                    // calculate value for broadcasting
+                    cl_ulong number = genrand_int64(gMTdata);
+                    set_value(t[ii + i], number);
+                }
+            }
+            // Now map into work group using map from device
+            for (j = 0; j < nw; ++j)
+            { // for each element in work_group
+                // calculate index as number of subgroup
+                // plus subgroup local id
+                x[j] = t[j];
+            }
+            x += nw;
+            m += 4 * nw;
+        }
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int ii, i, j, k, l, n;
+        int ng = test_params.global_workgroup_size;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int nj = (nw + ns - 1) / ns;
+        Ty tr, rr;
+        int non_uniform_size = ng % nw;
+        ng = ng / nw;
+        int last_subgroup_size = 0;
+        if (non_uniform_size) ng++;
+
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            if (non_uniform_size && k == ng - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, nj, ns, nw,
+                                          last_subgroup_size);
+            }
+            for (j = 0; j < nw; ++j)
+            { // inside the work_group
+                mx[j] = x[j]; // read host inputs for work_group
+                my[j] = y[j]; // read device outputs for work_group
+            }
+
+            for (j = 0; j < nj; ++j)
+            { // for each subgroup
+                ii = j * ns;
+                if (last_subgroup_size && j == nj - 1)
+                {
+                    n = last_subgroup_size;
+                }
+                else
+                {
+                    n = ii + ns > nw ? nw - ii : ns;
+                }
+
+                // Check result
+                if (operation == SubgroupsBroadcastOp::broadcast_first)
+                {
+                    int lowest_active_id = -1;
+                    for (i = 0; i < n; ++i)
+                    {
+
+                        lowest_active_id = i < NR_OF_ACTIVE_WORK_ITEMS
+                            ? 0
+                            : NR_OF_ACTIVE_WORK_ITEMS;
+                        //  findout if broadcasted
+                        //  value is the same
+                        tr = mx[ii + lowest_active_id];
+                        //  findout if broadcasted to all
+                        rr = my[ii + i];
+
+                        if (!compare(rr, tr))
+                        {
+                            log_error(
+                                "ERROR: sub_group_broadcast_first(%s) "
+                                "mismatch "
+                                "for local id %d in sub group %d in group "
+                                "%d\n",
+                                TypeManager<Ty>::name(), i, j, k);
+                            return TEST_FAIL;
+                        }
+                    }
+                }
+                else
+                {
+                    for (i = 0; i < n; ++i)
+                    {
+                        if (operation == SubgroupsBroadcastOp::broadcast)
+                        {
+                            int midx = 4 * ii + 4 * i + 2;
+                            l = (int)m[midx];
+                            tr = mx[ii + l];
+                        }
+                        else
+                        {
+                            if (i < NR_OF_ACTIVE_WORK_ITEMS)
+                            { // take index of array where info
+                              // which work_item will be
+                              // broadcast its value is stored
+                                int midx = 4 * ii + 4 * i + 2;
+                                // take subgroup local id of
+                                // this work_item
+                                l = (int)m[midx];
+                                // take value generated on host
+                                // for this work_item
+                                tr = mx[ii + l];
+                            }
+                            else
+                            {
+                                int midx = 4 * ii + 4 * i + 3;
+                                l = (int)m[midx];
+                                tr = mx[ii + l];
+                            }
+                        }
+                        rr = my[ii + i]; // read device outputs for
+                                         // work_item in the subgroup
+
+                        if (!compare(rr, tr))
+                        {
+                            log_error("ERROR: sub_group_%s(%s) "
+                                      "mismatch for local id %d in sub "
+                                      "group %d in group %d - got %lu "
+                                      "expected %lu\n",
+                                      operation_names(operation),
+                                      TypeManager<Ty>::name(), i, j, k, rr, tr);
+                            return TEST_FAIL;
+                        }
+                    }
+                }
+            }
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+        log_info("  sub_group_%s(%s)... passed\n", operation_names(operation),
+                 TypeManager<Ty>::name());
+        return TEST_PASS;
+    }
+};
+
+static float to_float(subgroups::cl_half x) { return cl_half_to_float(x.data); }
+
+static subgroups::cl_half to_half(float x)
+{
+    subgroups::cl_half value;
+    value.data = cl_half_from_float(x, CL_HALF_RTE);
+    return value;
+}
+
+// for integer types
+template <typename Ty> inline Ty calculate(Ty a, Ty b, ArithmeticOp operation)
+{
+    switch (operation)
+    {
+        case ArithmeticOp::add_: return a + b;
+        case ArithmeticOp::max_: return a > b ? a : b;
+        case ArithmeticOp::min_: return a < b ? a : b;
+        case ArithmeticOp::mul_: return a * b;
+        case ArithmeticOp::and_: return a & b;
+        case ArithmeticOp::or_: return a | b;
+        case ArithmeticOp::xor_: return a ^ b;
+        case ArithmeticOp::logical_and: return a && b;
+        case ArithmeticOp::logical_or: return a || b;
+        case ArithmeticOp::logical_xor: return !a ^ !b;
+        default: log_error("Unknown operation request"); break;
+    }
+    return 0;
+}
+// Specialize for floating points.
+template <>
+inline cl_double calculate(cl_double a, cl_double b, ArithmeticOp operation)
+{
+    switch (operation)
+    {
+        case ArithmeticOp::add_: {
+            return a + b;
+        }
+        case ArithmeticOp::max_: {
+            return a > b ? a : b;
+        }
+        case ArithmeticOp::min_: {
+            return a < b ? a : b;
+        }
+        case ArithmeticOp::mul_: {
+            return a * b;
+        }
+        default: log_error("Unknown operation request"); break;
+    }
+    return 0;
+}
+
+template <>
+inline cl_float calculate(cl_float a, cl_float b, ArithmeticOp operation)
+{
+    switch (operation)
+    {
+        case ArithmeticOp::add_: {
+            return a + b;
+        }
+        case ArithmeticOp::max_: {
+            return a > b ? a : b;
+        }
+        case ArithmeticOp::min_: {
+            return a < b ? a : b;
+        }
+        case ArithmeticOp::mul_: {
+            return a * b;
+        }
+        default: log_error("Unknown operation request"); break;
+    }
+    return 0;
+}
+
+template <>
+inline subgroups::cl_half calculate(subgroups::cl_half a, subgroups::cl_half b,
+                                    ArithmeticOp operation)
+{
+    switch (operation)
+    {
+        case ArithmeticOp::add_: return to_half(to_float(a) + to_float(b));
+        case ArithmeticOp::max_:
+            return to_float(a) > to_float(b) || is_half_nan(b.data) ? a : b;
+        case ArithmeticOp::min_:
+            return to_float(a) < to_float(b) || is_half_nan(b.data) ? a : b;
+        case ArithmeticOp::mul_: return to_half(to_float(a) * to_float(b));
+        default: log_error("Unknown operation request"); break;
+    }
+    return to_half(0);
+}
+
+template <typename Ty> bool is_floating_point()
+{
+    return std::is_floating_point<Ty>::value
+        || std::is_same<Ty, subgroups::cl_half>::value;
+}
+
+template <typename Ty, ArithmeticOp operation>
+void genrand(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
+{
+    int nj = (nw + ns - 1) / ns;
+
+    for (int k = 0; k < ng; ++k)
+    {
+        for (int j = 0; j < nj; ++j)
+        {
+            int ii = j * ns;
+            int n = ii + ns > nw ? nw - ii : ns;
+
+            for (int i = 0; i < n; ++i)
+            {
+                cl_ulong out_value;
+                double y;
+                if (operation == ArithmeticOp::mul_
+                    || operation == ArithmeticOp::add_)
+                {
+                    // work around to avoid overflow, do not use 0 for
+                    // multiplication
+                    out_value = (genrand_int32(gMTdata) % 4) + 1;
+                }
+                else
+                {
+                    out_value = genrand_int64(gMTdata) % (32 * n);
+                    if ((operation == ArithmeticOp::logical_and
+                         || operation == ArithmeticOp::logical_or
+                         || operation == ArithmeticOp::logical_xor)
+                        && ((out_value >> 32) & 1) == 0)
+                        out_value = 0; // increase probability of false
+                }
+                set_value(t[ii + i], out_value);
+            }
+        }
+
+        // Now map into work group using map from device
+        for (int j = 0; j < nw; ++j)
+        {
+            x[j] = t[j];
+        }
+
+        x += nw;
+        m += 4 * nw;
+    }
+}
+
+template <typename Ty, ShuffleOp operation> struct SHF
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int i, ii, j, k, l, n, delta;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        int nj = (nw + ns - 1) / ns;
+        int d = ns > 100 ? 100 : ns;
+        ii = 0;
+        ng = ng / nw;
+        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
+                 TypeManager<Ty>::name());
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            for (j = 0; j < nj; ++j)
+            { // for each subgroup
+                ii = j * ns;
+                n = ii + ns > nw ? nw - ii : ns;
+                for (i = 0; i < n; ++i)
+                {
+                    int midx = 4 * ii + 4 * i + 2;
+                    l = (int)(genrand_int32(gMTdata) & 0x7fffffff)
+                        % (d > n ? n : d);
+                    switch (operation)
+                    {
+                        case ShuffleOp::shuffle:
+                        case ShuffleOp::shuffle_xor:
+                            // storing information about shuffle index
+                            m[midx] = (cl_int)l;
+                            break;
+                        case ShuffleOp::shuffle_up:
+                            delta = l; // calculate delta for shuffle up
+                            if (i - delta < 0)
+                            {
+                                delta = i;
+                            }
+                            m[midx] = (cl_int)delta;
+                            break;
+                        case ShuffleOp::shuffle_down:
+                            delta = l; // calculate delta for shuffle down
+                            if (i + delta >= n)
+                            {
+                                delta = n - 1 - i;
+                            }
+                            m[midx] = (cl_int)delta;
+                            break;
+                        default: break;
+                    }
+                    cl_ulong number = genrand_int64(gMTdata);
+                    set_value(t[ii + i], number);
+                }
+            }
+            // Now map into work group using map from device
+            for (j = 0; j < nw; ++j)
+            { // for each element in work_group
+                x[j] = t[j];
+            }
+            x += nw;
+            m += 4 * nw;
+        }
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int ii, i, j, k, l, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        int nj = (nw + ns - 1) / ns;
+        Ty tr, rr;
+        ng = ng / nw;
+
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            for (j = 0; j < nw; ++j)
+            { // inside the work_group
+                mx[j] = x[j]; // read host inputs for work_group
+                my[j] = y[j]; // read device outputs for work_group
+            }
+
+            for (j = 0; j < nj; ++j)
+            { // for each subgroup
+                ii = j * ns;
+                n = ii + ns > nw ? nw - ii : ns;
+
+                for (i = 0; i < n; ++i)
+                { // inside the subgroup
+                  // shuffle index storage
+                    int midx = 4 * ii + 4 * i + 2;
+                    l = (int)m[midx];
+                    rr = my[ii + i];
+                    switch (operation)
+                    {
+                        // shuffle basic - treat l as index
+                        case ShuffleOp::shuffle: tr = mx[ii + l]; break;
+                        // shuffle up - treat l as delta
+                        case ShuffleOp::shuffle_up: tr = mx[ii + i - l]; break;
+                        // shuffle up - treat l as delta
+                        case ShuffleOp::shuffle_down:
+                            tr = mx[ii + i + l];
+                            break;
+                        // shuffle xor - treat l as mask
+                        case ShuffleOp::shuffle_xor:
+                            tr = mx[ii + (i ^ l)];
+                            break;
+                        default: break;
+                    }
+
+                    if (!compare(rr, tr))
+                    {
+                        log_error("ERROR: sub_group_%s(%s) mismatch for "
+                                  "local id %d in sub group %d in group %d\n",
+                                  operation_names(operation),
+                                  TypeManager<Ty>::name(), i, j, k);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+        log_info("  sub_group_%s(%s)... passed\n", operation_names(operation),
+                 TypeManager<Ty>::name());
+        return TEST_PASS;
+    }
+};
+
+template <typename Ty, ArithmeticOp operation> struct SCEX_NU
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        ng = ng / nw;
+        std::string func_name;
+        work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
+                        : func_name = "sub_group_scan_exclusive";
+        log_info("  %s_%s(%s)...\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name());
+        log_info("  test params: global size = %d local size = %d subgroups "
+                 "size = %d work item mask = 0x%x \n",
+                 test_params.global_workgroup_size, nw, ns, work_items_mask);
+        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int ii, i, j, k, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        int nj = (nw + ns - 1) / ns;
+        Ty tr, rr;
+        ng = ng / nw;
+
+        std::string func_name;
+        work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
+                        : func_name = "sub_group_scan_exclusive";
+
+        uint32_t use_work_items_mask;
+        // for uniform case take into consideration all workitems
+        use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            // Map to array indexed to array indexed by local ID and sub group
+            for (j = 0; j < nw; ++j)
+            { // inside the work_group
+                mx[j] = x[j]; // read host inputs for work_group
+                my[j] = y[j]; // read device outputs for work_group
+            }
+            for (j = 0; j < nj; ++j)
+            {
+                ii = j * ns;
+                n = ii + ns > nw ? nw - ii : ns;
+                std::set<int> active_work_items;
+                for (i = 0; i < n; ++i)
+                {
+                    uint32_t check_work_item = 1 << (i % 32);
+                    if (use_work_items_mask & check_work_item)
+                    {
+                        active_work_items.insert(i);
+                    }
+                }
+                if (active_work_items.empty())
+                {
+                    log_info("  No acitve workitems in workgroup id = %d "
+                             "subgroup id = %d - no calculation\n",
+                             k, j);
+                    continue;
+                }
+                else if (active_work_items.size() == 1)
+                {
+                    log_info("  One active workitem in workgroup id = %d "
+                             "subgroup id = %d - no calculation\n",
+                             k, j);
+                    continue;
+                }
+                else
+                {
+                    tr = TypeManager<Ty>::identify_limits(operation);
+                    int idx = 0;
+                    for (const int &active_work_item : active_work_items)
+                    {
+                        rr = my[ii + active_work_item];
+                        if (idx == 0) continue;
+
+                        if (!compare_ordered(rr, tr))
+                        {
+                            log_error(
+                                "ERROR: %s_%s(%s) "
+                                "mismatch for local id %d in sub group %d in "
+                                "group %d Expected: %d Obtained: %d\n",
+                                func_name.c_str(), operation_names(operation),
+                                TypeManager<Ty>::name(), i, j, k, tr, rr);
+                            return TEST_FAIL;
+                        }
+                        tr = calculate<Ty>(tr, mx[ii + active_work_item],
+                                           operation);
+                        idx++;
+                    }
+                }
+            }
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+
+        log_info("  %s_%s(%s)... passed\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name());
+        return TEST_PASS;
+    }
+};
+
+// Test for scan inclusive non uniform functions
+template <typename Ty, ArithmeticOp operation> struct SCIN_NU
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        ng = ng / nw;
+        std::string func_name;
+        work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
+                        : func_name = "sub_group_scan_inclusive";
+
+        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+        log_info("  %s_%s(%s)...\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name());
+        log_info("  test params: global size = %d local size = %d subgroups "
+                 "size = %d work item mask = 0x%x \n",
+                 test_params.global_workgroup_size, nw, ns, work_items_mask);
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int ii, i, j, k, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        int nj = (nw + ns - 1) / ns;
+        Ty tr, rr;
+        ng = ng / nw;
+
+        std::string func_name;
+        work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
+                        : func_name = "sub_group_scan_inclusive";
+
+        uint32_t use_work_items_mask;
+        // for uniform case take into consideration all workitems
+        use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
+        // std::bitset<32> mask32(use_work_items_mask);
+        // for (int k) mask32.count();
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            // Map to array indexed to array indexed by local ID and sub group
+            for (j = 0; j < nw; ++j)
+            { // inside the work_group
+                mx[j] = x[j]; // read host inputs for work_group
+                my[j] = y[j]; // read device outputs for work_group
+            }
+            for (j = 0; j < nj; ++j)
+            {
+                ii = j * ns;
+                n = ii + ns > nw ? nw - ii : ns;
+                std::set<int> active_work_items;
+                int catch_frist_active = -1;
+
+                for (i = 0; i < n; ++i)
+                {
+                    uint32_t check_work_item = 1 << (i % 32);
+                    if (use_work_items_mask & check_work_item)
+                    {
+                        if (catch_frist_active == -1)
+                        {
+                            catch_frist_active = i;
+                        }
+                        active_work_items.insert(i);
+                    }
+                }
+                if (active_work_items.empty())
+                {
+                    log_info("  No acitve workitems in workgroup id = %d "
+                             "subgroup id = %d - no calculation\n",
+                             k, j);
+                    continue;
+                }
+                else
+                {
+                    tr = TypeManager<Ty>::identify_limits(operation);
+                    for (const int &active_work_item : active_work_items)
+                    {
+                        rr = my[ii + active_work_item];
+                        if (active_work_items.size() == 1)
+                        {
+                            tr = mx[ii + catch_frist_active];
+                        }
+                        else
+                        {
+                            tr = calculate<Ty>(tr, mx[ii + active_work_item],
+                                               operation);
+                        }
+                        if (!compare_ordered<Ty>(rr, tr))
+                        {
+                            log_error(
+                                "ERROR: %s_%s(%s) "
+                                "mismatch for local id %d in sub group %d "
+                                "in "
+                                "group %d Expected: %d Obtained: %d\n",
+                                func_name.c_str(), operation_names(operation),
+                                TypeManager<Ty>::name(), active_work_item, j, k,
+                                tr, rr);
+                            return TEST_FAIL;
+                        }
+                    }
+                }
+            }
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+
+        log_info("  %s_%s(%s)... passed\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name());
+        return TEST_PASS;
+    }
+};
+
+// Test for reduce non uniform functions
+template <typename Ty, ArithmeticOp operation> struct RED_NU
+{
+
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        ng = ng / nw;
+        std::string func_name;
+
+        work_items_mask ? func_name = "sub_group_non_uniform_reduce"
+                        : func_name = "sub_group_reduce";
+        log_info("  %s_%s(%s)...\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name());
+        log_info("  test params: global size = %d local size = %d subgroups "
+                 "size = %d work item mask = 0x%x \n",
+                 test_params.global_workgroup_size, nw, ns, work_items_mask);
+        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int ii, i, j, k, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        int nj = (nw + ns - 1) / ns;
+        ng = ng / nw;
+        Ty tr, rr;
+
+        std::string func_name;
+        work_items_mask ? func_name = "sub_group_non_uniform_reduce"
+                        : func_name = "sub_group_reduce";
+
+        for (k = 0; k < ng; ++k)
+        {
+            // Map to array indexed to array indexed by local ID and sub
+            // group
+            for (j = 0; j < nw; ++j)
+            {
+                mx[j] = x[j];
+                my[j] = y[j];
+            }
+
+            uint32_t use_work_items_mask;
+            use_work_items_mask =
+                !work_items_mask ? 0xFFFFFFFF : work_items_mask;
+
+            for (j = 0; j < nj; ++j)
+            {
+                ii = j * ns;
+                n = ii + ns > nw ? nw - ii : ns;
+                std::set<int> active_work_items;
+                int catch_frist_active = -1;
+                for (i = 0; i < n; ++i)
+                {
+                    uint32_t check_work_item = 1 << (i % 32);
+                    if (use_work_items_mask & check_work_item)
+                    {
+                        if (catch_frist_active == -1)
+                        {
+                            catch_frist_active = i;
+                            tr = mx[ii + i];
+                            active_work_items.insert(i);
+                            continue;
+                        }
+                        active_work_items.insert(i);
+                        tr = calculate<Ty>(tr, mx[ii + i], operation);
+                    }
+                }
+
+                if (active_work_items.empty())
+                {
+                    log_info("  No acitve workitems in workgroup id = %d "
+                             "subgroup id = %d - no calculation\n",
+                             k, j);
+                    continue;
+                }
+
+                for (const int &active_work_item : active_work_items)
+                {
+                    rr = my[ii + active_work_item];
+                    if (!compare_ordered<Ty>(rr, tr))
+                    {
+                        log_error("ERROR: %s_%s(%s) "
+                                  "mismatch for local id %d in sub group %d in "
+                                  "group %d Expected: %d Obtained: %d\n",
+                                  func_name.c_str(), operation_names(operation),
+                                  TypeManager<Ty>::name(), active_work_item, j,
+                                  k, tr, rr);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+
+        log_info("  %s_%s(%s)... passed\n", func_name.c_str(),
+                 operation_names(operation), TypeManager<Ty>::name());
+        return TEST_PASS;
+    }
+};
+
+#endif
diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h
index dc49af2..93673b3 100644
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -19,13 +19,176 @@
 #include "testHarness.h"
 #include "kernelHelpers.h"
 #include "typeWrappers.h"
+#include "imageHelpers.h"
 
 #include <limits>
 #include <vector>
+#include <type_traits>
+
+#define NR_OF_ACTIVE_WORK_ITEMS 4
+
+extern MTdata gMTdata;
+
+struct WorkGroupParams
+{
+    WorkGroupParams(size_t gws, size_t lws,
+                    const std::vector<std::string> &req_ext = {},
+                    const std::vector<uint32_t> &all_wim = {})
+        : global_workgroup_size(gws), local_workgroup_size(lws),
+          required_extensions(req_ext), all_work_item_masks(all_wim)
+    {
+        subgroup_size = 0;
+        work_items_mask = 0;
+        use_core_subgroups = true;
+        dynsc = 0;
+    }
+    size_t global_workgroup_size;
+    size_t local_workgroup_size;
+    size_t subgroup_size;
+    uint32_t work_items_mask;
+    int dynsc;
+    bool use_core_subgroups;
+    std::vector<std::string> required_extensions;
+    std::vector<uint32_t> all_work_item_masks;
+};
+
+enum class SubgroupsBroadcastOp
+{
+    broadcast,
+    broadcast_first,
+    non_uniform_broadcast
+};
+
+enum class NonUniformVoteOp
+{
+    elect,
+    all,
+    any,
+    all_equal
+};
+
+enum class BallotOp
+{
+    ballot,
+    inverse_ballot,
+    ballot_bit_extract,
+    ballot_bit_count,
+    ballot_inclusive_scan,
+    ballot_exclusive_scan,
+    ballot_find_lsb,
+    ballot_find_msb,
+    eq_mask,
+    ge_mask,
+    gt_mask,
+    le_mask,
+    lt_mask,
+};
+
+enum class ShuffleOp
+{
+    shuffle,
+    shuffle_up,
+    shuffle_down,
+    shuffle_xor
+};
+
+enum class ArithmeticOp
+{
+    add_,
+    max_,
+    min_,
+    mul_,
+    and_,
+    or_,
+    xor_,
+    logical_and,
+    logical_or,
+    logical_xor
+};
+
+static const char *const operation_names(ArithmeticOp operation)
+{
+    switch (operation)
+    {
+        case ArithmeticOp::add_: return "add";
+        case ArithmeticOp::max_: return "max";
+        case ArithmeticOp::min_: return "min";
+        case ArithmeticOp::mul_: return "mul";
+        case ArithmeticOp::and_: return "and";
+        case ArithmeticOp::or_: return "or";
+        case ArithmeticOp::xor_: return "xor";
+        case ArithmeticOp::logical_and: return "logical_and";
+        case ArithmeticOp::logical_or: return "logical_or";
+        case ArithmeticOp::logical_xor: return "logical_xor";
+        default: log_error("Unknown operation request"); break;
+    }
+    return "";
+}
+
+static const char *const operation_names(BallotOp operation)
+{
+    switch (operation)
+    {
+        case BallotOp::ballot: return "ballot";
+        case BallotOp::inverse_ballot: return "inverse_ballot";
+        case BallotOp::ballot_bit_extract: return "bit_extract";
+        case BallotOp::ballot_bit_count: return "bit_count";
+        case BallotOp::ballot_inclusive_scan: return "inclusive_scan";
+        case BallotOp::ballot_exclusive_scan: return "exclusive_scan";
+        case BallotOp::ballot_find_lsb: return "find_lsb";
+        case BallotOp::ballot_find_msb: return "find_msb";
+        case BallotOp::eq_mask: return "eq";
+        case BallotOp::ge_mask: return "ge";
+        case BallotOp::gt_mask: return "gt";
+        case BallotOp::le_mask: return "le";
+        case BallotOp::lt_mask: return "lt";
+        default: log_error("Unknown operation request"); break;
+    }
+    return "";
+}
+
+static const char *const operation_names(ShuffleOp operation)
+{
+    switch (operation)
+    {
+        case ShuffleOp::shuffle: return "shuffle";
+        case ShuffleOp::shuffle_up: return "shuffle_up";
+        case ShuffleOp::shuffle_down: return "shuffle_down";
+        case ShuffleOp::shuffle_xor: return "shuffle_xor";
+        default: log_error("Unknown operation request"); break;
+    }
+    return "";
+}
+
+static const char *const operation_names(NonUniformVoteOp operation)
+{
+    switch (operation)
+    {
+        case NonUniformVoteOp::all: return "all";
+        case NonUniformVoteOp::all_equal: return "all_equal";
+        case NonUniformVoteOp::any: return "any";
+        case NonUniformVoteOp::elect: return "elect";
+        default: log_error("Unknown operation request"); break;
+    }
+    return "";
+}
+
+static const char *const operation_names(SubgroupsBroadcastOp operation)
+{
+    switch (operation)
+    {
+        case SubgroupsBroadcastOp::broadcast: return "broadcast";
+        case SubgroupsBroadcastOp::broadcast_first: return "broadcast_first";
+        case SubgroupsBroadcastOp::non_uniform_broadcast:
+            return "non_uniform_broadcast";
+        default: log_error("Unknown operation request"); break;
+    }
+    return "";
+}
 
 class subgroupsAPI {
 public:
-    subgroupsAPI(cl_platform_id platform, bool useCoreSubgroups)
+    subgroupsAPI(cl_platform_id platform, bool use_core_subgroups)
     {
         static_assert(CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE
                           == CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,
@@ -33,7 +196,7 @@
         static_assert(CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE
                           == CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR,
                       "Enums have to be the same");
-        if (useCoreSubgroups)
+        if (use_core_subgroups)
         {
             _clGetKernelSubGroupInfo_ptr = &clGetKernelSubGroupInfo;
             clGetKernelSubGroupInfo_name = "clGetKernelSubGroupInfo";
@@ -56,163 +219,76 @@
     clGetKernelSubGroupInfoKHR_fn _clGetKernelSubGroupInfo_ptr;
 };
 
-// Some template helpers
-template <typename Ty> struct TypeName;
-template <> struct TypeName<cl_half>
+// Need to defined custom type for vector size = 3 and half type. This is
+// because of 3-component types are otherwise indistinguishable from the
+// 4-component types, and because the half type is indistinguishable from some
+// other 16-bit type (ushort)
+namespace subgroups {
+struct cl_char3
 {
-    static const char *val() { return "half"; }
+    ::cl_char3 data;
 };
-template <> struct TypeName<cl_uint>
+struct cl_uchar3
 {
-    static const char *val() { return "uint"; }
+    ::cl_uchar3 data;
 };
-template <> struct TypeName<cl_int>
+struct cl_short3
 {
-    static const char *val() { return "int"; }
+    ::cl_short3 data;
 };
-template <> struct TypeName<cl_ulong>
+struct cl_ushort3
 {
-    static const char *val() { return "ulong"; }
+    ::cl_ushort3 data;
 };
-template <> struct TypeName<cl_long>
+struct cl_int3
 {
-    static const char *val() { return "long"; }
+    ::cl_int3 data;
 };
-template <> struct TypeName<float>
+struct cl_uint3
 {
-    static const char *val() { return "float"; }
+    ::cl_uint3 data;
 };
-template <> struct TypeName<double>
+struct cl_long3
 {
-    static const char *val() { return "double"; }
+    ::cl_long3 data;
 };
-
-template <typename Ty> struct TypeDef;
-template <> struct TypeDef<cl_half>
+struct cl_ulong3
 {
-    static const char *val() { return "typedef half Type;\n"; }
+    ::cl_ulong3 data;
 };
-template <> struct TypeDef<cl_uint>
+struct cl_float3
 {
-    static const char *val() { return "typedef uint Type;\n"; }
+    ::cl_float3 data;
 };
-template <> struct TypeDef<cl_int>
+struct cl_double3
 {
-    static const char *val() { return "typedef int Type;\n"; }
+    ::cl_double3 data;
 };
-template <> struct TypeDef<cl_ulong>
+struct cl_half
 {
-    static const char *val() { return "typedef ulong Type;\n"; }
+    ::cl_half data;
 };
-template <> struct TypeDef<cl_long>
+struct cl_half2
 {
-    static const char *val() { return "typedef long Type;\n"; }
+    ::cl_half2 data;
 };
-template <> struct TypeDef<float>
+struct cl_half3
 {
-    static const char *val() { return "typedef float Type;\n"; }
+    ::cl_half3 data;
 };
-template <> struct TypeDef<double>
+struct cl_half4
 {
-    static const char *val() { return "typedef double Type;\n"; }
+    ::cl_half4 data;
 };
-
-template <typename Ty, int Which> struct TypeIdentity;
-// template <> struct TypeIdentity<cl_half,0> { static cl_half val() { return
-// (cl_half)0.0; } }; template <> struct TypeIdentity<cl_half,0> { static
-// cl_half val() { return -(cl_half)65536.0; } }; template <> struct
-// TypeIdentity<cl_half,0> { static cl_half val() { return (cl_half)65536.0; }
-// };
-
-template <> struct TypeIdentity<cl_uint, 0>
+struct cl_half8
 {
-    static cl_uint val() { return (cl_uint)0; }
+    ::cl_half8 data;
 };
-template <> struct TypeIdentity<cl_uint, 1>
+struct cl_half16
 {
-    static cl_uint val() { return (cl_uint)0; }
+    ::cl_half16 data;
 };
-template <> struct TypeIdentity<cl_uint, 2>
-{
-    static cl_uint val() { return (cl_uint)0xffffffff; }
-};
-
-template <> struct TypeIdentity<cl_int, 0>
-{
-    static cl_int val() { return (cl_int)0; }
-};
-template <> struct TypeIdentity<cl_int, 1>
-{
-    static cl_int val() { return (cl_int)0x80000000; }
-};
-template <> struct TypeIdentity<cl_int, 2>
-{
-    static cl_int val() { return (cl_int)0x7fffffff; }
-};
-
-template <> struct TypeIdentity<cl_ulong, 0>
-{
-    static cl_ulong val() { return (cl_ulong)0; }
-};
-template <> struct TypeIdentity<cl_ulong, 1>
-{
-    static cl_ulong val() { return (cl_ulong)0; }
-};
-template <> struct TypeIdentity<cl_ulong, 2>
-{
-    static cl_ulong val() { return (cl_ulong)0xffffffffffffffffULL; }
-};
-
-template <> struct TypeIdentity<cl_long, 0>
-{
-    static cl_long val() { return (cl_long)0; }
-};
-template <> struct TypeIdentity<cl_long, 1>
-{
-    static cl_long val() { return (cl_long)0x8000000000000000ULL; }
-};
-template <> struct TypeIdentity<cl_long, 2>
-{
-    static cl_long val() { return (cl_long)0x7fffffffffffffffULL; }
-};
-
-
-template <> struct TypeIdentity<float, 0>
-{
-    static float val() { return 0.F; }
-};
-template <> struct TypeIdentity<float, 1>
-{
-    static float val() { return -std::numeric_limits<float>::infinity(); }
-};
-template <> struct TypeIdentity<float, 2>
-{
-    static float val() { return std::numeric_limits<float>::infinity(); }
-};
-
-template <> struct TypeIdentity<double, 0>
-{
-    static double val() { return 0.L; }
-};
-
-template <> struct TypeIdentity<double, 1>
-{
-    static double val() { return -std::numeric_limits<double>::infinity(); }
-};
-template <> struct TypeIdentity<double, 2>
-{
-    static double val() { return std::numeric_limits<double>::infinity(); }
-};
-
-template <typename Ty> struct TypeCheck;
-template <> struct TypeCheck<cl_uint>
-{
-    static bool val(cl_device_id) { return true; }
-};
-template <> struct TypeCheck<cl_int>
-{
-    static bool val(cl_device_id) { return true; }
-};
+}
 
 static bool int64_ok(cl_device_id device)
 {
@@ -233,43 +309,860 @@
     return true;
 }
 
-template <> struct TypeCheck<cl_ulong>
+static bool double_ok(cl_device_id device)
 {
-    static bool val(cl_device_id device) { return int64_ok(device); }
-};
-template <> struct TypeCheck<cl_long>
-{
-    static bool val(cl_device_id device) { return int64_ok(device); }
-};
-template <> struct TypeCheck<cl_float>
-{
-    static bool val(cl_device_id) { return true; }
-};
-template <> struct TypeCheck<cl_half>
-{
-    static bool val(cl_device_id device)
+    int error;
+    cl_device_fp_config c;
+    error = clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(c),
+                            (void *)&c, NULL);
+    if (error)
     {
-        return is_extension_available(device, "cl_khr_fp16");
+        log_info("clGetDeviceInfo failed with CL_DEVICE_DOUBLE_FP_CONFIG\n");
+        return false;
     }
-};
-template <> struct TypeCheck<double>
+    return c != 0;
+}
+
+static bool half_ok(cl_device_id device)
 {
-    static bool val(cl_device_id device)
+    int error;
+    cl_device_fp_config c;
+    error = clGetDeviceInfo(device, CL_DEVICE_HALF_FP_CONFIG, sizeof(c),
+                            (void *)&c, NULL);
+    if (error)
     {
-        int error;
-        cl_device_fp_config c;
-        error = clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(c),
-                                (void *)&c, NULL);
-        if (error)
+        log_info("clGetDeviceInfo failed with CL_DEVICE_HALF_FP_CONFIG\n");
+        return false;
+    }
+    return c != 0;
+}
+
+template <typename Ty> struct CommonTypeManager
+{
+
+    static const char *name() { return ""; }
+    static const char *add_typedef() { return "\n"; }
+    typedef std::false_type is_vector_type;
+    typedef std::false_type is_sb_vector_size3;
+    typedef std::false_type is_sb_vector_type;
+    typedef std::false_type is_sb_scalar_type;
+    static const bool type_supported(cl_device_id) { return true; }
+    static const Ty identify_limits(ArithmeticOp operation)
+    {
+        switch (operation)
         {
-            log_info(
-                "clGetDeviceInfo failed with CL_DEVICE_DOUBLE_FP_CONFIG\n");
-            return false;
+            case ArithmeticOp::add_: return (Ty)0;
+            case ArithmeticOp::max_: return (std::numeric_limits<Ty>::min)();
+            case ArithmeticOp::min_: return (std::numeric_limits<Ty>::max)();
+            case ArithmeticOp::mul_: return (Ty)1;
+            case ArithmeticOp::and_: return (Ty)~0;
+            case ArithmeticOp::or_: return (Ty)0;
+            case ArithmeticOp::xor_: return (Ty)0;
+            default: log_error("Unknown operation request"); break;
         }
-        return c != 0;
+        return 0;
     }
 };
 
+template <typename> struct TypeManager;
+
+template <> struct TypeManager<cl_int> : public CommonTypeManager<cl_int>
+{
+    static const char *name() { return "int"; }
+    static const char *add_typedef() { return "typedef int Type;\n"; }
+    static cl_int identify_limits(ArithmeticOp operation)
+    {
+        switch (operation)
+        {
+            case ArithmeticOp::add_: return (cl_int)0;
+            case ArithmeticOp::max_:
+                return (std::numeric_limits<cl_int>::min)();
+            case ArithmeticOp::min_:
+                return (std::numeric_limits<cl_int>::max)();
+            case ArithmeticOp::mul_: return (cl_int)1;
+            case ArithmeticOp::and_: return (cl_int)~0;
+            case ArithmeticOp::or_: return (cl_int)0;
+            case ArithmeticOp::xor_: return (cl_int)0;
+            case ArithmeticOp::logical_and: return (cl_int)1;
+            case ArithmeticOp::logical_or: return (cl_int)0;
+            case ArithmeticOp::logical_xor: return (cl_int)0;
+            default: log_error("Unknown operation request"); break;
+        }
+        return 0;
+    }
+};
+template <> struct TypeManager<cl_int2> : public CommonTypeManager<cl_int2>
+{
+    static const char *name() { return "int2"; }
+    static const char *add_typedef() { return "typedef int2 Type;\n"; }
+    typedef std::true_type is_vector_type;
+    using scalar_type = cl_int;
+};
+template <>
+struct TypeManager<subgroups::cl_int3>
+    : public CommonTypeManager<subgroups::cl_int3>
+{
+    static const char *name() { return "int3"; }
+    static const char *add_typedef() { return "typedef int3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_int;
+};
+template <> struct TypeManager<cl_int4> : public CommonTypeManager<cl_int4>
+{
+    static const char *name() { return "int4"; }
+    static const char *add_typedef() { return "typedef int4 Type;\n"; }
+    using scalar_type = cl_int;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_int8> : public CommonTypeManager<cl_int8>
+{
+    static const char *name() { return "int8"; }
+    static const char *add_typedef() { return "typedef int8 Type;\n"; }
+    using scalar_type = cl_int;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_int16> : public CommonTypeManager<cl_int16>
+{
+    static const char *name() { return "int16"; }
+    static const char *add_typedef() { return "typedef int16 Type;\n"; }
+    using scalar_type = cl_int;
+    typedef std::true_type is_vector_type;
+};
+// cl_uint
+template <> struct TypeManager<cl_uint> : public CommonTypeManager<cl_uint>
+{
+    static const char *name() { return "uint"; }
+    static const char *add_typedef() { return "typedef uint Type;\n"; }
+};
+template <> struct TypeManager<cl_uint2> : public CommonTypeManager<cl_uint2>
+{
+    static const char *name() { return "uint2"; }
+    static const char *add_typedef() { return "typedef uint2 Type;\n"; }
+    using scalar_type = cl_uint;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<subgroups::cl_uint3>
+    : public CommonTypeManager<subgroups::cl_uint3>
+{
+    static const char *name() { return "uint3"; }
+    static const char *add_typedef() { return "typedef uint3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_uint;
+};
+template <> struct TypeManager<cl_uint4> : public CommonTypeManager<cl_uint4>
+{
+    static const char *name() { return "uint4"; }
+    static const char *add_typedef() { return "typedef uint4 Type;\n"; }
+    using scalar_type = cl_uint;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_uint8> : public CommonTypeManager<cl_uint8>
+{
+    static const char *name() { return "uint8"; }
+    static const char *add_typedef() { return "typedef uint8 Type;\n"; }
+    using scalar_type = cl_uint;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_uint16> : public CommonTypeManager<cl_uint16>
+{
+    static const char *name() { return "uint16"; }
+    static const char *add_typedef() { return "typedef uint16 Type;\n"; }
+    using scalar_type = cl_uint;
+    typedef std::true_type is_vector_type;
+};
+// cl_short
+template <> struct TypeManager<cl_short> : public CommonTypeManager<cl_short>
+{
+    static const char *name() { return "short"; }
+    static const char *add_typedef() { return "typedef short Type;\n"; }
+};
+template <> struct TypeManager<cl_short2> : public CommonTypeManager<cl_short2>
+{
+    static const char *name() { return "short2"; }
+    static const char *add_typedef() { return "typedef short2 Type;\n"; }
+    using scalar_type = cl_short;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<subgroups::cl_short3>
+    : public CommonTypeManager<subgroups::cl_short3>
+{
+    static const char *name() { return "short3"; }
+    static const char *add_typedef() { return "typedef short3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_short;
+};
+template <> struct TypeManager<cl_short4> : public CommonTypeManager<cl_short4>
+{
+    static const char *name() { return "short4"; }
+    static const char *add_typedef() { return "typedef short4 Type;\n"; }
+    using scalar_type = cl_short;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_short8> : public CommonTypeManager<cl_short8>
+{
+    static const char *name() { return "short8"; }
+    static const char *add_typedef() { return "typedef short8 Type;\n"; }
+    using scalar_type = cl_short;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<cl_short16> : public CommonTypeManager<cl_short16>
+{
+    static const char *name() { return "short16"; }
+    static const char *add_typedef() { return "typedef short16 Type;\n"; }
+    using scalar_type = cl_short;
+    typedef std::true_type is_vector_type;
+};
+// cl_ushort
+template <> struct TypeManager<cl_ushort> : public CommonTypeManager<cl_ushort>
+{
+    static const char *name() { return "ushort"; }
+    static const char *add_typedef() { return "typedef ushort Type;\n"; }
+};
+template <>
+struct TypeManager<cl_ushort2> : public CommonTypeManager<cl_ushort2>
+{
+    static const char *name() { return "ushort2"; }
+    static const char *add_typedef() { return "typedef ushort2 Type;\n"; }
+    using scalar_type = cl_ushort;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<subgroups::cl_ushort3>
+    : public CommonTypeManager<subgroups::cl_ushort3>
+{
+    static const char *name() { return "ushort3"; }
+    static const char *add_typedef() { return "typedef ushort3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_ushort;
+};
+template <>
+struct TypeManager<cl_ushort4> : public CommonTypeManager<cl_ushort4>
+{
+    static const char *name() { return "ushort4"; }
+    static const char *add_typedef() { return "typedef ushort4 Type;\n"; }
+    using scalar_type = cl_ushort;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<cl_ushort8> : public CommonTypeManager<cl_ushort8>
+{
+    static const char *name() { return "ushort8"; }
+    static const char *add_typedef() { return "typedef ushort8 Type;\n"; }
+    using scalar_type = cl_ushort;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<cl_ushort16> : public CommonTypeManager<cl_ushort16>
+{
+    static const char *name() { return "ushort16"; }
+    static const char *add_typedef() { return "typedef ushort16 Type;\n"; }
+    using scalar_type = cl_ushort;
+    typedef std::true_type is_vector_type;
+};
+// cl_char
+template <> struct TypeManager<cl_char> : public CommonTypeManager<cl_char>
+{
+    static const char *name() { return "char"; }
+    static const char *add_typedef() { return "typedef char Type;\n"; }
+};
+template <> struct TypeManager<cl_char2> : public CommonTypeManager<cl_char2>
+{
+    static const char *name() { return "char2"; }
+    static const char *add_typedef() { return "typedef char2 Type;\n"; }
+    using scalar_type = cl_char;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<subgroups::cl_char3>
+    : public CommonTypeManager<subgroups::cl_char3>
+{
+    static const char *name() { return "char3"; }
+    static const char *add_typedef() { return "typedef char3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_char;
+};
+template <> struct TypeManager<cl_char4> : public CommonTypeManager<cl_char4>
+{
+    static const char *name() { return "char4"; }
+    static const char *add_typedef() { return "typedef char4 Type;\n"; }
+    using scalar_type = cl_char;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_char8> : public CommonTypeManager<cl_char8>
+{
+    static const char *name() { return "char8"; }
+    static const char *add_typedef() { return "typedef char8 Type;\n"; }
+    using scalar_type = cl_char;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_char16> : public CommonTypeManager<cl_char16>
+{
+    static const char *name() { return "char16"; }
+    static const char *add_typedef() { return "typedef char16 Type;\n"; }
+    using scalar_type = cl_char;
+    typedef std::true_type is_vector_type;
+};
+// cl_uchar
+template <> struct TypeManager<cl_uchar> : public CommonTypeManager<cl_uchar>
+{
+    static const char *name() { return "uchar"; }
+    static const char *add_typedef() { return "typedef uchar Type;\n"; }
+};
+template <> struct TypeManager<cl_uchar2> : public CommonTypeManager<cl_uchar2>
+{
+    static const char *name() { return "uchar2"; }
+    static const char *add_typedef() { return "typedef uchar2 Type;\n"; }
+    using scalar_type = cl_uchar;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<subgroups::cl_uchar3>
+    : public CommonTypeManager<subgroups::cl_char3>
+{
+    static const char *name() { return "uchar3"; }
+    static const char *add_typedef() { return "typedef uchar3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_uchar;
+};
+template <> struct TypeManager<cl_uchar4> : public CommonTypeManager<cl_uchar4>
+{
+    static const char *name() { return "uchar4"; }
+    static const char *add_typedef() { return "typedef uchar4 Type;\n"; }
+    using scalar_type = cl_uchar;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_uchar8> : public CommonTypeManager<cl_uchar8>
+{
+    static const char *name() { return "uchar8"; }
+    static const char *add_typedef() { return "typedef uchar8 Type;\n"; }
+    using scalar_type = cl_uchar;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<cl_uchar16> : public CommonTypeManager<cl_uchar16>
+{
+    static const char *name() { return "uchar16"; }
+    static const char *add_typedef() { return "typedef uchar16 Type;\n"; }
+    using scalar_type = cl_uchar;
+    typedef std::true_type is_vector_type;
+};
+// cl_long
+template <> struct TypeManager<cl_long> : public CommonTypeManager<cl_long>
+{
+    static const char *name() { return "long"; }
+    static const char *add_typedef() { return "typedef long Type;\n"; }
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <> struct TypeManager<cl_long2> : public CommonTypeManager<cl_long2>
+{
+    static const char *name() { return "long2"; }
+    static const char *add_typedef() { return "typedef long2 Type;\n"; }
+    using scalar_type = cl_long;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <>
+struct TypeManager<subgroups::cl_long3>
+    : public CommonTypeManager<subgroups::cl_long3>
+{
+    static const char *name() { return "long3"; }
+    static const char *add_typedef() { return "typedef long3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_long;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <> struct TypeManager<cl_long4> : public CommonTypeManager<cl_long4>
+{
+    static const char *name() { return "long4"; }
+    static const char *add_typedef() { return "typedef long4 Type;\n"; }
+    using scalar_type = cl_long;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <> struct TypeManager<cl_long8> : public CommonTypeManager<cl_long8>
+{
+    static const char *name() { return "long8"; }
+    static const char *add_typedef() { return "typedef long8 Type;\n"; }
+    using scalar_type = cl_long;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <> struct TypeManager<cl_long16> : public CommonTypeManager<cl_long16>
+{
+    static const char *name() { return "long16"; }
+    static const char *add_typedef() { return "typedef long16 Type;\n"; }
+    using scalar_type = cl_long;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+// cl_ulong
+template <> struct TypeManager<cl_ulong> : public CommonTypeManager<cl_ulong>
+{
+    static const char *name() { return "ulong"; }
+    static const char *add_typedef() { return "typedef ulong Type;\n"; }
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <> struct TypeManager<cl_ulong2> : public CommonTypeManager<cl_ulong2>
+{
+    static const char *name() { return "ulong2"; }
+    static const char *add_typedef() { return "typedef ulong2 Type;\n"; }
+    using scalar_type = cl_ulong;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <>
+struct TypeManager<subgroups::cl_ulong3>
+    : public CommonTypeManager<subgroups::cl_ulong3>
+{
+    static const char *name() { return "ulong3"; }
+    static const char *add_typedef() { return "typedef ulong3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_ulong;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <> struct TypeManager<cl_ulong4> : public CommonTypeManager<cl_ulong4>
+{
+    static const char *name() { return "ulong4"; }
+    static const char *add_typedef() { return "typedef ulong4 Type;\n"; }
+    using scalar_type = cl_ulong;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <> struct TypeManager<cl_ulong8> : public CommonTypeManager<cl_ulong8>
+{
+    static const char *name() { return "ulong8"; }
+    static const char *add_typedef() { return "typedef ulong8 Type;\n"; }
+    using scalar_type = cl_ulong;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+template <>
+struct TypeManager<cl_ulong16> : public CommonTypeManager<cl_ulong16>
+{
+    static const char *name() { return "ulong16"; }
+    static const char *add_typedef() { return "typedef ulong16 Type;\n"; }
+    using scalar_type = cl_ulong;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return int64_ok(device);
+    }
+};
+
+// cl_float
+template <> struct TypeManager<cl_float> : public CommonTypeManager<cl_float>
+{
+    static const char *name() { return "float"; }
+    static const char *add_typedef() { return "typedef float Type;\n"; }
+    static cl_float identify_limits(ArithmeticOp operation)
+    {
+        switch (operation)
+        {
+            case ArithmeticOp::add_: return 0.0f;
+            case ArithmeticOp::max_:
+                return -std::numeric_limits<float>::infinity();
+            case ArithmeticOp::min_:
+                return std::numeric_limits<float>::infinity();
+            case ArithmeticOp::mul_: return (cl_float)1;
+            default: log_error("Unknown operation request"); break;
+        }
+        return 0;
+    }
+};
+template <> struct TypeManager<cl_float2> : public CommonTypeManager<cl_float2>
+{
+    static const char *name() { return "float2"; }
+    static const char *add_typedef() { return "typedef float2 Type;\n"; }
+    using scalar_type = cl_float;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<subgroups::cl_float3>
+    : public CommonTypeManager<subgroups::cl_float3>
+{
+    static const char *name() { return "float3"; }
+    static const char *add_typedef() { return "typedef float3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_float;
+};
+template <> struct TypeManager<cl_float4> : public CommonTypeManager<cl_float4>
+{
+    static const char *name() { return "float4"; }
+    static const char *add_typedef() { return "typedef float4 Type;\n"; }
+    using scalar_type = cl_float;
+    typedef std::true_type is_vector_type;
+};
+template <> struct TypeManager<cl_float8> : public CommonTypeManager<cl_float8>
+{
+    static const char *name() { return "float8"; }
+    static const char *add_typedef() { return "typedef float8 Type;\n"; }
+    using scalar_type = cl_float;
+    typedef std::true_type is_vector_type;
+};
+template <>
+struct TypeManager<cl_float16> : public CommonTypeManager<cl_float16>
+{
+    static const char *name() { return "float16"; }
+    static const char *add_typedef() { return "typedef float16 Type;\n"; }
+    using scalar_type = cl_float;
+    typedef std::true_type is_vector_type;
+};
+
+// cl_double
+template <> struct TypeManager<cl_double> : public CommonTypeManager<cl_double>
+{
+    static const char *name() { return "double"; }
+    static const char *add_typedef() { return "typedef double Type;\n"; }
+    static cl_double identify_limits(ArithmeticOp operation)
+    {
+        switch (operation)
+        {
+            case ArithmeticOp::add_: return 0.0;
+            case ArithmeticOp::max_:
+                return -std::numeric_limits<double>::infinity();
+            case ArithmeticOp::min_:
+                return std::numeric_limits<double>::infinity();
+            case ArithmeticOp::mul_: return (cl_double)1;
+            default: log_error("Unknown operation request"); break;
+        }
+        return 0;
+    }
+    static const bool type_supported(cl_device_id device)
+    {
+        return double_ok(device);
+    }
+};
+template <>
+struct TypeManager<cl_double2> : public CommonTypeManager<cl_double2>
+{
+    static const char *name() { return "double2"; }
+    static const char *add_typedef() { return "typedef double2 Type;\n"; }
+    using scalar_type = cl_double;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return double_ok(device);
+    }
+};
+template <>
+struct TypeManager<subgroups::cl_double3>
+    : public CommonTypeManager<subgroups::cl_double3>
+{
+    static const char *name() { return "double3"; }
+    static const char *add_typedef() { return "typedef double3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = cl_double;
+    static const bool type_supported(cl_device_id device)
+    {
+        return double_ok(device);
+    }
+};
+template <>
+struct TypeManager<cl_double4> : public CommonTypeManager<cl_double4>
+{
+    static const char *name() { return "double4"; }
+    static const char *add_typedef() { return "typedef double4 Type;\n"; }
+    using scalar_type = cl_double;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return double_ok(device);
+    }
+};
+template <>
+struct TypeManager<cl_double8> : public CommonTypeManager<cl_double8>
+{
+    static const char *name() { return "double8"; }
+    static const char *add_typedef() { return "typedef double8 Type;\n"; }
+    using scalar_type = cl_double;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return double_ok(device);
+    }
+};
+template <>
+struct TypeManager<cl_double16> : public CommonTypeManager<cl_double16>
+{
+    static const char *name() { return "double16"; }
+    static const char *add_typedef() { return "typedef double16 Type;\n"; }
+    using scalar_type = cl_double;
+    typedef std::true_type is_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return double_ok(device);
+    }
+};
+
+// cl_half
+template <>
+struct TypeManager<subgroups::cl_half>
+    : public CommonTypeManager<subgroups::cl_half>
+{
+    static const char *name() { return "half"; }
+    static const char *add_typedef() { return "typedef half Type;\n"; }
+    typedef std::true_type is_sb_scalar_type;
+    static subgroups::cl_half identify_limits(ArithmeticOp operation)
+    {
+        switch (operation)
+        {
+            case ArithmeticOp::add_: return { 0x0000 };
+            case ArithmeticOp::max_: return { 0xfc00 };
+            case ArithmeticOp::min_: return { 0x7c00 };
+            case ArithmeticOp::mul_: return { 0x3c00 };
+            default: log_error("Unknown operation request"); break;
+        }
+        return { 0 };
+    }
+    static const bool type_supported(cl_device_id device)
+    {
+        return half_ok(device);
+    }
+};
+template <>
+struct TypeManager<subgroups::cl_half2>
+    : public CommonTypeManager<subgroups::cl_half2>
+{
+    static const char *name() { return "half2"; }
+    static const char *add_typedef() { return "typedef half2 Type;\n"; }
+    using scalar_type = subgroups::cl_half;
+    typedef std::true_type is_sb_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return half_ok(device);
+    }
+};
+template <>
+struct TypeManager<subgroups::cl_half3>
+    : public CommonTypeManager<subgroups::cl_half3>
+{
+    static const char *name() { return "half3"; }
+    static const char *add_typedef() { return "typedef half3 Type;\n"; }
+    typedef std::true_type is_sb_vector_size3;
+    using scalar_type = subgroups::cl_half;
+
+    static const bool type_supported(cl_device_id device)
+    {
+        return half_ok(device);
+    }
+};
+template <>
+struct TypeManager<subgroups::cl_half4>
+    : public CommonTypeManager<subgroups::cl_half4>
+{
+    static const char *name() { return "half4"; }
+    static const char *add_typedef() { return "typedef half4 Type;\n"; }
+    using scalar_type = subgroups::cl_half;
+    typedef std::true_type is_sb_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return half_ok(device);
+    }
+};
+template <>
+struct TypeManager<subgroups::cl_half8>
+    : public CommonTypeManager<subgroups::cl_half8>
+{
+    static const char *name() { return "half8"; }
+    static const char *add_typedef() { return "typedef half8 Type;\n"; }
+    using scalar_type = subgroups::cl_half;
+    typedef std::true_type is_sb_vector_type;
+
+    static const bool type_supported(cl_device_id device)
+    {
+        return half_ok(device);
+    }
+};
+template <>
+struct TypeManager<subgroups::cl_half16>
+    : public CommonTypeManager<subgroups::cl_half16>
+{
+    static const char *name() { return "half16"; }
+    static const char *add_typedef() { return "typedef half16 Type;\n"; }
+    using scalar_type = subgroups::cl_half;
+    typedef std::true_type is_sb_vector_type;
+    static const bool type_supported(cl_device_id device)
+    {
+        return half_ok(device);
+    }
+};
+
+// set scalar value to vector of halfs
+template <typename Ty, int N = 0>
+typename std::enable_if<TypeManager<Ty>::is_sb_vector_type::value>::type
+set_value(Ty &lhs, const cl_ulong &rhs)
+{
+    const int size = sizeof(Ty) / sizeof(typename TypeManager<Ty>::scalar_type);
+    for (auto i = 0; i < size; ++i)
+    {
+        lhs.data.s[i] = rhs;
+    }
+}
+
+
+// set scalar value to vector
+template <typename Ty>
+typename std::enable_if<TypeManager<Ty>::is_vector_type::value>::type
+set_value(Ty &lhs, const cl_ulong &rhs)
+{
+    const int size = sizeof(Ty) / sizeof(typename TypeManager<Ty>::scalar_type);
+    for (auto i = 0; i < size; ++i)
+    {
+        lhs.s[i] = rhs;
+    }
+}
+
+// set vector to vector value
+template <typename Ty>
+typename std::enable_if<TypeManager<Ty>::is_vector_type::value>::type
+set_value(Ty &lhs, const Ty &rhs)
+{
+    lhs = rhs;
+}
+
+// set scalar value to vector size 3
+template <typename Ty, int N = 0>
+typename std::enable_if<TypeManager<Ty>::is_sb_vector_size3::value>::type
+set_value(Ty &lhs, const cl_ulong &rhs)
+{
+    for (auto i = 0; i < 3; ++i)
+    {
+        lhs.data.s[i] = rhs;
+    }
+}
+
+// set scalar value to scalar
+template <typename Ty>
+typename std::enable_if<std::is_scalar<Ty>::value>::type
+set_value(Ty &lhs, const cl_ulong &rhs)
+{
+    lhs = static_cast<Ty>(rhs);
+}
+
+// set scalar value to half scalar
+template <typename Ty>
+typename std::enable_if<TypeManager<Ty>::is_sb_scalar_type::value>::type
+set_value(Ty &lhs, const cl_ulong &rhs)
+{
+    lhs.data = rhs;
+}
+
+// compare for common vectors
+template <typename Ty>
+typename std::enable_if<TypeManager<Ty>::is_vector_type::value, bool>::type
+compare(const Ty &lhs, const Ty &rhs)
+{
+    const int size = sizeof(Ty) / sizeof(typename TypeManager<Ty>::scalar_type);
+    for (auto i = 0; i < size; ++i)
+    {
+        if (lhs.s[i] != rhs.s[i])
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
+// compare for vectors 3
+template <typename Ty>
+typename std::enable_if<TypeManager<Ty>::is_sb_vector_size3::value, bool>::type
+compare(const Ty &lhs, const Ty &rhs)
+{
+    for (auto i = 0; i < 3; ++i)
+    {
+        if (lhs.data.s[i] != rhs.data.s[i])
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
+// compare for half vectors
+template <typename Ty>
+typename std::enable_if<TypeManager<Ty>::is_sb_vector_type::value, bool>::type
+compare(const Ty &lhs, const Ty &rhs)
+{
+    const int size = sizeof(Ty) / sizeof(typename TypeManager<Ty>::scalar_type);
+    for (auto i = 0; i < size; ++i)
+    {
+        if (lhs.data.s[i] != rhs.data.s[i])
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
+// compare for scalars
+template <typename Ty>
+typename std::enable_if<std::is_scalar<Ty>::value, bool>::type
+compare(const Ty &lhs, const Ty &rhs)
+{
+    return lhs == rhs;
+}
+
+// compare for scalar halfs
+template <typename Ty>
+typename std::enable_if<TypeManager<Ty>::is_sb_scalar_type::value, bool>::type
+compare(const Ty &lhs, const Ty &rhs)
+{
+    return lhs.data == rhs.data;
+}
+
+template <typename Ty> inline bool compare_ordered(const Ty &lhs, const Ty &rhs)
+{
+    return lhs == rhs;
+}
+
+template <>
+inline bool compare_ordered(const subgroups::cl_half &lhs,
+                            const subgroups::cl_half &rhs)
+{
+    return cl_half_to_float(lhs.data) == cl_half_to_float(rhs.data);
+}
+
+template <typename Ty>
+inline bool compare_ordered(const subgroups::cl_half &lhs, const int &rhs)
+{
+    return cl_half_to_float(lhs.data) == rhs;
+}
 
 // Run a test kernel to compute the result of a built-in on an input
 static int run_kernel(cl_context context, cl_command_queue queue,
@@ -318,6 +1211,9 @@
                                  NULL);
     test_error(error, "clEnqueueWriteBuffer failed");
 
+    error = clEnqueueWriteBuffer(queue, xy, CL_FALSE, 0, msize, mdata, 0, NULL,
+                                 NULL);
+    test_error(error, "clEnqueueWriteBuffer failed");
     error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0,
                                    NULL, NULL);
     test_error(error, "clEnqueueNDRangeKernel failed");
@@ -337,42 +1233,93 @@
 }
 
 // Driver for testing a single built in function
-template <typename Ty, typename Fns, size_t GSIZE, size_t LSIZE,
-          size_t TSIZE = 0>
-struct test
+template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
 {
+    static int mrun(cl_device_id device, cl_context context,
+                    cl_command_queue queue, int num_elements, const char *kname,
+                    const char *src, WorkGroupParams test_params)
+    {
+        int error = TEST_PASS;
+        for (auto &mask : test_params.all_work_item_masks)
+        {
+            test_params.work_items_mask = mask;
+            error |= run(device, context, queue, num_elements, kname, src,
+                         test_params);
+        }
+        return error;
+    };
     static int run(cl_device_id device, cl_context context,
                    cl_command_queue queue, int num_elements, const char *kname,
-                   const char *src, int dynscl, bool useCoreSubgroups)
+                   const char *src, WorkGroupParams test_params)
     {
         size_t tmp;
         int error;
         int subgroup_size, num_subgroups;
         size_t realSize;
-        size_t global;
-        size_t local;
+        size_t global = test_params.global_workgroup_size;
+        size_t local = test_params.local_workgroup_size;
         clProgramWrapper program;
         clKernelWrapper kernel;
         cl_platform_id platform;
-        cl_int sgmap[2 * GSIZE];
-        Ty mapin[LSIZE];
-        Ty mapout[LSIZE];
+        std::vector<cl_int> sgmap;
+        sgmap.resize(4 * global);
+        std::vector<Ty> mapin;
+        mapin.resize(local);
+        std::vector<Ty> mapout;
+        mapout.resize(local);
+        std::stringstream kernel_sstr;
+        if (test_params.work_items_mask != 0)
+        {
+            kernel_sstr << "#define WORK_ITEMS_MASK ";
+            kernel_sstr << "0x" << std::hex << test_params.work_items_mask
+                        << "\n";
+        }
 
+
+        kernel_sstr << "#define NR_OF_ACTIVE_WORK_ITEMS ";
+        kernel_sstr << NR_OF_ACTIVE_WORK_ITEMS << "\n";
         // Make sure a test of type Ty is supported by the device
-        if (!TypeCheck<Ty>::val(device)) return 0;
+        if (!TypeManager<Ty>::type_supported(device))
+        {
+            log_info("Data type not supported : %s\n", TypeManager<Ty>::name());
+            return 0;
+        }
+        else
+        {
+            if (strstr(TypeManager<Ty>::name(), "double"))
+            {
+                kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n";
+            }
+            else if (strstr(TypeManager<Ty>::name(), "half"))
+            {
+                kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp16: enable\n";
+            }
+        }
+
+        for (std::string extension : test_params.required_extensions)
+        {
+            if (!is_extension_available(device, extension.c_str()))
+            {
+                log_info("The extension %s not supported on this device. SKIP "
+                         "testing - kernel %s data type %s\n",
+                         extension.c_str(), kname, TypeManager<Ty>::name());
+                return TEST_PASS;
+            }
+            kernel_sstr << "#pragma OPENCL EXTENSION " + extension
+                    + ": enable\n";
+        }
 
         error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform),
                                 (void *)&platform, NULL);
         test_error(error, "clGetDeviceInfo failed for CL_DEVICE_PLATFORM");
-        std::stringstream kernel_sstr;
-        if (useCoreSubgroups)
+        if (test_params.use_core_subgroups)
         {
             kernel_sstr
                 << "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n";
         }
         kernel_sstr << "#define XY(M,I) M[I].x = get_sub_group_local_id(); "
                        "M[I].y = get_sub_group_id();\n";
-        kernel_sstr << TypeDef<Ty>::val();
+        kernel_sstr << TypeManager<Ty>::add_typedef();
         kernel_sstr << src;
         const std::string &kernel_str = kernel_sstr.str();
         const char *kernel_src = kernel_str.c_str();
@@ -382,16 +1329,18 @@
         if (error != 0) return error;
 
         // Determine some local dimensions to use for the test.
-        global = GSIZE;
-        error = get_max_common_work_group_size(context, kernel, GSIZE, &local);
+        error = get_max_common_work_group_size(
+            context, kernel, test_params.global_workgroup_size, &local);
         test_error(error, "get_max_common_work_group_size failed");
 
         // Limit it a bit so we have muliple work groups
-        // Ideally this will still be large enough to give us multiple subgroups
-        if (local > LSIZE) local = LSIZE;
+        // Ideally this will still be large enough to give us multiple
+        if (local > test_params.local_workgroup_size)
+            local = test_params.local_workgroup_size;
+
 
         // Get the sub group info
-        subgroupsAPI subgroupsApiSet(platform, useCoreSubgroups);
+        subgroupsAPI subgroupsApiSet(platform, test_params.use_core_subgroups);
         clGetKernelSubGroupInfoKHR_fn clGetKernelSubGroupInfo_ptr =
             subgroupsApiSet.clGetKernelSubGroupInfo_ptr();
         if (clGetKernelSubGroupInfo_ptr == NULL)
@@ -435,8 +1384,9 @@
 
         std::vector<Ty> idata;
         std::vector<Ty> odata;
-        size_t input_array_size = GSIZE;
-        size_t output_array_size = GSIZE;
+        size_t input_array_size = global;
+        size_t output_array_size = global;
+        int dynscl = test_params.dynsc;
 
         if (dynscl != 0)
         {
@@ -449,28 +1399,96 @@
         odata.resize(output_array_size);
 
         // Run the kernel once on zeroes to get the map
-        memset(&idata[0], 0, input_array_size * sizeof(Ty));
-        error = run_kernel(context, queue, kernel, global, local, &idata[0],
-                           input_array_size * sizeof(Ty), sgmap,
-                           global * sizeof(cl_int) * 2, &odata[0],
+        memset(idata.data(), 0, input_array_size * sizeof(Ty));
+        error = run_kernel(context, queue, kernel, global, local, idata.data(),
+                           input_array_size * sizeof(Ty), sgmap.data(),
+                           global * sizeof(cl_int4), odata.data(),
                            output_array_size * sizeof(Ty), TSIZE * sizeof(Ty));
-        if (error) return error;
+        test_error(error, "Running kernel first time failed");
 
         // Generate the desired input for the kernel
-        Fns::gen(&idata[0], mapin, sgmap, subgroup_size, (int)local,
-                 (int)global / (int)local);
 
-        error = run_kernel(context, queue, kernel, global, local, &idata[0],
-                           input_array_size * sizeof(Ty), sgmap,
-                           global * sizeof(cl_int) * 2, &odata[0],
+        test_params.subgroup_size = subgroup_size;
+        Fns::gen(idata.data(), mapin.data(), sgmap.data(), test_params);
+        error = run_kernel(context, queue, kernel, global, local, idata.data(),
+                           input_array_size * sizeof(Ty), sgmap.data(),
+                           global * sizeof(cl_int4), odata.data(),
                            output_array_size * sizeof(Ty), TSIZE * sizeof(Ty));
-        if (error) return error;
-
+        test_error(error, "Running kernel second time failed");
 
         // Check the result
-        return Fns::chk(&idata[0], &odata[0], mapin, mapout, sgmap,
-                        subgroup_size, (int)local, (int)global / (int)local);
+        error = Fns::chk(idata.data(), odata.data(), mapin.data(),
+                         mapout.data(), sgmap.data(), test_params);
+        test_error(error, "Data verification failed");
+        return TEST_PASS;
     }
 };
 
+static void set_last_workgroup_params(int non_uniform_size,
+                                      int &number_of_subgroups,
+                                      int subgroup_size, int &workgroup_size,
+                                      int &last_subgroup_size)
+{
+    number_of_subgroups = 1 + non_uniform_size / subgroup_size;
+    last_subgroup_size = non_uniform_size % subgroup_size;
+    workgroup_size = non_uniform_size;
+}
+
+template <typename Ty>
+static void set_randomdata_for_subgroup(Ty *workgroup, int wg_offset,
+                                        int current_sbs)
+{
+    int randomize_data = (int)(genrand_int32(gMTdata) % 3);
+    // Initialize data matrix indexed by local id and sub group id
+    switch (randomize_data)
+    {
+        case 0:
+            memset(&workgroup[wg_offset], 0, current_sbs * sizeof(Ty));
+            break;
+        case 1: {
+            memset(&workgroup[wg_offset], 0, current_sbs * sizeof(Ty));
+            int wi_id = (int)(genrand_int32(gMTdata) % (cl_uint)current_sbs);
+            set_value(workgroup[wg_offset + wi_id], 41);
+        }
+        break;
+        case 2:
+            memset(&workgroup[wg_offset], 0xff, current_sbs * sizeof(Ty));
+            break;
+    }
+}
+
+struct RunTestForType
+{
+    RunTestForType(cl_device_id device, cl_context context,
+                   cl_command_queue queue, int num_elements,
+                   WorkGroupParams test_params)
+        : device_(device), context_(context), queue_(queue),
+          num_elements_(num_elements), test_params_(test_params)
+    {}
+    template <typename T, typename U>
+    int run_impl(const char *kernel_name, const char *source)
+    {
+        int error = TEST_PASS;
+        if (test_params_.all_work_item_masks.size() > 0)
+        {
+            error = test<T, U>::mrun(device_, context_, queue_, num_elements_,
+                                     kernel_name, source, test_params_);
+        }
+        else
+        {
+            error = test<T, U>::run(device_, context_, queue_, num_elements_,
+                                    kernel_name, source, test_params_);
+        }
+
+        return error;
+    }
+
+private:
+    cl_device_id device_;
+    cl_context context_;
+    cl_command_queue queue_;
+    int num_elements_;
+    WorkGroupParams test_params_;
+};
+
 #endif
diff --git a/test_conformance/subgroups/test_barrier.cpp b/test_conformance/subgroups/test_barrier.cpp
index e6ce1d2..47e42f6 100644
--- a/test_conformance/subgroups/test_barrier.cpp
+++ b/test_conformance/subgroups/test_barrier.cpp
@@ -59,10 +59,15 @@
 // barrier test functions
 template <int Which> struct BAR
 {
-    static void gen(cl_int *x, cl_int *t, cl_int *m, int ns, int nw, int ng)
+    static void gen(cl_int *x, cl_int *t, cl_int *m,
+                    const WorkGroupParams &test_params)
     {
         int i, ii, j, k, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
         int nj = (nw + ns - 1) / ns;
+        ng = ng / nw;
         int e;
 
         ii = 0;
@@ -79,8 +84,7 @@
             // Now map into work group using map from device
             for (j = 0; j < nw; ++j)
             {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                x[j] = t[i];
+                x[j] = t[j];
             }
 
             x += nw;
@@ -89,10 +93,14 @@
     }
 
     static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
-                   int ns, int nw, int ng)
+                   const WorkGroupParams &test_params)
     {
         int ii, i, j, k, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
         int nj = (nw + ns - 1) / ns;
+        ng = ng / nw;
         cl_int tr, rr;
 
         if (Which == 0)
@@ -105,9 +113,8 @@
             // Map to array indexed to array indexed by local ID and sub group
             for (j = 0; j < nw; ++j)
             {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                mx[i] = x[j];
-                my[i] = y[j];
+                mx[j] = x[j];
+                my[j] = y[j];
             }
 
             for (j = 0; j < nj; ++j)
@@ -123,8 +130,9 @@
                     if (tr != rr)
                     {
                         log_error("ERROR: sub_group_barrier mismatch for local "
-                                  "id %d in sub group %d in group %d\n",
-                                  i, j, k);
+                                  "id %d in sub group %d in group %d expected "
+                                  "%d got %d\n",
+                                  i, j, k, tr, rr);
                         return -1;
                     }
                 }
@@ -144,18 +152,18 @@
                            cl_command_queue queue, int num_elements,
                            bool useCoreSubgroups)
 {
-    int error;
+    int error = TEST_PASS;
 
     // Adjust these individually below if desired/needed
-#define G 2000
-#define L 200
-
-    error = test<cl_int, BAR<0>, G, L>::run(device, context, queue,
-                                            num_elements, "test_lbar",
-                                            lbar_source, 0, useCoreSubgroups);
-    error = test<cl_int, BAR<1>, G, L, G>::run(
-        device, context, queue, num_elements, "test_gbar", gbar_source, 0,
-        useCoreSubgroups);
+    constexpr size_t global_work_size = 2000;
+    constexpr size_t local_work_size = 200;
+    WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.use_core_subgroups = useCoreSubgroups;
+    error = test<cl_int, BAR<0>>::run(device, context, queue, num_elements,
+                                      "test_lbar", lbar_source, test_params);
+    error |= test<cl_int, BAR<1>, global_work_size>::run(
+        device, context, queue, num_elements, "test_gbar", gbar_source,
+        test_params);
 
     return error;
 }
diff --git a/test_conformance/subgroups/test_ifp.cpp b/test_conformance/subgroups/test_ifp.cpp
index 02850e5..428f2cd 100644
--- a/test_conformance/subgroups/test_ifp.cpp
+++ b/test_conformance/subgroups/test_ifp.cpp
@@ -46,7 +46,7 @@
     "#define INST_COUNT 0x3\n"
     "\n"
     "__kernel void\n"
-    "test_ifp(const __global int *in, __global int2 *xy, __global int *out)\n"
+    "test_ifp(const __global int *in, __global int4 *xy, __global int *out)\n"
     "{\n"
     "    __local atomic_int loc[NUM_LOC];\n"
     "\n"
@@ -225,10 +225,15 @@
 
 struct IFP
 {
-    static void gen(cl_int *x, cl_int *t, cl_int *, int ns, int nw, int ng)
+    static void gen(cl_int *x, cl_int *t, cl_int *,
+                    const WorkGroupParams &test_params)
     {
         int k;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
         int nj = (nw + ns - 1) / ns;
+        ng = ng / nw;
 
         // We need at least 2 sub groups per group for this test
         if (nj == 1) return;
@@ -240,11 +245,15 @@
         }
     }
 
-    static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *, int ns,
-                   int nw, int ng)
+    static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *,
+                   const WorkGroupParams &test_params)
     {
         int i, k;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
         int nj = (nw + ns - 1) / ns;
+        ng = ng / nw;
 
         // We need at least 2 sub groups per group for this tes
         if (nj == 1) return 0;
@@ -275,14 +284,17 @@
 int test_ifp(cl_device_id device, cl_context context, cl_command_queue queue,
              int num_elements, bool useCoreSubgroups)
 {
-    int error;
+    int error = TEST_PASS;
 
+    // Global/local work group sizes
     // Adjust these individually below if desired/needed
-#define G 2000
-#define L 200
-    error = test<cl_int, IFP, G, L>::run(device, context, queue, num_elements,
-                                         "test_ifp", ifp_source, NUM_LOC + 1,
-                                         useCoreSubgroups);
+    constexpr size_t global_work_size = 2000;
+    constexpr size_t local_work_size = 200;
+    WorkGroupParams test_params(global_work_size, local_work_size);
+    test_params.use_core_subgroups = useCoreSubgroups;
+    test_params.dynsc = NUM_LOC + 1;
+    error = test<cl_int, IFP>::run(device, context, queue, num_elements,
+                                   "test_ifp", ifp_source, test_params);
     return error;
 }
 
diff --git a/test_conformance/subgroups/test_subgroup.cpp b/test_conformance/subgroups/test_subgroup.cpp
new file mode 100644
index 0000000..c0e4952
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup.cpp
@@ -0,0 +1,217 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "subgroup_common_kernels.h"
+#include "subgroup_common_templates.h"
+#include "harness/conversions.h"
+#include "harness/typeWrappers.h"
+
+namespace {
+// Any/All test functions
+template <NonUniformVoteOp operation> struct AA
+{
+    static void gen(cl_int *x, cl_int *t, cl_int *m,
+                    const WorkGroupParams &test_params)
+    {
+        int i, ii, j, k, n;
+        int ng = test_params.global_workgroup_size;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int nj = (nw + ns - 1) / ns;
+        int e;
+        ng = ng / nw;
+        ii = 0;
+        log_info("  sub_group_%s...\n", operation_names(operation));
+        for (k = 0; k < ng; ++k)
+        {
+            for (j = 0; j < nj; ++j)
+            {
+                ii = j * ns;
+                n = ii + ns > nw ? nw - ii : ns;
+                e = (int)(genrand_int32(gMTdata) % 3);
+
+                // Initialize data matrix indexed by local id and sub group id
+                switch (e)
+                {
+                    case 0: memset(&t[ii], 0, n * sizeof(cl_int)); break;
+                    case 1:
+                        memset(&t[ii], 0, n * sizeof(cl_int));
+                        i = (int)(genrand_int32(gMTdata) % (cl_uint)n);
+                        t[ii + i] = 41;
+                        break;
+                    case 2: memset(&t[ii], 0xff, n * sizeof(cl_int)); break;
+                }
+            }
+
+            // Now map into work group using map from device
+            for (j = 0; j < nw; ++j)
+            {
+                x[j] = t[j];
+            }
+
+            x += nw;
+            m += 4 * nw;
+        }
+    }
+
+    static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int ii, i, j, k, n;
+        int ng = test_params.global_workgroup_size;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int nj = (nw + ns - 1) / ns;
+        cl_int taa, raa;
+        ng = ng / nw;
+
+        for (k = 0; k < ng; ++k)
+        {
+            // Map to array indexed to array indexed by local ID and sub group
+            for (j = 0; j < nw; ++j)
+            {
+                mx[j] = x[j];
+                my[j] = y[j];
+            }
+
+            for (j = 0; j < nj; ++j)
+            {
+                ii = j * ns;
+                n = ii + ns > nw ? nw - ii : ns;
+
+                // Compute target
+                if (operation == NonUniformVoteOp::any)
+                {
+                    taa = 0;
+                    for (i = 0; i < n; ++i) taa |= mx[ii + i] != 0;
+                }
+
+                if (operation == NonUniformVoteOp::all)
+                {
+                    taa = 1;
+                    for (i = 0; i < n; ++i) taa &= mx[ii + i] != 0;
+                }
+
+                // Check result
+                for (i = 0; i < n; ++i)
+                {
+                    raa = my[ii + i] != 0;
+                    if (raa != taa)
+                    {
+                        log_error("ERROR: sub_group_%s mismatch for local id "
+                                  "%d in sub group %d in group %d\n",
+                                  operation_names(operation), i, j, k);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+        log_info("  sub_group_%s... passed\n", operation_names(operation));
+        return TEST_PASS;
+    }
+};
+
+static const char *any_source = "__kernel void test_any(const __global Type "
+                                "*in, __global int4 *xy, __global Type *out)\n"
+                                "{\n"
+                                "    int gid = get_global_id(0);\n"
+                                "    XY(xy,gid);\n"
+                                "    out[gid] = sub_group_any(in[gid]);\n"
+                                "}\n";
+
+static const char *all_source = "__kernel void test_all(const __global Type "
+                                "*in, __global int4 *xy, __global Type *out)\n"
+                                "{\n"
+                                "    int gid = get_global_id(0);\n"
+                                "    XY(xy,gid);\n"
+                                "    out[gid] = sub_group_all(in[gid]);\n"
+                                "}\n";
+
+
+template <typename T>
+int run_broadcast_scan_reduction_for_type(RunTestForType rft)
+{
+    int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
+        "test_bcast", bcast_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
+                                                            redadd_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
+                                                            redmax_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
+                                                            redmin_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
+                                                             scinadd_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
+                                                             scinmax_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
+                                                             scinmin_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
+                                                             scexadd_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
+                                                             scexmax_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
+                                                             scexmin_source);
+    return error;
+}
+
+}
+// Entry point from main
+int test_subgroup_functions(cl_device_id device, cl_context context,
+                            cl_command_queue queue, int num_elements,
+                            bool useCoreSubgroups)
+{
+    constexpr size_t global_work_size = 2000;
+    constexpr size_t local_work_size = 200;
+    WorkGroupParams test_params(global_work_size, local_work_size);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+    int error =
+        rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("test_any", any_source);
+    error |=
+        rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("test_all", all_source);
+    error |= run_broadcast_scan_reduction_for_type<cl_int>(rft);
+    error |= run_broadcast_scan_reduction_for_type<cl_uint>(rft);
+    error |= run_broadcast_scan_reduction_for_type<cl_long>(rft);
+    error |= run_broadcast_scan_reduction_for_type<cl_ulong>(rft);
+    error |= run_broadcast_scan_reduction_for_type<cl_float>(rft);
+    error |= run_broadcast_scan_reduction_for_type<cl_double>(rft);
+    error |= run_broadcast_scan_reduction_for_type<subgroups::cl_half>(rft);
+    return error;
+}
+
+int test_subgroup_functions_core(cl_device_id device, cl_context context,
+                                 cl_command_queue queue, int num_elements)
+{
+    return test_subgroup_functions(device, context, queue, num_elements, true);
+}
+
+int test_subgroup_functions_ext(cl_device_id device, cl_context context,
+                                cl_command_queue queue, int num_elements)
+{
+    bool hasExtension = is_extension_available(device, "cl_khr_subgroups");
+
+    if (!hasExtension)
+    {
+        log_info(
+            "Device does not support 'cl_khr_subgroups'. Skipping the test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
+    return test_subgroup_functions(device, context, queue, num_elements, false);
+}
diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp
new file mode 100644
index 0000000..f2e4060
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_ballot.cpp
@@ -0,0 +1,1089 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "subgroup_common_templates.h"
+#include "harness/typeWrappers.h"
+#include <bitset>
+
+namespace {
+// Test for ballot functions
+template <typename Ty> struct BALLOT
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        // no work here
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int non_uniform_size = gws % lws;
+        log_info("  sub_group_ballot...\n");
+        if (non_uniform_size)
+        {
+            log_info("  non uniform work group size mode ON\n");
+        }
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int wi_id, wg_id, sb_id;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        int current_sbs = 0;
+        cl_uint expected_result, device_result;
+        int non_uniform_size = gws % lws;
+        int wg_number = gws / lws;
+        wg_number = non_uniform_size ? wg_number + 1 : wg_number;
+        int last_subgroup_size = 0;
+
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            if (non_uniform_size && wg_id == wg_number - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws,
+                                          last_subgroup_size);
+            }
+
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            { // inside the work_group
+                // read device outputs for work_group
+                my[wi_id] = y[wi_id];
+            }
+
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                if (last_subgroup_size && sb_id == sb_number - 1)
+                {
+                    current_sbs = last_subgroup_size;
+                }
+                else
+                {
+                    current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                }
+                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                {
+                    device_result = my[wg_offset + wi_id];
+                    expected_result = 1;
+                    if (!compare(device_result, expected_result))
+                    {
+                        log_error(
+                            "ERROR: sub_group_ballot mismatch for local id "
+                            "%d in sub group %d in group %d obtained {%d}, "
+                            "expected {%d} \n",
+                            wi_id, sb_id, wg_id, device_result,
+                            expected_result);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+            y += lws;
+            m += 4 * lws;
+        }
+        log_info("  sub_group_ballot... passed\n");
+        return TEST_PASS;
+    }
+};
+
+// Test for bit extract ballot functions
+template <typename Ty, BallotOp operation> struct BALLOT_BIT_EXTRACT
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int wi_id, sb_id, wg_id, l;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        int wg_number = gws / lws;
+        int limit_sbs = sbs > 100 ? 100 : sbs;
+        int non_uniform_size = gws % lws;
+        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
+                 TypeManager<Ty>::name());
+
+        if (non_uniform_size)
+        {
+            log_info("  non uniform work group size mode ON\n");
+        }
+
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                int current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                // rand index to bit extract
+                int index_for_odd = (int)(genrand_int32(gMTdata) & 0x7fffffff)
+                    % (limit_sbs > current_sbs ? current_sbs : limit_sbs);
+                int index_for_even = (int)(genrand_int32(gMTdata) & 0x7fffffff)
+                    % (limit_sbs > current_sbs ? current_sbs : limit_sbs);
+                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                {
+                    // index of the third element int the vector.
+                    int midx = 4 * wg_offset + 4 * wi_id + 2;
+                    // storing information about index to bit extract
+                    m[midx] = (cl_int)index_for_odd;
+                    m[++midx] = (cl_int)index_for_even;
+                }
+                set_randomdata_for_subgroup<Ty>(t, wg_offset, current_sbs);
+            }
+
+            // Now map into work group using map from device
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            {
+                x[wi_id] = t[wi_id];
+            }
+
+            x += lws;
+            m += 4 * lws;
+        }
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int wi_id, wg_id, l, sb_id;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        int wg_number = gws / lws;
+        cl_uint4 expected_result, device_result;
+        int last_subgroup_size = 0;
+        int current_sbs = 0;
+        int non_uniform_size = gws % lws;
+
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            if (non_uniform_size && wg_id == wg_number - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws,
+                                          last_subgroup_size);
+            }
+            // Map to array indexed to array indexed by local ID and sub group
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            { // inside the work_group
+                // read host inputs for work_group
+                mx[wi_id] = x[wi_id];
+                // read device outputs for work_group
+                my[wi_id] = y[wi_id];
+            }
+
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                if (last_subgroup_size && sb_id == sb_number - 1)
+                {
+                    current_sbs = last_subgroup_size;
+                }
+                else
+                {
+                    current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                }
+                // take index of array where info which work_item will
+                // be broadcast its value is stored
+                int midx = 4 * wg_offset + 2;
+                // take subgroup local id of this work_item
+                int index_for_odd = (int)m[midx];
+                int index_for_even = (int)m[++midx];
+
+                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                { // for each subgroup
+                    int bit_value = 0;
+                    // from which value of bitfield bit
+                    // verification will be done
+                    int take_shift =
+                        (wi_id & 1) ? index_for_odd % 32 : index_for_even % 32;
+                    int bit_mask = 1 << take_shift;
+
+                    if (wi_id < 32)
+                        (mx[wg_offset + wi_id].s0 & bit_mask) > 0
+                            ? bit_value = 1
+                            : bit_value = 0;
+                    if (wi_id >= 32 && wi_id < 64)
+                        (mx[wg_offset + wi_id].s1 & bit_mask) > 0
+                            ? bit_value = 1
+                            : bit_value = 0;
+                    if (wi_id >= 64 && wi_id < 96)
+                        (mx[wg_offset + wi_id].s2 & bit_mask) > 0
+                            ? bit_value = 1
+                            : bit_value = 0;
+                    if (wi_id >= 96 && wi_id < 128)
+                        (mx[wg_offset + wi_id].s3 & bit_mask) > 0
+                            ? bit_value = 1
+                            : bit_value = 0;
+
+                    if (wi_id & 1)
+                    {
+                        bit_value ? expected_result = { 1, 0, 0, 1 }
+                                  : expected_result = { 0, 0, 0, 1 };
+                    }
+                    else
+                    {
+                        bit_value ? expected_result = { 1, 0, 0, 2 }
+                                  : expected_result = { 0, 0, 0, 2 };
+                    }
+
+                    device_result = my[wg_offset + wi_id];
+                    if (!compare(device_result, expected_result))
+                    {
+                        log_error(
+                            "ERROR: sub_group_%s mismatch for local id %d in "
+                            "sub group %d in group %d obtained {%d, %d, %d, "
+                            "%d}, expected {%d, %d, %d, %d}\n",
+                            operation_names(operation), wi_id, sb_id, wg_id,
+                            device_result.s0, device_result.s1,
+                            device_result.s2, device_result.s3,
+                            expected_result.s0, expected_result.s1,
+                            expected_result.s2, expected_result.s3);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+            x += lws;
+            y += lws;
+            m += 4 * lws;
+        }
+        log_info("  sub_group_%s(%s)... passed\n", operation_names(operation),
+                 TypeManager<Ty>::name());
+        return TEST_PASS;
+    }
+};
+
+template <typename Ty, BallotOp operation> struct BALLOT_INVERSE
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int non_uniform_size = gws % lws;
+        log_info("  sub_group_inverse_ballot...\n");
+        if (non_uniform_size)
+        {
+            log_info("  non uniform work group size mode ON\n");
+        }
+        // no work here
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int wi_id, wg_id, sb_id;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        cl_uint4 expected_result, device_result;
+        int non_uniform_size = gws % lws;
+        int wg_number = gws / lws;
+        int last_subgroup_size = 0;
+        int current_sbs = 0;
+        if (non_uniform_size) wg_number++;
+
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            if (non_uniform_size && wg_id == wg_number - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws,
+                                          last_subgroup_size);
+            }
+            // Map to array indexed to array indexed by local ID and sub group
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            { // inside the work_group
+                mx[wi_id] = x[wi_id]; // read host inputs for work_group
+                my[wi_id] = y[wi_id]; // read device outputs for work_group
+            }
+
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                if (last_subgroup_size && sb_id == sb_number - 1)
+                {
+                    current_sbs = last_subgroup_size;
+                }
+                else
+                {
+                    current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                }
+                // take index of array where info which work_item will
+                // be broadcast its value is stored
+                int midx = 4 * wg_offset + 2;
+                // take subgroup local id of this work_item
+                // Check result
+                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                { // for each subgroup work item
+
+                    wi_id & 1 ? expected_result = { 1, 0, 0, 1 }
+                              : expected_result = { 1, 0, 0, 2 };
+
+                    device_result = my[wg_offset + wi_id];
+                    if (!compare(device_result, expected_result))
+                    {
+                        log_error(
+                            "ERROR: sub_group_%s mismatch for local id %d in "
+                            "sub group %d in group %d obtained {%d, %d, %d, "
+                            "%d}, expected {%d, %d, %d, %d}\n",
+                            operation_names(operation), wi_id, sb_id, wg_id,
+                            device_result.s0, device_result.s1,
+                            device_result.s2, device_result.s3,
+                            expected_result.s0, expected_result.s1,
+                            expected_result.s2, expected_result.s3);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+            x += lws;
+            y += lws;
+            m += 4 * lws;
+        }
+
+        log_info("  sub_group_inverse_ballot... passed\n");
+        return TEST_PASS;
+    }
+};
+
+
+// Test for bit count/inclusive and exclusive scan/ find lsb msb ballot function
+template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int wi_id, wg_id, sb_id;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        int non_uniform_size = gws % lws;
+        int wg_number = gws / lws;
+        int last_subgroup_size = 0;
+        int current_sbs = 0;
+
+        log_info("  sub_group_%s(%s)...\n", operation_names(operation),
+                 TypeManager<Ty>::name());
+        if (non_uniform_size)
+        {
+            log_info("  non uniform work group size mode ON\n");
+            wg_number++;
+        }
+        int e;
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            if (non_uniform_size && wg_id == wg_number - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws,
+                                          last_subgroup_size);
+            }
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                if (last_subgroup_size && sb_id == sb_number - 1)
+                {
+                    current_sbs = last_subgroup_size;
+                }
+                else
+                {
+                    current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                }
+                if (operation == BallotOp::ballot_bit_count
+                    || operation == BallotOp::ballot_inclusive_scan
+                    || operation == BallotOp::ballot_exclusive_scan)
+                {
+                    set_randomdata_for_subgroup<Ty>(t, wg_offset, current_sbs);
+                }
+                else if (operation == BallotOp::ballot_find_lsb
+                         || operation == BallotOp::ballot_find_msb)
+                {
+                    // Regarding to the spec, find lsb and find msb result is
+                    // undefined behavior if input value is zero, so generate
+                    // only non-zero values.
+                    for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                    {
+                        char x = (genrand_int32(gMTdata)) & 0xff;
+                        // undefined behaviour in case of 0;
+                        x = x ? x : 1;
+                        memset(&t[wg_offset + wi_id], x, sizeof(Ty));
+                    }
+                }
+                else
+                {
+                    log_error("Unknown operation...");
+                }
+            }
+
+            // Now map into work group using map from device
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            {
+                x[wi_id] = t[wi_id];
+            }
+
+            x += lws;
+            m += 4 * lws;
+        }
+    }
+
+    static bs128 getImportantBits(cl_uint sub_group_local_id,
+                                  cl_uint sub_group_size)
+    {
+        bs128 mask;
+        if (operation == BallotOp::ballot_bit_count
+            || operation == BallotOp::ballot_find_lsb
+            || operation == BallotOp::ballot_find_msb)
+        {
+            for (cl_uint i = 0; i < sub_group_size; ++i) mask.set(i);
+        }
+        else if (operation == BallotOp::ballot_inclusive_scan
+                 || operation == BallotOp::ballot_exclusive_scan)
+        {
+            for (cl_uint i = 0; i <= sub_group_local_id; ++i) mask.set(i);
+            if (operation == BallotOp::ballot_exclusive_scan)
+                mask.reset(sub_group_local_id);
+        }
+        return mask;
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int wi_id, wg_id, sb_id;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        int non_uniform_size = gws % lws;
+        int wg_number = gws / lws;
+        wg_number = non_uniform_size ? wg_number + 1 : wg_number;
+        cl_uint4 expected_result, device_result;
+        int last_subgroup_size = 0;
+        int current_sbs = 0;
+
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            if (non_uniform_size && wg_id == wg_number - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws,
+                                          last_subgroup_size);
+            }
+            // Map to array indexed to array indexed by local ID and sub group
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            { // inside the work_group
+                // read host inputs for work_group
+                mx[wi_id] = x[wi_id];
+                // read device outputs for work_group
+                my[wi_id] = y[wi_id];
+            }
+
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                if (last_subgroup_size && sb_id == sb_number - 1)
+                {
+                    current_sbs = last_subgroup_size;
+                }
+                else
+                {
+                    current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                }
+                // Check result
+                expected_result = { 0, 0, 0, 0 };
+                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                { // for subgroup element
+                    bs128 bs;
+                    // convert cl_uint4 input into std::bitset<128>
+                    bs |= bs128(mx[wg_offset + wi_id].s0)
+                        | (bs128(mx[wg_offset + wi_id].s1) << 32)
+                        | (bs128(mx[wg_offset + wi_id].s2) << 64)
+                        | (bs128(mx[wg_offset + wi_id].s3) << 96);
+                    bs &= getImportantBits(wi_id, current_sbs);
+                    device_result = my[wg_offset + wi_id];
+                    if (operation == BallotOp::ballot_inclusive_scan
+                        || operation == BallotOp::ballot_exclusive_scan
+                        || operation == BallotOp::ballot_bit_count)
+                    {
+                        expected_result.s0 = bs.count();
+                        if (!compare(device_result, expected_result))
+                        {
+                            log_error("ERROR: sub_group_%s "
+                                      "mismatch for local id %d in sub group "
+                                      "%d in group %d obtained {%d, %d, %d, "
+                                      "%d}, expected {%d, %d, %d, %d}\n",
+                                      operation_names(operation), wi_id, sb_id,
+                                      wg_id, device_result.s0, device_result.s1,
+                                      device_result.s2, device_result.s3,
+                                      expected_result.s0, expected_result.s1,
+                                      expected_result.s2, expected_result.s3);
+                            return TEST_FAIL;
+                        }
+                    }
+                    else if (operation == BallotOp::ballot_find_lsb)
+                    {
+                        for (int id = 0; id < current_sbs; ++id)
+                        {
+                            if (bs.test(id))
+                            {
+                                expected_result.s0 = id;
+                                break;
+                            }
+                        }
+                        if (!compare(device_result, expected_result))
+                        {
+                            log_error("ERROR: sub_group_ballot_find_lsb "
+                                      "mismatch for local id %d in sub group "
+                                      "%d in group %d obtained {%d, %d, %d, "
+                                      "%d}, expected {%d, %d, %d, %d}\n",
+                                      wi_id, sb_id, wg_id, device_result.s0,
+                                      device_result.s1, device_result.s2,
+                                      device_result.s3, expected_result.s0,
+                                      expected_result.s1, expected_result.s2,
+                                      expected_result.s3);
+                            return TEST_FAIL;
+                        }
+                    }
+                    else if (operation == BallotOp::ballot_find_msb)
+                    {
+                        for (int id = current_sbs - 1; id >= 0; --id)
+                        {
+                            if (bs.test(id))
+                            {
+                                expected_result.s0 = id;
+                                break;
+                            }
+                        }
+                        if (!compare(device_result, expected_result))
+                        {
+                            log_error("ERROR: sub_group_ballot_find_msb "
+                                      "mismatch for local id %d in sub group "
+                                      "%d in group %d obtained {%d, %d, %d, "
+                                      "%d}, expected {%d, %d, %d, %d}\n",
+                                      wi_id, sb_id, wg_id, device_result.s0,
+                                      device_result.s1, device_result.s2,
+                                      device_result.s3, expected_result.s0,
+                                      expected_result.s1, expected_result.s2,
+                                      expected_result.s3);
+                            return TEST_FAIL;
+                        }
+                    }
+                }
+            }
+            x += lws;
+            y += lws;
+            m += 4 * lws;
+        }
+        log_info("  sub_group_ballot_%s(%s)... passed\n",
+                 operation_names(operation), TypeManager<Ty>::name());
+        return TEST_PASS;
+    }
+};
+
+// test mask functions
+template <typename Ty, BallotOp operation> struct SMASK
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int wi_id, wg_id, l, sb_id;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        int wg_number = gws / lws;
+        log_info("  get_sub_group_%s_mask...\n", operation_names(operation));
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            { // for each subgroup
+                int wg_offset = sb_id * sbs;
+                int current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+                // Produce expected masks for each work item in the subgroup
+                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                {
+                    int midx = 4 * wg_offset + 4 * wi_id;
+                    cl_uint max_sub_group_size = m[midx + 2];
+                    cl_uint4 expected_mask = { 0 };
+                    expected_mask = generate_bit_mask(
+                        wi_id, operation_names(operation), max_sub_group_size);
+                    set_value(t[wg_offset + wi_id], expected_mask);
+                }
+            }
+
+            // Now map into work group using map from device
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            {
+                x[wi_id] = t[wi_id];
+            }
+            x += lws;
+            m += 4 * lws;
+        }
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int wi_id, wg_id, sb_id;
+        int gws = test_params.global_workgroup_size;
+        int lws = test_params.local_workgroup_size;
+        int sbs = test_params.subgroup_size;
+        int sb_number = (lws + sbs - 1) / sbs;
+        Ty expected_result, device_result;
+        int wg_number = gws / lws;
+
+        for (wg_id = 0; wg_id < wg_number; ++wg_id)
+        { // for each work_group
+            for (wi_id = 0; wi_id < lws; ++wi_id)
+            { // inside the work_group
+                mx[wi_id] = x[wi_id]; // read host inputs for work_group
+                my[wi_id] = y[wi_id]; // read device outputs for work_group
+            }
+
+            for (sb_id = 0; sb_id < sb_number; ++sb_id)
+            {
+                int wg_offset = sb_id * sbs;
+                int current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs;
+
+                // Check result
+                for (wi_id = 0; wi_id < current_sbs; ++wi_id)
+                { // inside the subgroup
+                    expected_result =
+                        mx[wg_offset + wi_id]; // read host input for subgroup
+                    device_result =
+                        my[wg_offset
+                           + wi_id]; // read device outputs for subgroup
+                    if (!compare(device_result, expected_result))
+                    {
+                        log_error("ERROR:  get_sub_group_%s_mask... mismatch "
+                                  "for local id %d in sub group %d in group "
+                                  "%d, obtained %d, expected %d\n",
+                                  operation_names(operation), wi_id, sb_id,
+                                  wg_id, device_result, expected_result);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+            x += lws;
+            y += lws;
+            m += 4 * lws;
+        }
+        log_info("  get_sub_group_%s_mask... passed\n",
+                 operation_names(operation));
+        return TEST_PASS;
+    }
+};
+
+static const char *bcast_non_uniform_source =
+    "__kernel void test_bcast_non_uniform(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n"
+    "        out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);\n"
+    "    } else {\n"
+    "       out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);\n"
+    "    }\n"
+    "}\n";
+
+static const char *bcast_first_source =
+    "__kernel void test_bcast_first(const __global Type *in, __global int4 "
+    "*xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n"
+    "       out[gid] = sub_group_broadcast_first(x);\n"
+    "    } else {\n"
+    "       out[gid] = sub_group_broadcast_first(x);\n"
+    "    }\n"
+    "}\n";
+
+static const char *ballot_bit_count_source =
+    "__kernel void test_sub_group_ballot_bit_count(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint4 value = (uint4)(0,0,0,0);\n"
+    "    value = (uint4)(sub_group_ballot_bit_count(x),0,0,0);\n"
+    "    out[gid] = value;\n"
+    "}\n";
+
+static const char *ballot_inclusive_scan_source =
+    "__kernel void test_sub_group_ballot_inclusive_scan(const __global Type "
+    "*in, __global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint4 value = (uint4)(0,0,0,0);\n"
+    "    value = (uint4)(sub_group_ballot_inclusive_scan(x),0,0,0);\n"
+    "    out[gid] = value;\n"
+    "}\n";
+
+static const char *ballot_exclusive_scan_source =
+    "__kernel void test_sub_group_ballot_exclusive_scan(const __global Type "
+    "*in, __global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint4 value = (uint4)(0,0,0,0);\n"
+    "    value = (uint4)(sub_group_ballot_exclusive_scan(x),0,0,0);\n"
+    "    out[gid] = value;\n"
+    "}\n";
+
+static const char *ballot_find_lsb_source =
+    "__kernel void test_sub_group_ballot_find_lsb(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint4 value = (uint4)(0,0,0,0);\n"
+    "    value = (uint4)(sub_group_ballot_find_lsb(x),0,0,0);\n"
+    "    out[gid] = value;\n"
+    "}\n";
+
+static const char *ballot_find_msb_source =
+    "__kernel void test_sub_group_ballot_find_msb(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint4 value = (uint4)(0,0,0,0);"
+    "    value = (uint4)(sub_group_ballot_find_msb(x),0,0,0);"
+    "    out[gid] = value ;"
+    "}\n";
+
+static const char *get_subgroup_ge_mask_source =
+    "__kernel void test_get_sub_group_ge_mask(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].z = get_max_sub_group_size();\n"
+    "    Type x = in[gid];\n"
+    "    uint4 mask = get_sub_group_ge_mask();"
+    "    out[gid] = mask;\n"
+    "}\n";
+
+static const char *get_subgroup_gt_mask_source =
+    "__kernel void test_get_sub_group_gt_mask(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].z = get_max_sub_group_size();\n"
+    "    Type x = in[gid];\n"
+    "    uint4 mask = get_sub_group_gt_mask();"
+    "    out[gid] = mask;\n"
+    "}\n";
+
+static const char *get_subgroup_le_mask_source =
+    "__kernel void test_get_sub_group_le_mask(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].z = get_max_sub_group_size();\n"
+    "    Type x = in[gid];\n"
+    "    uint4 mask = get_sub_group_le_mask();"
+    "    out[gid] = mask;\n"
+    "}\n";
+
+static const char *get_subgroup_lt_mask_source =
+    "__kernel void test_get_sub_group_lt_mask(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].z = get_max_sub_group_size();\n"
+    "    Type x = in[gid];\n"
+    "    uint4 mask = get_sub_group_lt_mask();"
+    "    out[gid] = mask;\n"
+    "}\n";
+
+static const char *get_subgroup_eq_mask_source =
+    "__kernel void test_get_sub_group_eq_mask(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].z = get_max_sub_group_size();\n"
+    "    Type x = in[gid];\n"
+    "    uint4 mask = get_sub_group_eq_mask();"
+    "    out[gid] = mask;\n"
+    "}\n";
+
+static const char *ballot_source =
+    "__kernel void test_sub_group_ballot(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "uint4 full_ballot = sub_group_ballot(1);\n"
+    "uint divergence_mask;\n"
+    "uint4 partial_ballot;\n"
+    "uint gid = get_global_id(0);"
+    "XY(xy,gid);\n"
+    "if (get_sub_group_local_id() & 1) {\n"
+    "    divergence_mask = 0xaaaaaaaa;\n"
+    "    partial_ballot = sub_group_ballot(1);\n"
+    "} else {\n"
+    "    divergence_mask = 0x55555555;\n"
+    "    partial_ballot = sub_group_ballot(1);\n"
+    "}\n"
+    " size_t lws = get_local_size(0);\n"
+    "uint4 masked_ballot = full_ballot;\n"
+    "masked_ballot.x &= divergence_mask;\n"
+    "masked_ballot.y &= divergence_mask;\n"
+    "masked_ballot.z &= divergence_mask;\n"
+    "masked_ballot.w &= divergence_mask;\n"
+    "out[gid] = all(masked_ballot == partial_ballot);\n"
+
+    "} \n";
+
+static const char *ballot_source_inverse =
+    "__kernel void test_sub_group_ballot_inverse(const __global "
+    "Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint4 value = (uint4)(10,0,0,0);\n"
+    "    if (get_sub_group_local_id() & 1) {"
+    "        uint4 partial_ballot_mask = "
+    "(uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA);"
+    "        if (sub_group_inverse_ballot(partial_ballot_mask)) {\n"
+    "            value = (uint4)(1,0,0,1);\n"
+    "        } else {\n"
+    "            value = (uint4)(0,0,0,1);\n"
+    "        }\n"
+    "    } else {\n"
+    "       uint4 partial_ballot_mask = "
+    "(uint4)(0x55555555,0x55555555,0x55555555,0x55555555);"
+    "        if (sub_group_inverse_ballot(partial_ballot_mask)) {\n"
+    "            value = (uint4)(1,0,0,2);\n"
+    "        } else {\n"
+    "            value = (uint4)(0,0,0,2);\n"
+    "        }\n"
+    "    }\n"
+    "    out[gid] = value;\n"
+    "}\n";
+
+static const char *ballot_bit_extract_source =
+    "__kernel void test_sub_group_ballot_bit_extract(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    uint index = xy[gid].z;\n"
+    "    uint4 value = (uint4)(10,0,0,0);\n"
+    "    if (get_sub_group_local_id() & 1) {"
+    "       if (sub_group_ballot_bit_extract(x, xy[gid].z)) {\n"
+    "           value = (uint4)(1,0,0,1);\n"
+    "       } else {\n"
+    "           value = (uint4)(0,0,0,1);\n"
+    "       }\n"
+    "    } else {\n"
+    "       if (sub_group_ballot_bit_extract(x, xy[gid].w)) {\n"
+    "           value = (uint4)(1,0,0,2);\n"
+    "       } else {\n"
+    "           value = (uint4)(0,0,0,2);\n"
+    "       }\n"
+    "    }\n"
+    "    out[gid] = value;\n"
+    "}\n";
+
+template <typename T> int run_non_uniform_broadcast_for_type(RunTestForType rft)
+{
+    int error =
+        rft.run_impl<T, BC<T, SubgroupsBroadcastOp::non_uniform_broadcast>>(
+            "test_bcast_non_uniform", bcast_non_uniform_source);
+    return error;
+}
+
+
+}
+
+int test_subgroup_functions_ballot(cl_device_id device, cl_context context,
+                                   cl_command_queue queue, int num_elements)
+{
+    std::vector<std::string> required_extensions = { "cl_khr_subgroup_ballot" };
+    constexpr size_t global_work_size = 170;
+    constexpr size_t local_work_size = 64;
+    WorkGroupParams test_params(global_work_size, local_work_size,
+                                required_extensions);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+
+    // non uniform broadcast functions
+    int error = run_non_uniform_broadcast_for_type<cl_int>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_int2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_int3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_int4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_int8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_int16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_uint>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uint2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_uint3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uint4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uint8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uint16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_char>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_char2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_char3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_char4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_char8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_char16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_uchar>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uchar2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_uchar3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uchar4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uchar8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_uchar16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_short>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_short2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_short3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_short4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_short8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_short16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_ushort>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ushort2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_ushort3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ushort4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ushort8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ushort16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_long>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_long2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_long3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_long4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_long8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_long16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_ulong>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ulong2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_ulong3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ulong4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ulong8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_ulong16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_float>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_float2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_float3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_float4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_float8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_float16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<cl_double>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_double2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_double3>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_double4>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_double8>(rft);
+    error |= run_non_uniform_broadcast_for_type<cl_double16>(rft);
+
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_half>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_half2>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_half3>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_half4>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_half8>(rft);
+    error |= run_non_uniform_broadcast_for_type<subgroups::cl_half16>(rft);
+
+    // broadcast first functions
+    error |=
+        rft.run_impl<cl_int, BC<cl_int, SubgroupsBroadcastOp::broadcast_first>>(
+            "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_uint,
+                          BC<cl_uint, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_long,
+                          BC<cl_long, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_ulong,
+                          BC<cl_ulong, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_short,
+                          BC<cl_short, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_ushort,
+                          BC<cl_ushort, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_char,
+                          BC<cl_char, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_uchar,
+                          BC<cl_uchar, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_float,
+                          BC<cl_float, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<cl_double,
+                          BC<cl_double, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+    error |= rft.run_impl<
+        subgroups::cl_half,
+        BC<subgroups::cl_half, SubgroupsBroadcastOp::broadcast_first>>(
+        "test_bcast_first", bcast_first_source);
+
+    // mask functions
+    error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::eq_mask>>(
+        "test_get_sub_group_eq_mask", get_subgroup_eq_mask_source);
+    error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::ge_mask>>(
+        "test_get_sub_group_ge_mask", get_subgroup_ge_mask_source);
+    error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::gt_mask>>(
+        "test_get_sub_group_gt_mask", get_subgroup_gt_mask_source);
+    error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::le_mask>>(
+        "test_get_sub_group_le_mask", get_subgroup_le_mask_source);
+    error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::lt_mask>>(
+        "test_get_sub_group_lt_mask", get_subgroup_lt_mask_source);
+
+    // ballot functions
+    error |= rft.run_impl<cl_uint, BALLOT<cl_uint>>("test_sub_group_ballot",
+                                                    ballot_source);
+    error |= rft.run_impl<cl_uint4,
+                          BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>(
+        "test_sub_group_ballot_inverse", ballot_source_inverse);
+    error |= rft.run_impl<
+        cl_uint4, BALLOT_BIT_EXTRACT<cl_uint4, BallotOp::ballot_bit_extract>>(
+        "test_sub_group_ballot_bit_extract", ballot_bit_extract_source);
+    error |= rft.run_impl<
+        cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_bit_count>>(
+        "test_sub_group_ballot_bit_count", ballot_bit_count_source);
+    error |= rft.run_impl<
+        cl_uint4,
+        BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_inclusive_scan>>(
+        "test_sub_group_ballot_inclusive_scan", ballot_inclusive_scan_source);
+    error |= rft.run_impl<
+        cl_uint4,
+        BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_exclusive_scan>>(
+        "test_sub_group_ballot_exclusive_scan", ballot_exclusive_scan_source);
+    error |= rft.run_impl<
+        cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_lsb>>(
+        "test_sub_group_ballot_find_lsb", ballot_find_lsb_source);
+    error |= rft.run_impl<
+        cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_msb>>(
+        "test_sub_group_ballot_find_msb", ballot_find_msb_source);
+    return error;
+}
diff --git a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
new file mode 100644
index 0000000..588e9ce
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp
@@ -0,0 +1,340 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "subgroup_common_templates.h"
+#include "harness/typeWrappers.h"
+
+#define CLUSTER_SIZE 4
+#define CLUSTER_SIZE_STR "4"
+
+namespace {
+static const char *redadd_clustered_source =
+    "__kernel void test_redadd_clustered(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR
+    ");\n"
+    "}\n";
+
+static const char *redmax_clustered_source =
+    "__kernel void test_redmax_clustered(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR
+    ");\n"
+    "}\n";
+
+static const char *redmin_clustered_source =
+    "__kernel void test_redmin_clustered(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR
+    ");\n"
+    "}\n";
+
+static const char *redmul_clustered_source =
+    "__kernel void test_redmul_clustered(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR
+    ");\n"
+    "}\n";
+
+static const char *redand_clustered_source =
+    "__kernel void test_redand_clustered(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR
+    ");\n"
+    "}\n";
+
+static const char *redor_clustered_source =
+    "__kernel void test_redor_clustered(const __global Type *in, __global int4 "
+    "*xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR
+    ");\n"
+    "}\n";
+
+static const char *redxor_clustered_source =
+    "__kernel void test_redxor_clustered(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR
+    ");\n"
+    "}\n";
+
+static const char *redand_clustered_logical_source =
+    "__kernel void test_redand_clustered_logical(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR
+    ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = "
+    "sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR ");\n"
+    "}\n";
+
+static const char *redor_clustered_logical_source =
+    "__kernel void test_redor_clustered_logical(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if (sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR
+    ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = "
+    "sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR ");\n"
+    "}\n";
+
+static const char *redxor_clustered_logical_source =
+    "__kernel void test_redxor_clustered_logical(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    xy[gid].w = 0;\n"
+    "    if ( sizeof(in[gid]) == "
+    "sizeof(sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR
+    ")))\n"
+    "    {xy[gid].w = sizeof(in[gid]);}\n"
+    "    out[gid] = "
+    "sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR ");\n"
+    "}\n";
+
+
+// DESCRIPTION:
+// Test for reduce cluster functions
+template <typename Ty, ArithmeticOp operation> struct RED_CLU
+{
+    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        ng = ng / nw;
+        log_info("  sub_group_clustered_reduce_%s(%s, %d bytes) ...\n",
+                 operation_names(operation), TypeManager<Ty>::name(),
+                 sizeof(Ty));
+        genrand<Ty, operation>(x, t, m, ns, nw, ng);
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        int nj = (nw + ns - 1) / ns;
+        ng = ng / nw;
+
+        for (int k = 0; k < ng; ++k)
+        {
+            std::vector<cl_int> data_type_sizes;
+            // Map to array indexed to array indexed by local ID and sub group
+            for (int j = 0; j < nw; ++j)
+            {
+                mx[j] = x[j];
+                my[j] = y[j];
+                data_type_sizes.push_back(m[4 * j + 3]);
+            }
+
+            for (cl_int dts : data_type_sizes)
+            {
+                if (dts != sizeof(Ty))
+                {
+                    log_error("ERROR: sub_group_clustered_reduce_%s(%s) "
+                              "wrong data type size detected, expected: %d, "
+                              "used by device %d, in group %d\n",
+                              operation_names(operation),
+                              TypeManager<Ty>::name(), sizeof(Ty), dts, k);
+                    return TEST_FAIL;
+                }
+            }
+
+            for (int j = 0; j < nj; ++j)
+            {
+                int ii = j * ns;
+                int n = ii + ns > nw ? nw - ii : ns;
+                int midx = 4 * ii + 2;
+                std::vector<Ty> clusters_results;
+                int clusters_counter = ns / CLUSTER_SIZE;
+                clusters_results.resize(clusters_counter);
+
+                // Compute target
+                Ty tr = mx[ii];
+                for (int i = 0; i < n; ++i)
+                {
+                    if (i % CLUSTER_SIZE == 0)
+                        tr = mx[ii + i];
+                    else
+                        tr = calculate<Ty>(tr, mx[ii + i], operation);
+                    clusters_results[i / CLUSTER_SIZE] = tr;
+                }
+
+                // Check result
+                for (int i = 0; i < n; ++i)
+                {
+                    Ty rr = my[ii + i];
+                    tr = clusters_results[i / CLUSTER_SIZE];
+                    if (!compare(rr, tr))
+                    {
+                        log_error(
+                            "ERROR: sub_group_clustered_reduce_%s(%s) mismatch "
+                            "for local id %d in sub group %d in group %d\n",
+                            operation_names(operation), TypeManager<Ty>::name(),
+                            i, j, k);
+                        return TEST_FAIL;
+                    }
+                }
+            }
+
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+        log_info("  sub_group_clustered_reduce_%s(%s, %d bytes) ... passed\n",
+                 operation_names(operation), TypeManager<Ty>::name(),
+                 sizeof(Ty));
+        return TEST_PASS;
+    }
+};
+
+template <typename T>
+int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft)
+{
+    int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::add_>>(
+        "test_redadd_clustered", redadd_clustered_source);
+    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::max_>>(
+        "test_redmax_clustered", redmax_clustered_source);
+    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::min_>>(
+        "test_redmin_clustered", redmin_clustered_source);
+    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::mul_>>(
+        "test_redmul_clustered", redmul_clustered_source);
+    return error;
+}
+template <typename T> int run_cluster_and_or_xor_for_type(RunTestForType rft)
+{
+    int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::and_>>(
+        "test_redand_clustered", redand_clustered_source);
+    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::or_>>(
+        "test_redor_clustered", redor_clustered_source);
+    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::xor_>>(
+        "test_redxor_clustered", redxor_clustered_source);
+    return error;
+}
+template <typename T>
+int run_cluster_logical_and_or_xor_for_type(RunTestForType rft)
+{
+    int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_and>>(
+        "test_redand_clustered_logical", redand_clustered_logical_source);
+    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_or>>(
+        "test_redor_clustered_logical", redor_clustered_logical_source);
+    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_xor>>(
+        "test_redxor_clustered_logical", redxor_clustered_logical_source);
+
+    return error;
+}
+}
+
+int test_subgroup_functions_clustered_reduce(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements)
+{
+    std::vector<std::string> required_extensions = {
+        "cl_khr_subgroup_clustered_reduce"
+    };
+    constexpr size_t global_work_size = 2000;
+    constexpr size_t local_work_size = 200;
+    WorkGroupParams test_params(global_work_size, local_work_size,
+                                required_extensions);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+
+    int error = run_cluster_red_add_max_min_mul_for_type<cl_int>(rft);
+    error |= run_cluster_red_add_max_min_mul_for_type<cl_uint>(rft);
+    error |= run_cluster_red_add_max_min_mul_for_type<cl_long>(rft);
+    error |= run_cluster_red_add_max_min_mul_for_type<cl_ulong>(rft);
+    error |= run_cluster_red_add_max_min_mul_for_type<cl_short>(rft);
+    error |= run_cluster_red_add_max_min_mul_for_type<cl_ushort>(rft);
+    error |= run_cluster_red_add_max_min_mul_for_type<cl_char>(rft);
+    error |= run_cluster_red_add_max_min_mul_for_type<cl_uchar>(rft);
+    error |= run_cluster_red_add_max_min_mul_for_type<cl_float>(rft);
+    error |= run_cluster_red_add_max_min_mul_for_type<cl_double>(rft);
+    error |= run_cluster_red_add_max_min_mul_for_type<subgroups::cl_half>(rft);
+
+    error |= run_cluster_and_or_xor_for_type<cl_int>(rft);
+    error |= run_cluster_and_or_xor_for_type<cl_uint>(rft);
+    error |= run_cluster_and_or_xor_for_type<cl_long>(rft);
+    error |= run_cluster_and_or_xor_for_type<cl_ulong>(rft);
+    error |= run_cluster_and_or_xor_for_type<cl_short>(rft);
+    error |= run_cluster_and_or_xor_for_type<cl_ushort>(rft);
+    error |= run_cluster_and_or_xor_for_type<cl_char>(rft);
+    error |= run_cluster_and_or_xor_for_type<cl_uchar>(rft);
+
+    error |= run_cluster_logical_and_or_xor_for_type<cl_int>(rft);
+    return error;
+}
diff --git a/test_conformance/subgroups/test_subgroup_extended_types.cpp b/test_conformance/subgroups/test_subgroup_extended_types.cpp
new file mode 100644
index 0000000..98401b8
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_extended_types.cpp
@@ -0,0 +1,138 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "subgroup_common_kernels.h"
+#include "subgroup_common_templates.h"
+#include "harness/typeWrappers.h"
+
+namespace {
+
+template <typename T> int run_broadcast_for_extended_type(RunTestForType rft)
+{
+    int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
+        "test_bcast", bcast_source);
+    return error;
+}
+
+template <typename T> int run_scan_reduction_for_type(RunTestForType rft)
+{
+    int error = rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
+                                                               redadd_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
+                                                            redmax_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
+                                                            redmin_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
+                                                             scinadd_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
+                                                             scinmax_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
+                                                             scinmin_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
+                                                             scexadd_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
+                                                             scexmax_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
+                                                             scexmin_source);
+    return error;
+}
+
+
+}
+
+int test_subgroup_functions_extended_types(cl_device_id device,
+                                           cl_context context,
+                                           cl_command_queue queue,
+                                           int num_elements)
+{
+    std::vector<std::string> required_extensions = {
+        "cl_khr_subgroup_extended_types"
+    };
+    constexpr size_t global_work_size = 2000;
+    constexpr size_t local_work_size = 200;
+    WorkGroupParams test_params(global_work_size, local_work_size,
+                                required_extensions);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+
+    int error = run_broadcast_for_extended_type<cl_uint2>(rft);
+    error |= run_broadcast_for_extended_type<subgroups::cl_uint3>(rft);
+    error |= run_broadcast_for_extended_type<cl_uint4>(rft);
+    error |= run_broadcast_for_extended_type<cl_uint8>(rft);
+    error |= run_broadcast_for_extended_type<cl_uint16>(rft);
+    error |= run_broadcast_for_extended_type<cl_int2>(rft);
+    error |= run_broadcast_for_extended_type<subgroups::cl_int3>(rft);
+    error |= run_broadcast_for_extended_type<cl_int4>(rft);
+    error |= run_broadcast_for_extended_type<cl_int8>(rft);
+    error |= run_broadcast_for_extended_type<cl_int16>(rft);
+
+    error |= run_broadcast_for_extended_type<cl_ulong2>(rft);
+    error |= run_broadcast_for_extended_type<subgroups::cl_ulong3>(rft);
+    error |= run_broadcast_for_extended_type<cl_ulong4>(rft);
+    error |= run_broadcast_for_extended_type<cl_ulong8>(rft);
+    error |= run_broadcast_for_extended_type<cl_ulong16>(rft);
+    error |= run_broadcast_for_extended_type<cl_long2>(rft);
+    error |= run_broadcast_for_extended_type<subgroups::cl_long3>(rft);
+    error |= run_broadcast_for_extended_type<cl_long4>(rft);
+    error |= run_broadcast_for_extended_type<cl_long8>(rft);
+    error |= run_broadcast_for_extended_type<cl_long16>(rft);
+
+    error |= run_broadcast_for_extended_type<cl_float2>(rft);
+    error |= run_broadcast_for_extended_type<subgroups::cl_float3>(rft);
+    error |= run_broadcast_for_extended_type<cl_float4>(rft);
+    error |= run_broadcast_for_extended_type<cl_float8>(rft);
+    error |= run_broadcast_for_extended_type<cl_float16>(rft);
+
+    error |= run_broadcast_for_extended_type<cl_double2>(rft);
+    error |= run_broadcast_for_extended_type<subgroups::cl_double3>(rft);
+    error |= run_broadcast_for_extended_type<cl_double4>(rft);
+    error |= run_broadcast_for_extended_type<cl_double8>(rft);
+    error |= run_broadcast_for_extended_type<cl_double16>(rft);
+
+    error |= run_broadcast_for_extended_type<cl_ushort2>(rft);
+    error |= run_broadcast_for_extended_type<subgroups::cl_ushort3>(rft);
+    error |= run_broadcast_for_extended_type<cl_ushort4>(rft);
+    error |= run_broadcast_for_extended_type<cl_ushort8>(rft);
+    error |= run_broadcast_for_extended_type<cl_ushort16>(rft);
+    error |= run_broadcast_for_extended_type<cl_short2>(rft);
+    error |= run_broadcast_for_extended_type<subgroups::cl_short3>(rft);
+    error |= run_broadcast_for_extended_type<cl_short4>(rft);
+    error |= run_broadcast_for_extended_type<cl_short8>(rft);
+    error |= run_broadcast_for_extended_type<cl_short16>(rft);
+
+    error |= run_broadcast_for_extended_type<cl_uchar2>(rft);
+    error |= run_broadcast_for_extended_type<subgroups::cl_uchar3>(rft);
+    error |= run_broadcast_for_extended_type<cl_uchar4>(rft);
+    error |= run_broadcast_for_extended_type<cl_uchar8>(rft);
+    error |= run_broadcast_for_extended_type<cl_uchar16>(rft);
+    error |= run_broadcast_for_extended_type<cl_char2>(rft);
+    error |= run_broadcast_for_extended_type<subgroups::cl_char3>(rft);
+    error |= run_broadcast_for_extended_type<cl_char4>(rft);
+    error |= run_broadcast_for_extended_type<cl_char8>(rft);
+    error |= run_broadcast_for_extended_type<cl_char16>(rft);
+
+    error |= run_broadcast_for_extended_type<subgroups::cl_half2>(rft);
+    error |= run_broadcast_for_extended_type<subgroups::cl_half3>(rft);
+    error |= run_broadcast_for_extended_type<subgroups::cl_half4>(rft);
+    error |= run_broadcast_for_extended_type<subgroups::cl_half8>(rft);
+    error |= run_broadcast_for_extended_type<subgroups::cl_half16>(rft);
+
+    error |= run_scan_reduction_for_type<cl_uchar>(rft);
+    error |= run_scan_reduction_for_type<cl_char>(rft);
+    error |= run_scan_reduction_for_type<cl_ushort>(rft);
+    error |= run_scan_reduction_for_type<cl_short>(rft);
+    return error;
+}
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
new file mode 100644
index 0000000..eb46ff0
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp
@@ -0,0 +1,473 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "harness/typeWrappers.h"
+#include "subgroup_common_templates.h"
+
+namespace {
+
+static const char *scinadd_non_uniform_source = R"(
+    __kernel void test_scinadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_add(in[gid]);
+            }
+    }
+)";
+
+static const char *scinmax_non_uniform_source = R"(
+    __kernel void test_scinmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_max(in[gid]);
+            }
+    }
+)";
+
+static const char *scinmin_non_uniform_source = R"(
+    __kernel void test_scinmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_min(in[gid]);
+            }
+    }
+)";
+
+static const char *scinmul_non_uniform_source = R"(
+    __kernel void test_scinmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_mul(in[gid]);
+            }
+    }
+)";
+
+static const char *scinand_non_uniform_source = R"(
+    __kernel void test_scinand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_and(in[gid]);
+            }
+    }
+)";
+
+static const char *scinor_non_uniform_source = R"(
+    __kernel void test_scinor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_or(in[gid]);
+            }
+    }
+)";
+
+static const char *scinxor_non_uniform_source = R"(
+    __kernel void test_scinxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_xor(in[gid]);
+            }
+    }
+)";
+
+static const char *scinand_non_uniform_logical_source = R"(
+    __kernel void test_scinand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_logical_and(in[gid]);
+            }
+    }
+)";
+
+static const char *scinor_non_uniform_logical_source = R"(
+    __kernel void test_scinor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_logical_or(in[gid]);
+            }
+    }
+)";
+
+static const char *scinxor_non_uniform_logical_source = R"(
+    __kernel void test_scinxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_inclusive_logical_xor(in[gid]);
+            }
+    }
+)";
+
+static const char *scexadd_non_uniform_source = R"(
+    __kernel void test_scexadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_add(in[gid]);
+            }
+    }
+)";
+
+static const char *scexmax_non_uniform_source = R"(
+    __kernel void test_scexmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_max(in[gid]);
+            }
+    }
+)";
+
+static const char *scexmin_non_uniform_source = R"(
+    __kernel void test_scexmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_min(in[gid]);
+            }
+    }
+)";
+
+static const char *scexmul_non_uniform_source = R"(
+    __kernel void test_scexmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_mul(in[gid]);
+            }
+    }
+)";
+
+static const char *scexand_non_uniform_source = R"(
+    __kernel void test_scexand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_and(in[gid]);
+            }
+    }
+)";
+
+static const char *scexor_non_uniform_source = R"(
+    __kernel void test_scexor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_or(in[gid]);
+            }
+    }
+)";
+
+static const char *scexxor_non_uniform_source = R"(
+    __kernel void test_scexxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_xor(in[gid]);
+            }
+    }
+)";
+
+static const char *scexand_non_uniform_logical_source = R"(
+    __kernel void test_scexand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_logical_and(in[gid]);
+            }
+    }
+)";
+
+static const char *scexor_non_uniform_logical_source = R"(
+    __kernel void test_scexor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_logical_or(in[gid]);
+            }
+    }
+)";
+
+static const char *scexxor_non_uniform_logical_source = R"(
+    __kernel void test_scexxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_scan_exclusive_logical_xor(in[gid]);
+            }
+    }
+)";
+
+static const char *redadd_non_uniform_source = R"(
+    __kernel void test_redadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_add(in[gid]);
+            }
+    }
+)";
+
+static const char *redmax_non_uniform_source = R"(
+    __kernel void test_redmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_max(in[gid]);
+            }
+    }
+)";
+
+static const char *redmin_non_uniform_source = R"(
+    __kernel void test_redmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_min(in[gid]);
+            }
+    }
+)";
+
+static const char *redmul_non_uniform_source = R"(
+    __kernel void test_redmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_mul(in[gid]);
+            }
+    }
+)";
+
+static const char *redand_non_uniform_source = R"(
+    __kernel void test_redand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_and(in[gid]);
+            }
+    }
+)";
+
+static const char *redor_non_uniform_source = R"(
+    __kernel void test_redor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_or(in[gid]);
+            }
+    }
+)";
+
+static const char *redxor_non_uniform_source = R"(
+    __kernel void test_redxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_xor(in[gid]);
+            }
+    }
+)";
+
+static const char *redand_non_uniform_logical_source = R"(
+    __kernel void test_redand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_logical_and(in[gid]);
+            }
+    }
+)";
+
+static const char *redor_non_uniform_logical_source = R"(
+    __kernel void test_redor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_logical_or(in[gid]);
+            }
+    }
+)";
+
+static const char *redxor_non_uniform_logical_source = R"(
+    __kernel void test_redxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        int elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_reduce_logical_xor(in[gid]);
+            }
+    }
+)";
+
+template <typename T>
+int run_functions_add_mul_max_min_for_type(RunTestForType rft)
+{
+    int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
+        "test_scinadd_non_uniform", scinadd_non_uniform_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::mul_>>(
+        "test_scinmul_non_uniform", scinmul_non_uniform_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
+        "test_scinmax_non_uniform", scinmax_non_uniform_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
+        "test_scinmin_non_uniform", scinmin_non_uniform_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
+        "test_scexadd_non_uniform", scexadd_non_uniform_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::mul_>>(
+        "test_scexmul_non_uniform", scexmul_non_uniform_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
+        "test_scexmax_non_uniform", scexmax_non_uniform_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
+        "test_scexmin_non_uniform", scexmin_non_uniform_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>(
+        "test_redadd_non_uniform", redadd_non_uniform_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::mul_>>(
+        "test_redmul_non_uniform", redmul_non_uniform_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>(
+        "test_redmax_non_uniform", redmax_non_uniform_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>(
+        "test_redmin_non_uniform", redmin_non_uniform_source);
+    return error;
+}
+
+template <typename T> int run_functions_and_or_xor_for_type(RunTestForType rft)
+{
+    int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::and_>>(
+        "test_scinand_non_uniform", scinand_non_uniform_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::or_>>(
+        "test_scinor_non_uniform", scinor_non_uniform_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::xor_>>(
+        "test_scinxor_non_uniform", scinxor_non_uniform_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::and_>>(
+        "test_scexand_non_uniform", scexand_non_uniform_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::or_>>(
+        "test_scexor_non_uniform", scexor_non_uniform_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::xor_>>(
+        "test_scexxor_non_uniform", scexxor_non_uniform_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::and_>>(
+        "test_redand_non_uniform", redand_non_uniform_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::or_>>(
+        "test_redor_non_uniform", redor_non_uniform_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::xor_>>(
+        "test_redxor_non_uniform", redxor_non_uniform_source);
+    return error;
+}
+
+template <typename T>
+int run_functions_logical_and_or_xor_for_type(RunTestForType rft)
+{
+    int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_and>>(
+        "test_scinand_non_uniform_logical", scinand_non_uniform_logical_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_or>>(
+        "test_scinor_non_uniform_logical", scinor_non_uniform_logical_source);
+    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_xor>>(
+        "test_scinxor_non_uniform_logical", scinxor_non_uniform_logical_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_and>>(
+        "test_scexand_non_uniform_logical", scexand_non_uniform_logical_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_or>>(
+        "test_scexor_non_uniform_logical", scexor_non_uniform_logical_source);
+    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_xor>>(
+        "test_scexxor_non_uniform_logical", scexxor_non_uniform_logical_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_and>>(
+        "test_redand_non_uniform_logical", redand_non_uniform_logical_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_or>>(
+        "test_redor_non_uniform_logical", redor_non_uniform_logical_source);
+    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_xor>>(
+        "test_redxor_non_uniform_logical", redxor_non_uniform_logical_source);
+    return error;
+}
+
+}
+
+int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device,
+                                                   cl_context context,
+                                                   cl_command_queue queue,
+                                                   int num_elements)
+{
+    std::vector<std::string> required_extensions = {
+        "cl_khr_subgroup_non_uniform_arithmetic"
+    };
+    std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
+                                 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
+                                 0x00ffff00, 0x80000000, 0xaaaaaaaa };
+
+    constexpr size_t global_work_size = 2000;
+    constexpr size_t local_work_size = 200;
+    WorkGroupParams test_params(global_work_size, local_work_size,
+                                required_extensions, masks);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+
+    int error = run_functions_add_mul_max_min_for_type<cl_int>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_uint>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_long>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_ulong>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_short>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_ushort>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_char>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_uchar>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_float>(rft);
+    error |= run_functions_add_mul_max_min_for_type<cl_double>(rft);
+    error |= run_functions_add_mul_max_min_for_type<subgroups::cl_half>(rft);
+
+    error |= run_functions_and_or_xor_for_type<cl_int>(rft);
+    error |= run_functions_and_or_xor_for_type<cl_uint>(rft);
+    error |= run_functions_and_or_xor_for_type<cl_long>(rft);
+    error |= run_functions_and_or_xor_for_type<cl_ulong>(rft);
+    error |= run_functions_and_or_xor_for_type<cl_short>(rft);
+    error |= run_functions_and_or_xor_for_type<cl_ushort>(rft);
+    error |= run_functions_and_or_xor_for_type<cl_char>(rft);
+    error |= run_functions_and_or_xor_for_type<cl_uchar>(rft);
+
+    error |= run_functions_logical_and_or_xor_for_type<cl_int>(rft);
+    return error;
+}
\ No newline at end of file
diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
new file mode 100644
index 0000000..2b00b4d
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp
@@ -0,0 +1,303 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "harness/typeWrappers.h"
+#include <set>
+
+namespace {
+
+template <typename T, NonUniformVoteOp operation> struct VOTE
+{
+    static void gen(T *x, T *t, cl_int *m, const WorkGroupParams &test_params)
+    {
+        int i, ii, j, k, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        int nj = (nw + ns - 1) / ns;
+        int non_uniform_size = ng % nw;
+        ng = ng / nw;
+        int last_subgroup_size = 0;
+        ii = 0;
+
+        log_info("  sub_group_%s%s... \n",
+                 (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
+                 operation_names(operation));
+
+        log_info("  test params: global size = %d local size = %d subgroups "
+                 "size = %d work item mask = 0x%x data type (%s)\n",
+                 test_params.global_workgroup_size, nw, ns, work_items_mask,
+                 TypeManager<T>::name());
+        if (non_uniform_size)
+        {
+            log_info("  non uniform work group size mode ON\n");
+        }
+        if (operation == NonUniformVoteOp::elect) return;
+
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            if (non_uniform_size && k == ng - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, nj, ns, nw,
+                                          last_subgroup_size);
+            }
+            for (j = 0; j < nj; ++j)
+            { // for each subgroup
+                ii = j * ns;
+                if (last_subgroup_size && j == nj - 1)
+                {
+                    n = last_subgroup_size;
+                }
+                else
+                {
+                    n = ii + ns > nw ? nw - ii : ns;
+                }
+                int e = genrand_int32(gMTdata) % 3;
+
+                for (i = 0; i < n; i++)
+                {
+                    if (e == 2)
+                    { // set once 0 and once 1 alternately
+                        int value = i % 2;
+                        set_value(t[ii + i], value);
+                    }
+                    else
+                    { // set 0/1 for all work items in subgroup
+                        set_value(t[ii + i], e);
+                    }
+                }
+            }
+            // Now map into work group using map from device
+            for (j = 0; j < nw; ++j)
+            {
+                x[j] = t[j];
+            }
+            x += nw;
+            m += 4 * nw;
+        }
+    }
+
+    static int chk(T *x, T *y, T *mx, T *my, cl_int *m,
+                   const WorkGroupParams &test_params)
+    {
+        int ii, i, j, k, n;
+        int nw = test_params.local_workgroup_size;
+        int ns = test_params.subgroup_size;
+        int ng = test_params.global_workgroup_size;
+        uint32_t work_items_mask = test_params.work_items_mask;
+        int nj = (nw + ns - 1) / ns;
+        cl_int tr, rr;
+        int non_uniform_size = ng % nw;
+        ng = ng / nw;
+        if (non_uniform_size) ng++;
+        int last_subgroup_size = 0;
+
+        for (k = 0; k < ng; ++k)
+        { // for each work_group
+            if (non_uniform_size && k == ng - 1)
+            {
+                set_last_workgroup_params(non_uniform_size, nj, ns, nw,
+                                          last_subgroup_size);
+            }
+            for (j = 0; j < nw; ++j)
+            { // inside the work_group
+                mx[j] = x[j]; // read host inputs for work_group
+                my[j] = y[j]; // read device outputs for work_group
+            }
+
+            for (j = 0; j < nj; ++j)
+            { // for each subgroup
+                ii = j * ns;
+                if (last_subgroup_size && j == nj - 1)
+                {
+                    n = last_subgroup_size;
+                }
+                else
+                {
+                    n = ii + ns > nw ? nw - ii : ns;
+                }
+
+                rr = 0;
+                if (operation == NonUniformVoteOp::all
+                    || operation == NonUniformVoteOp::all_equal)
+                    tr = 1;
+                if (operation == NonUniformVoteOp::any) tr = 0;
+
+                std::set<int> active_work_items;
+                for (i = 0; i < n; ++i)
+                {
+                    uint32_t check_work_item = 1 << (i % 32);
+                    if (work_items_mask & check_work_item)
+                    {
+                        active_work_items.insert(i);
+                        switch (operation)
+                        {
+                            case NonUniformVoteOp::elect: break;
+
+                            case NonUniformVoteOp::all:
+                                tr &=
+                                    !compare_ordered<T>(mx[ii + i], 0) ? 1 : 0;
+                                break;
+                            case NonUniformVoteOp::any:
+                                tr |=
+                                    !compare_ordered<T>(mx[ii + i], 0) ? 1 : 0;
+                                break;
+                            case NonUniformVoteOp::all_equal:
+                                tr &= compare_ordered<T>(
+                                          mx[ii + i],
+                                          mx[ii + *active_work_items.begin()])
+                                    ? 1
+                                    : 0;
+                                break;
+                            default:
+                                log_error("Unknown operation\n");
+                                return TEST_FAIL;
+                        }
+                    }
+                }
+                if (active_work_items.empty())
+                {
+                    log_info("  no one workitem acitve... in workgroup id = %d "
+                             "subgroup id = %d\n",
+                             k, j);
+                }
+                else
+                {
+                    auto lowest_active = active_work_items.begin();
+                    for (const int &active_work_item : active_work_items)
+                    {
+                        i = active_work_item;
+                        if (operation == NonUniformVoteOp::elect)
+                        {
+                            i == *lowest_active ? tr = 1 : tr = 0;
+                        }
+
+                        // normalize device values on host, non zero set 1.
+                        rr = compare_ordered<T>(my[ii + i], 0) ? 0 : 1;
+
+                        if (rr != tr)
+                        {
+                            log_error("ERROR: sub_group_%s() \n",
+                                      operation_names(operation));
+                            log_error(
+                                "mismatch for work item %d sub group %d in "
+                                "work group %d. Expected: %d Obtained: %d\n",
+                                i, j, k, tr, rr);
+                            return TEST_FAIL;
+                        }
+                    }
+                }
+            }
+
+            x += nw;
+            y += nw;
+            m += 4 * nw;
+        }
+
+        log_info("  sub_group_%s%s... passed\n",
+                 (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
+                 operation_names(operation));
+        return TEST_PASS;
+    }
+};
+static const char *elect_source = R"(
+    __kernel void test_elect(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_elect();
+            }
+    }
+)";
+
+static const char *non_uniform_any_source = R"(
+    __kernel void test_non_uniform_any(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_any(in[gid]);
+            }
+    }
+)";
+
+static const char *non_uniform_all_source = R"(
+    __kernel void test_non_uniform_all(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_all(in[gid]);
+            }
+    }
+)";
+
+static const char *non_uniform_all_equal_source = R"(
+    __kernel void test_non_uniform_all_equal(const __global Type *in, __global int4 *xy, __global Type *out) {
+        int gid = get_global_id(0);
+        XY(xy,gid);
+        uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
+            if (elect_work_item & WORK_ITEMS_MASK){
+                out[gid] = sub_group_non_uniform_all_equal(in[gid]);
+            }
+    }
+)";
+
+template <typename T> int run_vote_all_equal_for_type(RunTestForType rft)
+{
+    int error = rft.run_impl<T, VOTE<T, NonUniformVoteOp::all_equal>>(
+        "test_non_uniform_all_equal", non_uniform_all_equal_source);
+    return error;
+}
+}
+
+int test_subgroup_functions_non_uniform_vote(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements)
+{
+    std::vector<std::string> required_extensions = {
+        "cl_khr_subgroup_non_uniform_vote"
+    };
+
+    std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
+                                 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
+                                 0x00ffff00, 0x80000000 };
+    constexpr size_t global_work_size = 170;
+    constexpr size_t local_work_size = 64;
+    WorkGroupParams test_params(global_work_size, local_work_size,
+                                required_extensions, masks);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+
+    int error = run_vote_all_equal_for_type<cl_int>(rft);
+    error |= run_vote_all_equal_for_type<cl_uint>(rft);
+    error |= run_vote_all_equal_for_type<cl_long>(rft);
+    error |= run_vote_all_equal_for_type<cl_ulong>(rft);
+    error |= run_vote_all_equal_for_type<cl_float>(rft);
+    error |= run_vote_all_equal_for_type<cl_double>(rft);
+    error |= run_vote_all_equal_for_type<subgroups::cl_half>(rft);
+
+    error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::all>>(
+        "test_non_uniform_all", non_uniform_all_source);
+    error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::elect>>(
+        "test_elect", elect_source);
+    error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::any>>(
+        "test_non_uniform_any", non_uniform_any_source);
+    return error;
+}
diff --git a/test_conformance/subgroups/test_subgroup_shuffle.cpp b/test_conformance/subgroups/test_subgroup_shuffle.cpp
new file mode 100644
index 0000000..049f098
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_shuffle.cpp
@@ -0,0 +1,78 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "subgroup_common_templates.h"
+#include "harness/typeWrappers.h"
+#include <bitset>
+
+namespace {
+
+static const char* shuffle_xor_source =
+    "__kernel void test_sub_group_shuffle_xor(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    out[gid] = sub_group_shuffle_xor(x, xy[gid].z);"
+    "}\n";
+
+static const char* shuffle_source =
+    "__kernel void test_sub_group_shuffle(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    out[gid] = sub_group_shuffle(x, xy[gid].z);"
+    "}\n";
+
+template <typename T> int run_shuffle_for_type(RunTestForType rft)
+{
+    int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>(
+        "test_sub_group_shuffle", shuffle_source);
+    error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_xor>>(
+        "test_sub_group_shuffle_xor", shuffle_xor_source);
+    return error;
+}
+
+}
+
+int test_subgroup_functions_shuffle(cl_device_id device, cl_context context,
+                                    cl_command_queue queue, int num_elements)
+{
+    std::vector<std::string> required_extensions{ "cl_khr_subgroup_shuffle" };
+    constexpr size_t global_work_size = 2000;
+    constexpr size_t local_work_size = 200;
+    WorkGroupParams test_params(global_work_size, local_work_size,
+                                required_extensions);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+
+    int error = run_shuffle_for_type<cl_int>(rft);
+    error |= run_shuffle_for_type<cl_uint>(rft);
+    error |= run_shuffle_for_type<cl_long>(rft);
+    error |= run_shuffle_for_type<cl_ulong>(rft);
+    error |= run_shuffle_for_type<cl_short>(rft);
+    error |= run_shuffle_for_type<cl_ushort>(rft);
+    error |= run_shuffle_for_type<cl_char>(rft);
+    error |= run_shuffle_for_type<cl_uchar>(rft);
+    error |= run_shuffle_for_type<cl_float>(rft);
+    error |= run_shuffle_for_type<cl_double>(rft);
+    error |= run_shuffle_for_type<subgroups::cl_half>(rft);
+
+    return error;
+}
diff --git a/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp
new file mode 100644
index 0000000..6000c97
--- /dev/null
+++ b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp
@@ -0,0 +1,81 @@
+//
+// Copyright (c) 2021 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "subgroup_common_templates.h"
+#include "harness/conversions.h"
+#include "harness/typeWrappers.h"
+
+namespace {
+
+static const char* shuffle_down_source =
+    "__kernel void test_sub_group_shuffle_down(const __global Type *in, "
+    "__global int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    out[gid] = sub_group_shuffle_down(x, xy[gid].z);"
+    "}\n";
+static const char* shuffle_up_source =
+    "__kernel void test_sub_group_shuffle_up(const __global Type *in, __global "
+    "int4 *xy, __global Type *out)\n"
+    "{\n"
+    "    int gid = get_global_id(0);\n"
+    "    XY(xy,gid);\n"
+    "    Type x = in[gid];\n"
+    "    out[gid] = sub_group_shuffle_up(x, xy[gid].z);"
+    "}\n";
+
+template <typename T> int run_shuffle_relative_for_type(RunTestForType rft)
+{
+    int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>(
+        "test_sub_group_shuffle_up", shuffle_up_source);
+    error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_down>>(
+        "test_sub_group_shuffle_down", shuffle_down_source);
+    return error;
+}
+
+}
+
+int test_subgroup_functions_shuffle_relative(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements)
+{
+    std::vector<std::string> required_extensions = {
+        "cl_khr_subgroup_shuffle_relative"
+    };
+    constexpr size_t global_work_size = 2000;
+    constexpr size_t local_work_size = 200;
+    WorkGroupParams test_params(global_work_size, local_work_size,
+                                required_extensions);
+    RunTestForType rft(device, context, queue, num_elements, test_params);
+
+    int error = run_shuffle_relative_for_type<cl_int>(rft);
+    error |= run_shuffle_relative_for_type<cl_uint>(rft);
+    error |= run_shuffle_relative_for_type<cl_long>(rft);
+    error |= run_shuffle_relative_for_type<cl_ulong>(rft);
+    error |= run_shuffle_relative_for_type<cl_short>(rft);
+    error |= run_shuffle_relative_for_type<cl_ushort>(rft);
+    error |= run_shuffle_relative_for_type<cl_char>(rft);
+    error |= run_shuffle_relative_for_type<cl_uchar>(rft);
+    error |= run_shuffle_relative_for_type<cl_float>(rft);
+    error |= run_shuffle_relative_for_type<cl_double>(rft);
+    error |= run_shuffle_relative_for_type<subgroups::cl_half>(rft);
+
+    return error;
+}
diff --git a/test_conformance/subgroups/test_workgroup.cpp b/test_conformance/subgroups/test_workgroup.cpp
deleted file mode 100644
index 779d30f..0000000
--- a/test_conformance/subgroups/test_workgroup.cpp
+++ /dev/null
@@ -1,727 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "procs.h"
-#include "subhelpers.h"
-#include "harness/conversions.h"
-#include "harness/typeWrappers.h"
-
-static const char *any_source = "__kernel void test_any(const __global Type "
-                                "*in, __global int2 *xy, __global Type *out)\n"
-                                "{\n"
-                                "    int gid = get_global_id(0);\n"
-                                "    XY(xy,gid);\n"
-                                "    out[gid] = sub_group_any(in[gid]);\n"
-                                "}\n";
-
-static const char *all_source = "__kernel void test_all(const __global Type "
-                                "*in, __global int2 *xy, __global Type *out)\n"
-                                "{\n"
-                                "    int gid = get_global_id(0);\n"
-                                "    XY(xy,gid);\n"
-                                "    out[gid] = sub_group_all(in[gid]);\n"
-                                "}\n";
-
-static const char *bcast_source =
-    "__kernel void test_bcast(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    Type x = in[gid];\n"
-    "    size_t loid = (size_t)((int)x % 100);\n"
-    "    out[gid] = sub_group_broadcast(x, loid);\n"
-    "}\n";
-
-static const char *redadd_source =
-    "__kernel void test_redadd(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_reduce_add(in[gid]);\n"
-    "}\n";
-
-static const char *redmax_source =
-    "__kernel void test_redmax(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_reduce_max(in[gid]);\n"
-    "}\n";
-
-static const char *redmin_source =
-    "__kernel void test_redmin(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_reduce_min(in[gid]);\n"
-    "}\n";
-
-static const char *scinadd_source =
-    "__kernel void test_scinadd(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
-    "}\n";
-
-static const char *scinmax_source =
-    "__kernel void test_scinmax(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
-    "}\n";
-
-static const char *scinmin_source =
-    "__kernel void test_scinmin(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
-    "}\n";
-
-static const char *scexadd_source =
-    "__kernel void test_scexadd(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
-    "}\n";
-
-static const char *scexmax_source =
-    "__kernel void test_scexmax(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
-    "}\n";
-
-static const char *scexmin_source =
-    "__kernel void test_scexmin(const __global Type *in, __global int2 *xy, "
-    "__global Type *out)\n"
-    "{\n"
-    "    int gid = get_global_id(0);\n"
-    "    XY(xy,gid);\n"
-    "    out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
-    "}\n";
-
-
-// Any/All test functions
-template <int Which> struct AA
-{
-    static void gen(cl_int *x, cl_int *t, cl_int *m, int ns, int nw, int ng)
-    {
-        int i, ii, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-        int e;
-
-        ii = 0;
-        for (k = 0; k < ng; ++k)
-        {
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-                e = (int)(genrand_int32(gMTdata) % 3);
-
-                // Initialize data matrix indexed by local id and sub group id
-                switch (e)
-                {
-                    case 0: memset(&t[ii], 0, n * sizeof(cl_int)); break;
-                    case 1:
-                        memset(&t[ii], 0, n * sizeof(cl_int));
-                        i = (int)(genrand_int32(gMTdata) % (cl_uint)n);
-                        t[ii + i] = 41;
-                        break;
-                    case 2: memset(&t[ii], 0xff, n * sizeof(cl_int)); break;
-                }
-            }
-
-            // Now map into work group using map from device
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                x[j] = t[i];
-            }
-
-            x += nw;
-            m += 2 * nw;
-        }
-    }
-
-    static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
-                   int ns, int nw, int ng)
-    {
-        int ii, i, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-        cl_int taa, raa;
-
-        log_info("  sub_group_%s...\n", Which == 0 ? "any" : "all");
-
-        for (k = 0; k < ng; ++k)
-        {
-            // Map to array indexed to array indexed by local ID and sub group
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                mx[i] = x[j];
-                my[i] = y[j];
-            }
-
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-
-                // Compute target
-                if (Which == 0)
-                {
-                    taa = 0;
-                    for (i = 0; i < n; ++i) taa |= mx[ii + i] != 0;
-                }
-                else
-                {
-                    taa = 1;
-                    for (i = 0; i < n; ++i) taa &= mx[ii + i] != 0;
-                }
-
-                // Check result
-                for (i = 0; i < n; ++i)
-                {
-                    raa = my[ii + i] != 0;
-                    if (raa != taa)
-                    {
-                        log_error("ERROR: sub_group_%s mismatch for local id "
-                                  "%d in sub group %d in group %d\n",
-                                  Which == 0 ? "any" : "all", i, j, k);
-                        return -1;
-                    }
-                }
-            }
-
-            x += nw;
-            y += nw;
-            m += 2 * nw;
-        }
-
-        return 0;
-    }
-};
-
-// Reduce functions
-template <typename Ty, int Which> struct RED
-{
-    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
-    {
-        int i, ii, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-
-        ii = 0;
-        for (k = 0; k < ng; ++k)
-        {
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-
-                for (i = 0; i < n; ++i)
-                    t[ii + i] = (Ty)(
-                        (int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1);
-            }
-
-            // Now map into work group using map from device
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                x[j] = t[i];
-            }
-
-            x += nw;
-            m += 2 * nw;
-        }
-    }
-
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
-                   int ng)
-    {
-        int ii, i, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-        Ty tr, rr;
-
-        log_info("  sub_group_reduce_%s(%s)...\n",
-                 Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
-                 TypeName<Ty>::val());
-
-        for (k = 0; k < ng; ++k)
-        {
-            // Map to array indexed to array indexed by local ID and sub group
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                mx[i] = x[j];
-                my[i] = y[j];
-            }
-
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-
-                // Compute target
-                if (Which == 0)
-                {
-                    // add
-                    tr = mx[ii];
-                    for (i = 1; i < n; ++i) tr += mx[ii + i];
-                }
-                else if (Which == 1)
-                {
-                    // max
-                    tr = mx[ii];
-                    for (i = 1; i < n; ++i)
-                        tr = tr > mx[ii + i] ? tr : mx[ii + i];
-                }
-                else if (Which == 2)
-                {
-                    // min
-                    tr = mx[ii];
-                    for (i = 1; i < n; ++i)
-                        tr = tr > mx[ii + i] ? mx[ii + i] : tr;
-                }
-
-                // Check result
-                for (i = 0; i < n; ++i)
-                {
-                    rr = my[ii + i];
-                    if (rr != tr)
-                    {
-                        log_error("ERROR: sub_group_reduce_%s(%s) mismatch for "
-                                  "local id %d in sub group %d in group %d\n",
-                                  Which == 0 ? "add"
-                                             : (Which == 1 ? "max" : "min"),
-                                  TypeName<Ty>::val(), i, j, k);
-                        return -1;
-                    }
-                }
-            }
-
-            x += nw;
-            y += nw;
-            m += 2 * nw;
-        }
-
-        return 0;
-    }
-};
-
-// Scan Inclusive functions
-template <typename Ty, int Which> struct SCIN
-{
-    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
-    {
-        int i, ii, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-
-        ii = 0;
-        for (k = 0; k < ng; ++k)
-        {
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-
-                for (i = 0; i < n; ++i)
-                    // t[ii+i] = (Ty)((int)(genrand_int32(gMTdata) & 0x7fffffff)
-                    // % ns + 1);
-                    t[ii + i] = (Ty)i;
-            }
-
-            // Now map into work group using map from device
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                x[j] = t[i];
-            }
-
-            x += nw;
-            m += 2 * nw;
-        }
-    }
-
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
-                   int ng)
-    {
-        int ii, i, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-        Ty tr, rr;
-
-        log_info("  sub_group_scan_inclusive_%s(%s)...\n",
-                 Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
-                 TypeName<Ty>::val());
-
-        for (k = 0; k < ng; ++k)
-        {
-            // Map to array indexed to array indexed by local ID and sub group
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                mx[i] = x[j];
-                my[i] = y[j];
-            }
-
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-
-                // Check result
-                for (i = 0; i < n; ++i)
-                {
-                    if (Which == 0)
-                    {
-                        tr = i == 0 ? mx[ii] : tr + mx[ii + i];
-                    }
-                    else if (Which == 1)
-                    {
-                        tr = i == 0 ? mx[ii]
-                                    : (tr > mx[ii + i] ? tr : mx[ii + i]);
-                    }
-                    else
-                    {
-                        tr = i == 0 ? mx[ii]
-                                    : (tr > mx[ii + i] ? mx[ii + i] : tr);
-                    }
-
-                    rr = my[ii + i];
-                    if (rr != tr)
-                    {
-                        log_error(
-                            "ERROR: sub_group_scan_inclusive_%s(%s) mismatch "
-                            "for local id %d in sub group %d in group %d\n",
-                            Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
-                            TypeName<Ty>::val(), i, j, k);
-                        return -1;
-                    }
-                }
-            }
-
-            x += nw;
-            y += nw;
-            m += 2 * nw;
-        }
-
-        return 0;
-    }
-};
-
-// Scan Exclusive functions
-template <typename Ty, int Which> struct SCEX
-{
-    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
-    {
-        int i, ii, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-
-        ii = 0;
-        for (k = 0; k < ng; ++k)
-        {
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-
-                for (i = 0; i < n; ++i)
-                    t[ii + i] = (Ty)(
-                        (int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1);
-            }
-
-            // Now map into work group using map from device
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                x[j] = t[i];
-            }
-
-            x += nw;
-            m += 2 * nw;
-        }
-    }
-
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
-                   int ng)
-    {
-        int ii, i, j, k, n;
-        int nj = (nw + ns - 1) / ns;
-        Ty tr, trt, rr;
-
-        log_info("  sub_group_scan_exclusive_%s(%s)...\n",
-                 Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
-                 TypeName<Ty>::val());
-
-        for (k = 0; k < ng; ++k)
-        {
-            // Map to array indexed to array indexed by local ID and sub group
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                mx[i] = x[j];
-                my[i] = y[j];
-            }
-
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-
-                // Check result
-                for (i = 0; i < n; ++i)
-                {
-                    if (Which == 0)
-                    {
-                        tr = i == 0 ? TypeIdentity<Ty, Which>::val() : tr + trt;
-                    }
-                    else if (Which == 1)
-                    {
-                        tr = i == 0 ? TypeIdentity<Ty, Which>::val()
-                                    : (trt > tr ? trt : tr);
-                    }
-                    else
-                    {
-                        tr = i == 0 ? TypeIdentity<Ty, Which>::val()
-                                    : (trt > tr ? tr : trt);
-                    }
-                    trt = mx[ii + i];
-                    rr = my[ii + i];
-
-                    if (rr != tr)
-                    {
-                        log_error(
-                            "ERROR: sub_group_scan_exclusive_%s(%s) mismatch "
-                            "for local id %d in sub group %d in group %d\n",
-                            Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
-                            TypeName<Ty>::val(), i, j, k);
-                        return -1;
-                    }
-                }
-            }
-
-            x += nw;
-            y += nw;
-            m += 2 * nw;
-        }
-
-        return 0;
-    }
-};
-
-// Broadcast functios
-template <typename Ty> struct BC
-{
-    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
-    {
-        int i, ii, j, k, l, n;
-        int nj = (nw + ns - 1) / ns;
-        int d = ns > 100 ? 100 : ns;
-
-        ii = 0;
-        for (k = 0; k < ng; ++k)
-        {
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-                l = (int)(genrand_int32(gMTdata) & 0x7fffffff)
-                    % (d > n ? n : d);
-
-                for (i = 0; i < n; ++i)
-                    t[ii + i] = (Ty)((int)(genrand_int32(gMTdata) & 0x7fffffff)
-                                         % 100 * 100
-                                     + l);
-            }
-
-            // Now map into work group using map from device
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                x[j] = t[i];
-            }
-
-            x += nw;
-            m += 2 * nw;
-        }
-    }
-
-    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
-                   int ng)
-    {
-        int ii, i, j, k, l, n;
-        int nj = (nw + ns - 1) / ns;
-        Ty tr, rr;
-
-        log_info("  sub_group_broadcast(%s)...\n", TypeName<Ty>::val());
-
-        for (k = 0; k < ng; ++k)
-        {
-            // Map to array indexed to array indexed by local ID and sub group
-            for (j = 0; j < nw; ++j)
-            {
-                i = m[2 * j + 1] * ns + m[2 * j];
-                mx[i] = x[j];
-                my[i] = y[j];
-            }
-
-            for (j = 0; j < nj; ++j)
-            {
-                ii = j * ns;
-                n = ii + ns > nw ? nw - ii : ns;
-                l = (int)mx[ii] % 100;
-                tr = mx[ii + l];
-
-                // Check result
-                for (i = 0; i < n; ++i)
-                {
-                    rr = my[ii + i];
-                    if (rr != tr)
-                    {
-                        log_error("ERROR: sub_group_broadcast(%s) mismatch for "
-                                  "local id %d in sub group %d in group %d\n",
-                                  TypeName<Ty>::val(), i, j, k);
-                        return -1;
-                    }
-                }
-            }
-
-            x += nw;
-            y += nw;
-            m += 2 * nw;
-        }
-
-        return 0;
-    }
-};
-
-#define G 2000
-#define L 200
-struct run_for_type
-{
-    run_for_type(cl_device_id device, cl_context context,
-                 cl_command_queue queue, int num_elements,
-                 bool useCoreSubgroups)
-    {
-        device_ = device;
-        context_ = context;
-        queue_ = queue;
-        num_elements_ = num_elements;
-        useCoreSubgroups_ = useCoreSubgroups;
-    }
-
-    template <typename T> cl_int run()
-    {
-        cl_int error;
-        error = test<T, BC<T>, G, L>::run(device_, context_, queue_,
-                                          num_elements_, "test_bcast",
-                                          bcast_source, 0, useCoreSubgroups_);
-        error |= test<T, RED<T, 0>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_redadd",
-            redadd_source, 0, useCoreSubgroups_);
-        error |= test<T, RED<T, 1>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_redmax",
-            redmax_source, 0, useCoreSubgroups_);
-        error |= test<T, RED<T, 2>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_redmin",
-            redmin_source, 0, useCoreSubgroups_);
-        error |= test<T, SCIN<T, 0>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_scinadd",
-            scinadd_source, 0, useCoreSubgroups_);
-        error |= test<T, SCIN<T, 1>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_scinmax",
-            scinmax_source, 0, useCoreSubgroups_);
-        error |= test<T, SCIN<T, 2>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_scinmin",
-            scinmin_source, 0, useCoreSubgroups_);
-        error |= test<T, SCEX<T, 0>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_scexadd",
-            scexadd_source, 0, useCoreSubgroups_);
-        error |= test<T, SCEX<T, 1>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_scexmax",
-            scexmax_source, 0, useCoreSubgroups_);
-        error |= test<T, SCEX<T, 2>, G, L>::run(
-            device_, context_, queue_, num_elements_, "test_scexmin",
-            scexmin_source, 0, useCoreSubgroups_);
-        return error;
-    }
-
-private:
-    cl_device_id device_;
-    cl_context context_;
-    cl_command_queue queue_;
-    int num_elements_;
-    bool useCoreSubgroups_;
-};
-
-// Entry point from main
-int test_work_group_functions(cl_device_id device, cl_context context,
-                              cl_command_queue queue, int num_elements,
-                              bool useCoreSubgroups)
-{
-    int error;
-    error = test<int, AA<0>, G, L>::run(device, context, queue, num_elements,
-                                        "test_any", any_source, 0,
-                                        useCoreSubgroups);
-    error |= test<int, AA<1>, G, L>::run(device, context, queue, num_elements,
-                                         "test_all", all_source, 0,
-                                         useCoreSubgroups);
-    run_for_type rft(device, context, queue, num_elements, useCoreSubgroups);
-    error |= rft.run<cl_uint>();
-    error |= rft.run<cl_int>();
-    error |= rft.run<cl_ulong>();
-    error |= rft.run<cl_long>();
-    error |= rft.run<float>();
-    error |= rft.run<double>();
-    // error |= rft.run<cl_half>();
-
-    return error;
-}
-
-int test_work_group_functions_core(cl_device_id device, cl_context context,
-                                   cl_command_queue queue, int num_elements)
-{
-    return test_work_group_functions(device, context, queue, num_elements,
-                                     true);
-}
-
-int test_work_group_functions_ext(cl_device_id device, cl_context context,
-                                  cl_command_queue queue, int num_elements)
-{
-    bool hasExtension = is_extension_available(device, "cl_khr_subgroups");
-
-    if (!hasExtension)
-    {
-        log_info(
-            "Device does not support 'cl_khr_subgroups'. Skipping the test.\n");
-        return TEST_SKIPPED_ITSELF;
-    }
-    return test_work_group_functions(device, context, queue, num_elements,
-                                     false);
-}