test_conformance/clcpp/workgroups/test_wg_broadcast.hpp - external/github.com/KhronosGroup/OpenCL-CTS - Git at Google

 //
 // Copyright (c) 2017 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 #ifndef TEST_CONFORMANCE_CLCPP_WG_TEST_WG_BROADCAST_HPP
 #define TEST_CONFORMANCE_CLCPP_WG_TEST_WG_BROADCAST_HPP

 #include <vector>
 #include <limits>
 #include <algorithm>

 // Common for all OpenCL C++ tests
 #include "../common.hpp"
 // Common for tests of work-group functions
 #include "common.hpp"

 // -----------------------------------------------------------------------------------
 // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
 // -----------------------------------------------------------------------------------
 #if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
 std::string generate_wg_broadcast_1D_kernel_code()
 {
     return
         "__kernel void test_wg_broadcast(global uint *input, global uint *output)\n"
         "{\n"
         "    ulong tid = get_global_id(0);\n"
         "    uint result = work_group_broadcast(input[tid], get_group_id(0) % get_local_size(0));\n"
         "    output[tid] = result;\n"
         "}\n";
 }
 std::string generate_wg_broadcast_2D_kernel_code()
 {
     return
         "__kernel void test_wg_broadcast(global uint *input, global uint *output)\n"
         "{\n"
         "    ulong tid_x = get_global_id(0);\n"
         "    ulong tid_y = get_global_id(1);\n"
         "    size_t x = get_group_id(0) % get_local_size(0);\n"
         "    size_t y = get_group_id(1) % get_local_size(1);\n"
         "    size_t idx = (tid_y * get_global_size(0)) + tid_x;\n"
         "    uint result = work_group_broadcast(input[idx], x, y);\n"
         "    output[idx] = result;\n"
         "}\n";
 }
 std::string generate_wg_broadcast_3D_kernel_code()
 {
     return
         "__kernel void test_wg_broadcast(global uint *input, global uint *output)\n"
         "{\n"
         "    ulong tid_x = get_global_id(0);\n"
         "    ulong tid_y = get_global_id(1);\n"
         "    ulong tid_z = get_global_id(2);\n"
         "    size_t x = get_group_id(0) % get_local_size(0);\n"
         "    size_t y = get_group_id(1) % get_local_size(1);\n"
         "    size_t z = get_group_id(2) % get_local_size(2);\n"
         "    ulong idx = (tid_z * get_global_size(1) * get_global_size(0)) + (tid_y * get_global_size(0)) + tid_x;\n"
         "    uint result = work_group_broadcast(input[idx], x, y, z);\n"
         "    output[idx] = result;\n"
         "}\n";
 }
 #else
 std::string generate_wg_broadcast_1D_kernel_code()
 {
     return "#include <opencl_memory>\n"
            "#include <opencl_work_item>\n"
            "#include <opencl_work_group>\n"
            "using namespace cl;\n"
            "__kernel void test_wg_broadcast(global_ptr<uint[]> input, global_ptr<uint[]> output)\n"
            "{\n"
            "    ulong tid = get_global_id(0);\n"
            "    uint result = work_group_broadcast(input[tid], get_group_id(0) % get_local_size(0));\n"
            "    output[tid] = result;\n"
            "}\n";
 }
 std::string generate_wg_broadcast_2D_kernel_code()
 {
     return "#include <opencl_memory>\n"
            "#include <opencl_work_item>\n"
            "#include <opencl_work_group>\n"
            "using namespace cl;\n"
            "__kernel void test_wg_broadcast(global_ptr<uint[]> input, global_ptr<uint[]> output)\n"
            "{\n"
            "    ulong tid_x = get_global_id(0);\n"
            "    ulong tid_y = get_global_id(1);\n"
            "    size_t x = get_group_id(0) % get_local_size(0);\n"
            "    size_t y = get_group_id(1) % get_local_size(1);\n"
            "    size_t idx = (tid_y * get_global_size(0)) + tid_x;\n"
            "    uint result = work_group_broadcast(input[idx], x, y);\n"
            "    output[idx] = result;\n"
            "}\n";
 }
 std::string generate_wg_broadcast_3D_kernel_code()
 {
     return "#include <opencl_memory>\n"
            "#include <opencl_work_item>\n"
            "#include <opencl_work_group>\n"
            "using namespace cl;\n"
            "__kernel void test_wg_broadcast(global_ptr<uint[]> input, global_ptr<uint[]> output)\n"
            "{\n"
            "    ulong tid_x = get_global_id(0);\n"
            "    ulong tid_y = get_global_id(1);\n"
            "    ulong tid_z = get_global_id(2);\n"
            "    size_t x = get_group_id(0) % get_local_size(0);\n"
            "    size_t y = get_group_id(1) % get_local_size(1);\n"
            "    size_t z = get_group_id(2) % get_local_size(2);\n"
            "    ulong idx = (tid_z * get_global_size(1) * get_global_size(0)) + (tid_y * get_global_size(0)) + tid_x;\n"
            "    uint result = work_group_broadcast(input[idx], x, y, z);\n"
            "    output[idx] = result;\n"
            "}\n";
 }
 #endif

 int
 verify_wg_broadcast_1D(const std::vector<cl_uint> &in, const std::vector<cl_uint> &out, size_t n, size_t wg_size)
 {
     size_t i, j;
     size_t group_id;

     for (i=0,group_id=0; i<n; i+=wg_size,group_id++)
     {
         int local_size = (n-i) > wg_size ? wg_size : (n-i);
         cl_uint broadcast_result = in[i + (group_id % local_size)];
         for (j=0; j<local_size; j++)
         {
             if ( broadcast_result != out[i+j] )
             {
                 log_info("work_group_broadcast: Error at %lu: expected = %u, got = %u\n", i+j, broadcast_result, out[i+j]);
                 return -1;
             }
         }
     }

     return CL_SUCCESS;
 }

 int
 verify_wg_broadcast_2D(const std::vector<cl_uint> &in, const std::vector<cl_uint> &out,
                        size_t nx, size_t ny,
                        size_t wg_size_x, size_t wg_size_y)
 {
     size_t i, j, _i, _j;
     size_t group_id_x, group_id_y;

     for (i=0,group_id_y=0; i<ny; i+=wg_size_y,group_id_y++)
     {
         size_t y = group_id_y % wg_size_y;
         size_t local_size_y = (ny-i) > wg_size_y ? wg_size_y : (ny-i);
         for (_i=0; _i < local_size_y; _i++)
         {
             for (j=0,group_id_x=0; j<nx; j+=wg_size_x,group_id_x++)
             {
                 size_t x = group_id_x % wg_size_x;
                 size_t local_size_x = (nx-j) > wg_size_x ? wg_size_x : (nx-j);
                 cl_uint broadcast_result = in[(i + y) * nx + (j + x)];
                 for (_j=0; _j < local_size_x; _j++)
                 {
                     size_t indx = (i + _i) * nx + (j + _j);
                     if ( broadcast_result != out[indx] )
                     {
                         log_info("%lu\n", indx);
                         log_info("%lu\n", ((i + y) * nx + (j + x)));
                          log_info("%lu\n", out.size());
                         log_info("work_group_broadcast: Error at (%lu, %lu): expected = %u, got = %u\n", j+_j, i+_i, broadcast_result, out[indx]);
                         return -1;
                     }
                 }
             }
         }
     }

     return CL_SUCCESS;
 }

 int
 verify_wg_broadcast_3D(const std::vector<cl_uint> &in, const std::vector<cl_uint> &out,
                        size_t nx, size_t ny, size_t nz,
                        size_t wg_size_x, size_t wg_size_y, size_t wg_size_z)
 {
     size_t i, j, k, _i, _j, _k;
     size_t group_id_x, group_id_y, group_id_z;

     for (i=0,group_id_z=0; i<nz; i+=wg_size_z,group_id_z++)
     {
         size_t z = group_id_z % wg_size_z;
         size_t local_size_z = (nz-i) > wg_size_z ? wg_size_z : (nz-i);
         for (_i=0; _i < local_size_z; _i++)
         {
             for (j=0,group_id_y=0; j<ny; j+=wg_size_y,group_id_y++)
             {
                 size_t y = group_id_y % wg_size_y;
                 size_t local_size_y = (ny-j) > wg_size_y ? wg_size_y : (ny-j);
                 for (_j=0; _j < local_size_y; _j++)
                 {
                     for (k=0,group_id_x=0; k<nx; k+=wg_size_x,group_id_x++)
                     {
                         size_t x = group_id_x % wg_size_x;
                         size_t local_size_x = (nx-k) > wg_size_x ? wg_size_x : (nx-k);
                         cl_uint broadcast_result = in[(i + z) * ny * nz + (j + y) * nx + (k + x)];
                         for (_k=0; _k < local_size_x; _k++)
                         {
                             size_t indx = (i + _i) * ny * nx + (j + _j) * nx + (k + _k);
                             if ( broadcast_result != out[indx] )
                             {
                                 log_info(
                                     "work_group_broadcast: Error at (%lu, %lu, %lu): expected = %u, got = %u\n",
                                     k+_k, j+_j, i+_i,
                                     broadcast_result, out[indx]);
                                 return -1;
                             }
                         }
                     }
                 }
             }
         }
     }
     return CL_SUCCESS;
 }

 std::vector<cl_uint> generate_input_wg_broadcast(size_t count, size_t wg_size)
 {
     std::vector<cl_uint> input(count, cl_uint(0));
     size_t j = wg_size;
     for(size_t i = 0; i < count; i++)
     {
         input[i] = static_cast<cl_uint>(j);
         j--;
         if(j == 0)
         {
             j = wg_size;
         }
     }
     return input;
 }

 std::vector<cl_uint> generate_output_wg_broadcast(size_t count, size_t wg_size)
 {
     (void) wg_size;
     return std::vector<cl_uint>(count, cl_uint(1));
 }

 int work_group_broadcast(cl_device_id device, cl_context context, cl_command_queue queue, size_t count, size_t dim)
 {
     cl_mem buffers[2];
     cl_program program;
     cl_kernel kernel;
     size_t flat_wg_size;
     size_t wg_size[] = { 1, 1, 1};
     size_t work_size[] = { 1, 1, 1};
     int err;

     // Get kernel source code
     std::string code_str;
     if(dim > 2) code_str = generate_wg_broadcast_3D_kernel_code();
     else if(dim > 1) code_str = generate_wg_broadcast_2D_kernel_code();
     else code_str = generate_wg_broadcast_1D_kernel_code();

 // -----------------------------------------------------------------------------------
 // ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
 // -----------------------------------------------------------------------------------
 // Only OpenCL C++ to SPIR-V compilation
 #if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
     err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_broadcast");
     RETURN_ON_ERROR(err)
     return err;
 // Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
 #elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
     err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_broadcast", "-cl-std=CL2.0", false);
     RETURN_ON_ERROR(err)
 #else
     err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_broadcast");
     RETURN_ON_ERROR(err)
 #endif

     // Get max flat workgroup size
     err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &flat_wg_size, NULL);
     RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")

     // Set local work size
     wg_size[0] = flat_wg_size;
     if(dim > 2)
     {
         if (flat_wg_size >=512)
         {
             wg_size[0] = wg_size[1] = wg_size[2] = 8;
         }
         else if (flat_wg_size >= 64)
         {
             wg_size[0] = wg_size[1] = wg_size[2] = 4;
         }
         else if (flat_wg_size >= 8)
         {
             wg_size[0] = wg_size[1] = wg_size[2] = 2;
         }
         else
         {
             wg_size[0] = wg_size[1] = wg_size[2] = 1;
         }
     }
     else if(dim > 1)
     {
         if (flat_wg_size >= 256)
         {
             wg_size[0] = wg_size[1] = 16;
         }
         else if (flat_wg_size >=64)
         {
             wg_size[0] = wg_size[1] = 8;
         }
         else if (flat_wg_size >= 16)
         {
             wg_size[0] = wg_size[1] = 4;
         }
         else
         {
             wg_size[0] = wg_size[1] = 1;
         }
     }

     // Calculate flat local work size
     flat_wg_size = wg_size[0];
     if(dim > 1) flat_wg_size *= wg_size[1];
     if(dim > 2) flat_wg_size *= wg_size[2];

     // Calculate global work size
     size_t flat_work_size = count;
     // 3D
     if(dim > 2)
     {
         size_t wg_number = static_cast<size_t>(
             std::ceil(static_cast<double>(count / 3) / (wg_size[0] * wg_size[1] * wg_size[2]))
         );
         work_size[0] = wg_number * wg_size[0];
         work_size[1] = wg_number * wg_size[1];
         work_size[2] = wg_number * wg_size[2];
         flat_work_size = work_size[0] * work_size[1] * work_size[2];
     }
     // 2D
     else if(dim > 1)
     {
         size_t wg_number = static_cast<size_t>(
             std::ceil(static_cast<double>(count / 2) / (wg_size[0] * wg_size[1]))
         );
         work_size[0] = wg_number * wg_size[0];
         work_size[1] = wg_number * wg_size[1];
         flat_work_size = work_size[0] * work_size[1];
     }
     // 1D
     else
     {
         size_t wg_number = static_cast<size_t>(
             std::ceil(static_cast<double>(count) / wg_size[0])
         );
         flat_work_size = wg_number * wg_size[0];
         work_size[0] = flat_work_size;
     }

     std::vector<cl_uint> input = generate_input_wg_broadcast(flat_work_size, flat_wg_size);
     std::vector<cl_uint> output = generate_output_wg_broadcast(flat_work_size, flat_wg_size);

     buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                 sizeof(cl_uint) * input.size(), NULL, &err);
     RETURN_ON_CL_ERROR(err, "clCreateBuffer");

     buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                 sizeof(cl_uint) * output.size(), NULL, &err);
     RETURN_ON_CL_ERROR(err, "clCreateBuffer");

     err = clEnqueueWriteBuffer(
         queue, buffers[0], CL_TRUE, 0, sizeof(cl_uint) * input.size(),
         static_cast<void *>(input.data()), 0, NULL, NULL
     );
     RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");

     err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
     err |= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
     RETURN_ON_CL_ERROR(err, "clSetKernelArg");

     err = clEnqueueNDRangeKernel(queue, kernel, dim, NULL, work_size, wg_size, 0, NULL, NULL);
     RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");

     err = clEnqueueReadBuffer(
         queue, buffers[1], CL_TRUE, 0, sizeof(cl_uint) * output.size(),
         static_cast<void *>(output.data()), 0, NULL, NULL
     );
     RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");

     int result = CL_SUCCESS;
     // 3D
     if(dim > 2)
     {
         result = verify_wg_broadcast_3D(
             input, output,
             work_size[0], work_size[1], work_size[2],
             wg_size[0], wg_size[1], wg_size[2]
         );
     }
     // 2D
     else if(dim > 1)
     {
         result = verify_wg_broadcast_2D(
             input, output,
             work_size[0], work_size[1],
             wg_size[0], wg_size[1]
         );
     }
     // 1D
     else
     {
         result = verify_wg_broadcast_1D(
             input, output,
             work_size[0],
             wg_size[0]
         );
     }

     RETURN_ON_ERROR_MSG(result, "work_group_broadcast_%luD failed", dim);
     log_info("work_group_broadcast_%luD passed\n", dim);

     clReleaseMemObject(buffers[0]);
     clReleaseMemObject(buffers[1]);
     clReleaseKernel(kernel);
     clReleaseProgram(program);
     return err;
 }

 AUTO_TEST_CASE(test_work_group_broadcast)
 (cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
 {
     int error = CL_SUCCESS;
     int local_error = CL_SUCCESS;

     local_error = work_group_broadcast(device, context, queue, n_elems, 1);
     CHECK_ERROR(local_error)
     error |= local_error;

     local_error = work_group_broadcast(device, context, queue, n_elems, 2);
     CHECK_ERROR(local_error)
     error |= local_error;

     local_error = work_group_broadcast(device, context, queue, n_elems, 3);
     CHECK_ERROR(local_error)
     error |= local_error;

     if(error != CL_SUCCESS)
         return -1;
     return CL_SUCCESS;
 }

 #endif // TEST_CONFORMANCE_CLCPP_WG_TEST_WG_BROADCAST_HPP
	//
	// Copyright (c) 2017 The Khronos Group Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//
	#ifndef TEST_CONFORMANCE_CLCPP_WG_TEST_WG_BROADCAST_HPP
	#define TEST_CONFORMANCE_CLCPP_WG_TEST_WG_BROADCAST_HPP

	#include <vector>
	#include <limits>
	#include <algorithm>

	// Common for all OpenCL C++ tests
	#include "../common.hpp"
	// Common for tests of work-group functions
	#include "common.hpp"

	// -----------------------------------------------------------------------------------
	// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
	// -----------------------------------------------------------------------------------
	#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
	std::string generate_wg_broadcast_1D_kernel_code()
	{
	return
	"__kernel void test_wg_broadcast(global uint input, global uint output)\n"
	"{\n"
	" ulong tid = get_global_id(0);\n"
	" uint result = work_group_broadcast(input[tid], get_group_id(0) % get_local_size(0));\n"
	" output[tid] = result;\n"
	"}\n";
	}
	std::string generate_wg_broadcast_2D_kernel_code()
	{
	return
	"__kernel void test_wg_broadcast(global uint input, global uint output)\n"
	"{\n"
	" ulong tid_x = get_global_id(0);\n"
	" ulong tid_y = get_global_id(1);\n"
	" size_t x = get_group_id(0) % get_local_size(0);\n"
	" size_t y = get_group_id(1) % get_local_size(1);\n"
	" size_t idx = (tid_y * get_global_size(0)) + tid_x;\n"
	" uint result = work_group_broadcast(input[idx], x, y);\n"
	" output[idx] = result;\n"
	"}\n";
	}
	std::string generate_wg_broadcast_3D_kernel_code()
	{
	return
	"__kernel void test_wg_broadcast(global uint input, global uint output)\n"
	"{\n"
	" ulong tid_x = get_global_id(0);\n"
	" ulong tid_y = get_global_id(1);\n"
	" ulong tid_z = get_global_id(2);\n"
	" size_t x = get_group_id(0) % get_local_size(0);\n"
	" size_t y = get_group_id(1) % get_local_size(1);\n"
	" size_t z = get_group_id(2) % get_local_size(2);\n"
	" ulong idx = (tid_z * get_global_size(1) * get_global_size(0)) + (tid_y * get_global_size(0)) + tid_x;\n"
	" uint result = work_group_broadcast(input[idx], x, y, z);\n"
	" output[idx] = result;\n"
	"}\n";
	}
	#else
	std::string generate_wg_broadcast_1D_kernel_code()
	{
	return "#include <opencl_memory>\n"
	"#include <opencl_work_item>\n"
	"#include <opencl_work_group>\n"
	"using namespace cl;\n"
	"__kernel void test_wg_broadcast(global_ptr<uint[]> input, global_ptr<uint[]> output)\n"
	"{\n"
	" ulong tid = get_global_id(0);\n"
	" uint result = work_group_broadcast(input[tid], get_group_id(0) % get_local_size(0));\n"
	" output[tid] = result;\n"
	"}\n";
	}
	std::string generate_wg_broadcast_2D_kernel_code()
	{
	return "#include <opencl_memory>\n"
	"#include <opencl_work_item>\n"
	"#include <opencl_work_group>\n"
	"using namespace cl;\n"
	"__kernel void test_wg_broadcast(global_ptr<uint[]> input, global_ptr<uint[]> output)\n"
	"{\n"
	" ulong tid_x = get_global_id(0);\n"
	" ulong tid_y = get_global_id(1);\n"
	" size_t x = get_group_id(0) % get_local_size(0);\n"
	" size_t y = get_group_id(1) % get_local_size(1);\n"
	" size_t idx = (tid_y * get_global_size(0)) + tid_x;\n"
	" uint result = work_group_broadcast(input[idx], x, y);\n"
	" output[idx] = result;\n"
	"}\n";
	}
	std::string generate_wg_broadcast_3D_kernel_code()
	{
	return "#include <opencl_memory>\n"
	"#include <opencl_work_item>\n"
	"#include <opencl_work_group>\n"
	"using namespace cl;\n"
	"__kernel void test_wg_broadcast(global_ptr<uint[]> input, global_ptr<uint[]> output)\n"
	"{\n"
	" ulong tid_x = get_global_id(0);\n"
	" ulong tid_y = get_global_id(1);\n"
	" ulong tid_z = get_global_id(2);\n"
	" size_t x = get_group_id(0) % get_local_size(0);\n"
	" size_t y = get_group_id(1) % get_local_size(1);\n"
	" size_t z = get_group_id(2) % get_local_size(2);\n"
	" ulong idx = (tid_z * get_global_size(1) * get_global_size(0)) + (tid_y * get_global_size(0)) + tid_x;\n"
	" uint result = work_group_broadcast(input[idx], x, y, z);\n"
	" output[idx] = result;\n"
	"}\n";
	}
	#endif

	int
	verify_wg_broadcast_1D(const std::vector<cl_uint> &in, const std::vector<cl_uint> &out, size_t n, size_t wg_size)
	{
	size_t i, j;
	size_t group_id;

	for (i=0,group_id=0; i<n; i+=wg_size,group_id++)
	{
	int local_size = (n-i) > wg_size ? wg_size : (n-i);
	cl_uint broadcast_result = in[i + (group_id % local_size)];
	for (j=0; j<local_size; j++)
	{
	if ( broadcast_result != out[i+j] )
	{
	log_info("work_group_broadcast: Error at %lu: expected = %u, got = %u\n", i+j, broadcast_result, out[i+j]);
	return -1;
	}
	}
	}

	return CL_SUCCESS;
	}

	int
	verify_wg_broadcast_2D(const std::vector<cl_uint> &in, const std::vector<cl_uint> &out,
	size_t nx, size_t ny,
	size_t wg_size_x, size_t wg_size_y)
	{
	size_t i, j, _i, _j;
	size_t group_id_x, group_id_y;

	for (i=0,group_id_y=0; i<ny; i+=wg_size_y,group_id_y++)
	{
	size_t y = group_id_y % wg_size_y;
	size_t local_size_y = (ny-i) > wg_size_y ? wg_size_y : (ny-i);
	for (_i=0; _i < local_size_y; _i++)
	{
	for (j=0,group_id_x=0; j<nx; j+=wg_size_x,group_id_x++)
	{
	size_t x = group_id_x % wg_size_x;
	size_t local_size_x = (nx-j) > wg_size_x ? wg_size_x : (nx-j);
	cl_uint broadcast_result = in[(i + y) * nx + (j + x)];
	for (_j=0; _j < local_size_x; _j++)
	{
	size_t indx = (i + _i) * nx + (j + _j);
	if ( broadcast_result != out[indx] )
	{
	log_info("%lu\n", indx);
	log_info("%lu\n", ((i + y) * nx + (j + x)));
	log_info("%lu\n", out.size());
	log_info("work_group_broadcast: Error at (%lu, %lu): expected = %u, got = %u\n", j+_j, i+_i, broadcast_result, out[indx]);
	return -1;
	}
	}
	}
	}
	}

	return CL_SUCCESS;
	}

	int
	verify_wg_broadcast_3D(const std::vector<cl_uint> &in, const std::vector<cl_uint> &out,
	size_t nx, size_t ny, size_t nz,
	size_t wg_size_x, size_t wg_size_y, size_t wg_size_z)
	{
	size_t i, j, k, _i, _j, _k;
	size_t group_id_x, group_id_y, group_id_z;

	for (i=0,group_id_z=0; i<nz; i+=wg_size_z,group_id_z++)
	{
	size_t z = group_id_z % wg_size_z;
	size_t local_size_z = (nz-i) > wg_size_z ? wg_size_z : (nz-i);
	for (_i=0; _i < local_size_z; _i++)
	{
	for (j=0,group_id_y=0; j<ny; j+=wg_size_y,group_id_y++)
	{
	size_t y = group_id_y % wg_size_y;
	size_t local_size_y = (ny-j) > wg_size_y ? wg_size_y : (ny-j);
	for (_j=0; _j < local_size_y; _j++)
	{
	for (k=0,group_id_x=0; k<nx; k+=wg_size_x,group_id_x++)
	{
	size_t x = group_id_x % wg_size_x;
	size_t local_size_x = (nx-k) > wg_size_x ? wg_size_x : (nx-k);
	cl_uint broadcast_result = in[(i + z) * ny * nz + (j + y) * nx + (k + x)];
	for (_k=0; _k < local_size_x; _k++)
	{
	size_t indx = (i + _i) * ny * nx + (j + _j) * nx + (k + _k);
	if ( broadcast_result != out[indx] )
	{
	log_info(
	"work_group_broadcast: Error at (%lu, %lu, %lu): expected = %u, got = %u\n",
	k+_k, j+_j, i+_i,
	broadcast_result, out[indx]);
	return -1;
	}
	}
	}
	}
	}
	}
	}
	return CL_SUCCESS;
	}

	std::vector<cl_uint> generate_input_wg_broadcast(size_t count, size_t wg_size)
	{
	std::vector<cl_uint> input(count, cl_uint(0));
	size_t j = wg_size;
	for(size_t i = 0; i < count; i++)
	{
	input[i] = static_cast<cl_uint>(j);
	j--;
	if(j == 0)
	{
	j = wg_size;
	}
	}
	return input;
	}

	std::vector<cl_uint> generate_output_wg_broadcast(size_t count, size_t wg_size)
	{
	(void) wg_size;
	return std::vector<cl_uint>(count, cl_uint(1));
	}

	int work_group_broadcast(cl_device_id device, cl_context context, cl_command_queue queue, size_t count, size_t dim)
	{
	cl_mem buffers[2];
	cl_program program;
	cl_kernel kernel;
	size_t flat_wg_size;
	size_t wg_size[] = { 1, 1, 1};
	size_t work_size[] = { 1, 1, 1};
	int err;

	// Get kernel source code
	std::string code_str;
	if(dim > 2) code_str = generate_wg_broadcast_3D_kernel_code();
	else if(dim > 1) code_str = generate_wg_broadcast_2D_kernel_code();
	else code_str = generate_wg_broadcast_1D_kernel_code();

	// -----------------------------------------------------------------------------------
	// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
	// -----------------------------------------------------------------------------------
	// Only OpenCL C++ to SPIR-V compilation
	#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
	err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_broadcast");
	RETURN_ON_ERROR(err)
	return err;
	// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
	#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
	err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_broadcast", "-cl-std=CL2.0", false);
	RETURN_ON_ERROR(err)
	#else
	err = create_opencl_kernel(context, &program, &kernel, code_str, "test_wg_broadcast");
	RETURN_ON_ERROR(err)
	#endif

	// Get max flat workgroup size
	err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &flat_wg_size, NULL);
	RETURN_ON_CL_ERROR(err, "clGetKernelWorkGroupInfo")

	// Set local work size
	wg_size[0] = flat_wg_size;
	if(dim > 2)
	{
	if (flat_wg_size >=512)
	{
	wg_size[0] = wg_size[1] = wg_size[2] = 8;
	}
	else if (flat_wg_size >= 64)
	{
	wg_size[0] = wg_size[1] = wg_size[2] = 4;
	}
	else if (flat_wg_size >= 8)
	{
	wg_size[0] = wg_size[1] = wg_size[2] = 2;
	}
	else
	{
	wg_size[0] = wg_size[1] = wg_size[2] = 1;
	}
	}
	else if(dim > 1)
	{
	if (flat_wg_size >= 256)
	{
	wg_size[0] = wg_size[1] = 16;
	}
	else if (flat_wg_size >=64)
	{
	wg_size[0] = wg_size[1] = 8;
	}
	else if (flat_wg_size >= 16)
	{
	wg_size[0] = wg_size[1] = 4;
	}
	else
	{
	wg_size[0] = wg_size[1] = 1;
	}
	}

	// Calculate flat local work size
	flat_wg_size = wg_size[0];
	if(dim > 1) flat_wg_size *= wg_size[1];
	if(dim > 2) flat_wg_size *= wg_size[2];

	// Calculate global work size
	size_t flat_work_size = count;
	// 3D
	if(dim > 2)
	{
	size_t wg_number = static_cast<size_t>(
	std::ceil(static_cast<double>(count / 3) / (wg_size[0] * wg_size[1] * wg_size[2]))
	);
	work_size[0] = wg_number * wg_size[0];
	work_size[1] = wg_number * wg_size[1];
	work_size[2] = wg_number * wg_size[2];
	flat_work_size = work_size[0] * work_size[1] * work_size[2];
	}
	// 2D
	else if(dim > 1)
	{
	size_t wg_number = static_cast<size_t>(
	std::ceil(static_cast<double>(count / 2) / (wg_size[0] * wg_size[1]))
	);
	work_size[0] = wg_number * wg_size[0];
	work_size[1] = wg_number * wg_size[1];
	flat_work_size = work_size[0] * work_size[1];
	}
	// 1D
	else
	{
	size_t wg_number = static_cast<size_t>(
	std::ceil(static_cast<double>(count) / wg_size[0])
	);
	flat_work_size = wg_number * wg_size[0];
	work_size[0] = flat_work_size;
	}

	std::vector<cl_uint> input = generate_input_wg_broadcast(flat_work_size, flat_wg_size);
	std::vector<cl_uint> output = generate_output_wg_broadcast(flat_work_size, flat_wg_size);

	buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
	sizeof(cl_uint) * input.size(), NULL, &err);
	RETURN_ON_CL_ERROR(err, "clCreateBuffer");

	buffers[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
	sizeof(cl_uint) * output.size(), NULL, &err);
	RETURN_ON_CL_ERROR(err, "clCreateBuffer");

	err = clEnqueueWriteBuffer(
	queue, buffers[0], CL_TRUE, 0, sizeof(cl_uint) * input.size(),
	static_cast<void *>(input.data()), 0, NULL, NULL
	);
	RETURN_ON_CL_ERROR(err, "clEnqueueWriteBuffer");

	err = clSetKernelArg(kernel, 0, sizeof(buffers[0]), &buffers[0]);
	err \|= clSetKernelArg(kernel, 1, sizeof(buffers[1]), &buffers[1]);
	RETURN_ON_CL_ERROR(err, "clSetKernelArg");

	err = clEnqueueNDRangeKernel(queue, kernel, dim, NULL, work_size, wg_size, 0, NULL, NULL);
	RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel");

	err = clEnqueueReadBuffer(
	queue, buffers[1], CL_TRUE, 0, sizeof(cl_uint) * output.size(),
	static_cast<void *>(output.data()), 0, NULL, NULL
	);
	RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer");

	int result = CL_SUCCESS;
	// 3D
	if(dim > 2)
	{
	result = verify_wg_broadcast_3D(
	input, output,
	work_size[0], work_size[1], work_size[2],
	wg_size[0], wg_size[1], wg_size[2]
	);
	}
	// 2D
	else if(dim > 1)
	{
	result = verify_wg_broadcast_2D(
	input, output,
	work_size[0], work_size[1],
	wg_size[0], wg_size[1]
	);
	}
	// 1D
	else
	{
	result = verify_wg_broadcast_1D(
	input, output,
	work_size[0],
	wg_size[0]
	);
	}

	RETURN_ON_ERROR_MSG(result, "work_group_broadcast_%luD failed", dim);
	log_info("work_group_broadcast_%luD passed\n", dim);

	clReleaseMemObject(buffers[0]);
	clReleaseMemObject(buffers[1]);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	return err;
	}

	AUTO_TEST_CASE(test_work_group_broadcast)
	(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
	{
	int error = CL_SUCCESS;
	int local_error = CL_SUCCESS;

	local_error = work_group_broadcast(device, context, queue, n_elems, 1);
	CHECK_ERROR(local_error)
	error \|= local_error;

	local_error = work_group_broadcast(device, context, queue, n_elems, 2);
	CHECK_ERROR(local_error)
	error \|= local_error;

	local_error = work_group_broadcast(device, context, queue, n_elems, 3);
	CHECK_ERROR(local_error)
	error \|= local_error;

	if(error != CL_SUCCESS)
	return -1;
	return CL_SUCCESS;
	}

	#endif // TEST_CONFORMANCE_CLCPP_WG_TEST_WG_BROADCAST_HPP