| // |
| // Copyright (c) 2017 The Khronos Group Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| #include <stdio.h> |
| #include <string.h> |
| #include "harness/testHarness.h" |
| #include "harness/typeWrappers.h" |
| |
| #include <vector> |
| |
| #include "procs.h" |
| #include "utils.h" |
| #include <time.h> |
| |
| |
| #ifdef CL_VERSION_2_0 |
| extern int gWimpyMode; |
| #define BITS_DEPTH 28 |
| |
| static const char* enqueue_flags_wait_kernel_simple[] = |
| { |
| NL, "#define BITS_DEPTH " STRINGIFY_VALUE(BITS_DEPTH) |
| NL, "" |
| NL, "void block_fn(__global int* array, int index, size_t ls, size_t gs, __global int* res)" |
| NL, "{" |
| NL, " int val = 0;" |
| NL, " size_t lid = get_local_id(0);" |
| NL, " size_t tid = get_global_id(0);" |
| NL, "" |
| NL, " if(tid == 0)" |
| NL, " {" |
| NL, " if((index + 1) < BITS_DEPTH)" |
| NL, " {" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(gs, ls), " |
| NL, " ^{" |
| NL, " block_fn(array, index + 1, ls, gs, res);" |
| NL, " });" |
| NL, " }" |
| NL, " }" |
| NL, "" |
| NL, " array[index * gs + tid] = array[(index - 1) * gs + tid] + 1;" |
| NL, "" |
| NL, " if((index + 1) == BITS_DEPTH)" |
| NL, " {" |
| NL, " barrier(CLK_GLOBAL_MEM_FENCE);" |
| NL, "" |
| NL, " if(lid == 0)" |
| NL, " {" |
| NL, " size_t gid = get_group_id(0);" |
| NL, " res[gid] = 1;" |
| NL, "" |
| NL, " for(int j = 0; j < BITS_DEPTH; j++)" |
| NL, " {" |
| NL, " for(int i = 0; i < ls; i++)" |
| NL, " {" |
| NL, " if(array[j * gs + ls * gid + i] != ((ls * gid + i) + j))" |
| NL, " {" |
| NL, " res[gid] = 2;" |
| NL, " break;" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, "}" |
| NL, "" |
| NL, "kernel void enqueue_flags_wait_kernel_simple(__global int* res, __global int* array)" |
| NL, "{" |
| NL, " size_t ls = get_local_size(0);" |
| NL, " size_t gs = get_global_size(0);" |
| NL, " size_t tid = get_global_id(0);" |
| NL, "" |
| NL, " res[tid] = 0;" |
| NL, " array[tid] = tid;" |
| NL, "" |
| NL, " if(tid == 0)" |
| NL, " {" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(gs, ls), " |
| NL, " ^{" |
| NL, " block_fn(array, 1, ls, gs, res);" |
| NL, " });" |
| NL, " }" |
| NL, "}" |
| NL |
| }; |
| |
| static const char* enqueue_flags_wait_kernel_event[] = |
| { |
| NL, "#define BITS_DEPTH " STRINGIFY_VALUE(BITS_DEPTH) |
| NL, "" |
| NL, "void block_fn(__global int* array, int index, size_t ls, size_t gs, __global int* res)" |
| NL, "{" |
| NL, " int val = 0;" |
| NL, " size_t lid = get_local_id(0);" |
| NL, " size_t tid = get_global_id(0);" |
| NL, "" |
| NL, " if(tid == 0)" |
| NL, " {" |
| NL, " if((index + 1) < BITS_DEPTH)" |
| NL, " {" |
| NL, " clk_event_t block_evt;" |
| NL, " clk_event_t user_evt = create_user_event();" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(gs, ls), 1, &user_evt, &block_evt, " |
| NL, " ^{" |
| NL, " block_fn(array, index + 1, ls, gs, res);" |
| NL, " });" |
| NL, " set_user_event_status(user_evt, CL_COMPLETE);" |
| NL, " release_event(user_evt);" |
| NL, " release_event(block_evt);" |
| NL, " }" |
| NL, " }" |
| NL, "" |
| NL, " array[index * gs + tid] = array[(index - 1) * gs + tid] + 1;" |
| NL, "" |
| NL, " if((index + 1) == BITS_DEPTH)" |
| NL, " {" |
| NL, " barrier(CLK_GLOBAL_MEM_FENCE);" |
| NL, "" |
| NL, " if(lid == 0)" |
| NL, " {" |
| NL, " size_t gid = get_group_id(0);" |
| NL, " res[gid] = 1;" |
| NL, "" |
| NL, " for(int j = 0; j < BITS_DEPTH; j++)" |
| NL, " {" |
| NL, " for(int i = 0; i < ls; i++)" |
| NL, " {" |
| NL, " if(array[j * gs + ls * gid + i] != ((ls * gid + i) + j))" |
| NL, " {" |
| NL, " res[gid] = 2;" |
| NL, " break;" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, "}" |
| NL, "" |
| NL, "kernel void enqueue_flags_wait_kernel_event(__global int* res, __global int* array)" |
| NL, "{" |
| NL, " size_t tid = get_global_id(0);" |
| NL, " size_t gs = get_global_size(0);" |
| NL, " size_t ls = get_local_size(0);" |
| NL, "" |
| NL, " res[tid] = 0;" |
| NL, " array[tid] = tid;" |
| NL, "" |
| NL, " if(tid == 0)" |
| NL, " {" |
| NL, " clk_event_t block_evt;" |
| NL, " clk_event_t user_evt = create_user_event();" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(gs, ls), 1, &user_evt, &block_evt, " |
| NL, " ^{" |
| NL, " block_fn(array, 1, ls, gs, res);" |
| NL, " });" |
| NL, " set_user_event_status(user_evt, CL_COMPLETE);" |
| NL, " release_event(user_evt);" |
| NL, " release_event(block_evt);" |
| NL, " }" |
| NL, "}" |
| NL |
| }; |
| |
| static const char* enqueue_flags_wait_kernel_local[] = |
| { |
| NL, "#define BITS_DEPTH " STRINGIFY_VALUE(BITS_DEPTH) |
| NL, "" |
| NL, "void block_fn(__global int* array, int index, size_t ls, size_t gs, __global int* res, __local int* sub_array)" |
| NL, "{" |
| NL, " int val = 0;" |
| NL, " size_t gid = get_group_id(0);" |
| NL, " size_t lid = get_local_id(0);" |
| NL, " size_t tid = get_global_id(0);" |
| NL, "" |
| NL, " sub_array[lid] = array[(index - 1) * gs + tid];" |
| NL, " barrier(CLK_LOCAL_MEM_FENCE);" |
| NL, "" |
| NL, " for(int i = 0; i < ls; i++)" |
| NL, " {" |
| NL, " int id = gid * ls + i;" |
| NL, " val += sub_array[i];" |
| NL, " val -= (tid == id)? 0: (id + index - 1);" |
| NL, " }" |
| NL, "" |
| NL, " if(tid == 0)" |
| NL, " {" |
| NL, " if((index + 1) < BITS_DEPTH)" |
| NL, " {" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(gs, ls), " |
| NL, " ^(__local void* sub_array){" |
| NL, " block_fn(array, index + 1, ls, gs, res, sub_array);" |
| NL, " }, ls * sizeof(int));" |
| NL, " }" |
| NL, " }" |
| NL, "" |
| NL, " array[index * gs + tid] = val + 1;" |
| NL, "" |
| NL, " if((index + 1) == BITS_DEPTH)" |
| NL, " {" |
| NL, " barrier(CLK_GLOBAL_MEM_FENCE);" |
| NL, "" |
| NL, " if(lid == 0)" |
| NL, " {" |
| NL, " res[gid] = 1;" |
| NL, "" |
| NL, " for(int j = 0; j < BITS_DEPTH; j++)" |
| NL, " {" |
| NL, " for(int i = 0; i < ls; i++)" |
| NL, " {" |
| NL, " if(array[j * gs + ls * gid + i] != ((ls * gid + i) + j))" |
| NL, " {" |
| NL, " res[gid] = 2;" |
| NL, " break;" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, "}" |
| NL, "" |
| NL, "kernel void enqueue_flags_wait_kernel_local(__global int* res, __global int* array)" |
| NL, "{" |
| NL, " size_t ls = get_local_size(0);" |
| NL, " size_t gs = get_global_size(0);" |
| NL, " size_t tid = get_global_id(0);" |
| NL, "" |
| NL, " res[tid] = 0;" |
| NL, " array[tid] = tid;" |
| NL, "" |
| NL, " if(tid == 0)" |
| NL, " {" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(gs, ls), " |
| NL, " ^(__local void* sub_array){" |
| NL, " block_fn(array, 1, ls, gs, res, sub_array);" |
| NL, " }, ls * sizeof(int));" |
| NL, " }" |
| NL, "}" |
| NL |
| }; |
| |
| static const char* enqueue_flags_wait_kernel_event_local[] = |
| { |
| NL, "#define BITS_DEPTH " STRINGIFY_VALUE(BITS_DEPTH) |
| NL, "" |
| NL, "void block_fn(__global int* array, int index, size_t ls, size_t gs, __global int* res, __local int* sub_array)" |
| NL, "{" |
| NL, " int val = 0;" |
| NL, " size_t gid = get_group_id(0);" |
| NL, " size_t lid = get_local_id(0);" |
| NL, " size_t tid = get_global_id(0);" |
| NL, "" |
| NL, " sub_array[lid] = array[(index - 1) * gs + tid];" |
| NL, " barrier(CLK_LOCAL_MEM_FENCE);" |
| NL, "" |
| NL, " for(int i = 0; i < ls; i++)" |
| NL, " {" |
| NL, " int id = gid * ls + i;" |
| NL, " val += sub_array[i];" |
| NL, " val -= (tid == id)? 0: (id + index - 1);" |
| NL, " }" |
| NL, "" |
| NL, " if(tid == 0)" |
| NL, " {" |
| NL, " if((index + 1) < BITS_DEPTH)" |
| NL, " {" |
| NL, " clk_event_t block_evt;" |
| NL, " clk_event_t user_evt = create_user_event();" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(gs, ls), 1, &user_evt, &block_evt, " |
| NL, " ^(__local void* sub_array){" |
| NL, " block_fn(array, index + 1, ls, gs, res, sub_array);" |
| NL, " }, ls * sizeof(int));" |
| NL, " set_user_event_status(user_evt, CL_COMPLETE);" |
| NL, " release_event(user_evt);" |
| NL, " release_event(block_evt);" |
| NL, " }" |
| NL, " }" |
| NL, "" |
| NL, " array[index * gs + tid] = val + 1;" |
| NL, "" |
| NL, " if((index + 1) == BITS_DEPTH)" |
| NL, " {" |
| NL, " barrier(CLK_GLOBAL_MEM_FENCE);" |
| NL, "" |
| NL, " if(lid == 0)" |
| NL, " {" |
| NL, " res[gid] = 1;" |
| NL, "" |
| NL, " for(int j = 0; j < BITS_DEPTH; j++)" |
| NL, " {" |
| NL, " for(int i = 0; i < ls; i++)" |
| NL, " {" |
| NL, " if(array[j * gs + ls * gid + i] != ((ls * gid + i) + j))" |
| NL, " {" |
| NL, " res[gid] = 2;" |
| NL, " break;" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, "}" |
| NL, "" |
| NL, "kernel void enqueue_flags_wait_kernel_event_local(__global int* res, __global int* array)" |
| NL, "{" |
| NL, " size_t ls = get_local_size(0);" |
| NL, " size_t gs = get_global_size(0);" |
| NL, " size_t tid = get_global_id(0);" |
| NL, "" |
| NL, " res[tid] = 0;" |
| NL, " array[tid] = tid;" |
| NL, "" |
| NL, " if(tid == 0)" |
| NL, " {" |
| NL, " clk_event_t block_evt;" |
| NL, " clk_event_t user_evt = create_user_event();" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(gs, ls), 1, &user_evt, &block_evt, " |
| NL, " ^(__local void* sub_array){" |
| NL, " block_fn(array, 1, ls, gs, res, sub_array);" |
| NL, " }, ls * sizeof(int));" |
| NL, " set_user_event_status(user_evt, CL_COMPLETE);" |
| NL, " release_event(user_evt);" |
| NL, " release_event(block_evt);" |
| NL, " }" |
| NL, "}" |
| NL |
| }; |
| |
| static const char* enqueue_flags_wait_work_group_simple[] = |
| { |
| NL, "#define BITS_DEPTH " STRINGIFY_VALUE(BITS_DEPTH) |
| NL, "" |
| NL, "void block_fn(__global int* array, int index, size_t ls, __global int* res, int group_id)" |
| NL, "{" |
| NL, " size_t tid = get_global_id(0);" |
| NL, " size_t lid = get_local_id(0);" |
| NL, " size_t gs = get_global_size(0);" |
| NL, " size_t gid = get_group_id(0);" |
| NL, " " |
| NL, " if(gid == group_id)" |
| NL, " {" |
| NL, " if((index + 1) < BITS_DEPTH && lid == 0)" |
| NL, " {" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP, ndrange_1D(gs, ls), " |
| NL, " ^{" |
| NL, " block_fn(array, index + 1, ls, res, gid);" |
| NL, " });" |
| NL, " }" |
| NL, " " |
| NL, " array[index * gs + tid] = array[(index - 1) * gs + tid] + 1;" |
| NL, " }" |
| NL, "" |
| NL, " if((index + 1) == BITS_DEPTH)" |
| NL, " {" |
| NL, " barrier(CLK_GLOBAL_MEM_FENCE);" |
| NL, "" |
| NL, " if(lid == 0 && gid == group_id)" |
| NL, " {" |
| NL, " res[gid] = 1;" |
| NL, "" |
| NL, " for(int j = 0; j < BITS_DEPTH; j++)" |
| NL, " {" |
| NL, " for(int i = 0; i < ls; i++)" |
| NL, " {" |
| NL, " if(array[j * gs + ls * gid + i] != ((ls * gid + i) + j))" |
| NL, " {" |
| NL, " res[gid] = 2;" |
| NL, " break;" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, "}" |
| NL, "" |
| NL, "kernel void enqueue_flags_wait_work_group_simple(__global int* res, __global int* array)" |
| NL, "{" |
| NL, " size_t ls = get_local_size(0);" |
| NL, " size_t gs = get_global_size(0);" |
| NL, " size_t tid = get_global_id(0);" |
| NL, " size_t gid = get_group_id(0);" |
| NL, " size_t lid = get_local_id(0);" |
| NL, "" |
| NL, " res[tid] = 0;" |
| NL, " array[tid] = tid;" |
| NL, "" |
| NL, " if(lid == 0)" |
| NL, " {" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP, ndrange_1D(gs, ls), " |
| NL, " ^{" |
| NL, " block_fn(array, 1, ls, res, gid);" |
| NL, " });" |
| NL, " }" |
| NL, "}" |
| NL |
| }; |
| |
| static const char* enqueue_flags_wait_work_group_event[] = |
| { |
| NL, "#define BITS_DEPTH " STRINGIFY_VALUE(BITS_DEPTH) |
| NL, "" |
| NL, "void block_fn(__global int* array, int index, size_t ls, __global int* res, int group_id)" |
| NL, "{" |
| NL, " size_t tid = get_global_id(0);" |
| NL, " size_t lid = get_local_id(0);" |
| NL, " size_t gs = get_global_size(0);" |
| NL, " size_t gid = get_group_id(0);" |
| NL, " " |
| NL, " if(gid == group_id)" |
| NL, " {" |
| NL, " if((index + 1) < BITS_DEPTH && lid == 0)" |
| NL, " {" |
| NL, " clk_event_t block_evt;" |
| NL, " clk_event_t user_evt = create_user_event();" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP, ndrange_1D(gs, ls), 1, &user_evt, &block_evt, " |
| NL, " ^{" |
| NL, " block_fn(array, index + 1, ls, res, gid);" |
| NL, " });" |
| NL, " set_user_event_status(user_evt, CL_COMPLETE);" |
| NL, " release_event(user_evt);" |
| NL, " release_event(block_evt);" |
| NL, " }" |
| NL, " " |
| NL, " array[index * gs + tid] = array[(index - 1) * gs + tid] + 1;" |
| NL, " }" |
| NL, "" |
| NL, "" |
| NL, " if((index + 1) == BITS_DEPTH)" |
| NL, " {" |
| NL, " barrier(CLK_GLOBAL_MEM_FENCE);" |
| NL, "" |
| NL, " if(lid == 0 && gid == group_id)" |
| NL, " {" |
| NL, " res[gid] = 1;" |
| NL, "" |
| NL, " for(int j = 0; j < BITS_DEPTH; j++)" |
| NL, " {" |
| NL, " for(int i = 0; i < ls; i++)" |
| NL, " {" |
| NL, " if(array[j * gs + ls * gid + i] != ((ls * gid + i) + j))" |
| NL, " {" |
| NL, " res[gid] = 2;" |
| NL, " break;" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, "}" |
| NL, "" |
| NL, "kernel void enqueue_flags_wait_work_group_event(__global int* res, __global int* array)" |
| NL, "{" |
| NL, " size_t ls = get_local_size(0);" |
| NL, " size_t gs = get_global_size(0);" |
| NL, " size_t tid = get_global_id(0);" |
| NL, " size_t gid = get_group_id(0);" |
| NL, " size_t lid = get_local_id(0);" |
| NL, "" |
| NL, " res[tid] = 0;" |
| NL, " array[tid] = tid;" |
| NL, "" |
| NL, " if(lid == 0)" |
| NL, " {" |
| NL, " clk_event_t block_evt;" |
| NL, " clk_event_t user_evt = create_user_event();" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP, ndrange_1D(gs, ls), 1, &user_evt, &block_evt, " |
| NL, " ^{" |
| NL, " block_fn(array, 1, ls, res, gid);" |
| NL, " });" |
| NL, " set_user_event_status(user_evt, CL_COMPLETE);" |
| NL, " release_event(user_evt);" |
| NL, " release_event(block_evt);" |
| NL, " }" |
| NL, "}" |
| NL |
| }; |
| |
| static const char* enqueue_flags_wait_work_group_local[] = |
| { |
| NL, "#define BITS_DEPTH " STRINGIFY_VALUE(BITS_DEPTH) |
| NL, "" |
| NL, "void block_fn(__global int* array, int index, size_t ls, __global int* res, __local int* sub_array, int group_id)" |
| NL, "{" |
| NL, " int val = 0;" |
| NL, " size_t gid = get_group_id(0);" |
| NL, " size_t lid = get_local_id(0);" |
| NL, " size_t tid = get_global_id(0);" |
| NL, " size_t gs = get_global_size(0);" |
| NL, "" |
| NL, " sub_array[lid] = array[(index - 1) * gs + tid];" |
| NL, " barrier(CLK_LOCAL_MEM_FENCE);" |
| NL, "" |
| NL, " for(int i = 0; i < ls; i++)" |
| NL, " {" |
| NL, " int id = gid * ls + i;" |
| NL, " val += sub_array[i];" |
| NL, " val -= (tid == id)? 0: (id + index - 1);" |
| NL, " }" |
| NL, " " |
| NL, " if(gid == group_id)" |
| NL, " {" |
| NL, " if((index + 1) < BITS_DEPTH && lid == 0)" |
| NL, " {" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP, ndrange_1D(gs, ls), " |
| NL, " ^(__local void* sub_array){" |
| NL, " block_fn(array, index + 1, ls, res, sub_array, gid);" |
| NL, " }, ls * sizeof(int));" |
| NL, " }" |
| NL, " " |
| NL, " array[index * gs + tid] = val + 1;" |
| NL, " }" |
| NL, "" |
| NL, "" |
| NL, " if((index + 1) == BITS_DEPTH)" |
| NL, " {" |
| NL, " barrier(CLK_GLOBAL_MEM_FENCE);" |
| NL, "" |
| NL, " if(lid == 0 && gid == group_id)" |
| NL, " {" |
| NL, " res[gid] = 1;" |
| NL, "" |
| NL, " for(int j = 0; j < BITS_DEPTH; j++)" |
| NL, " {" |
| NL, " for(int i = 0; i < ls; i++)" |
| NL, " {" |
| NL, " if(array[j * gs + ls * gid + i] != ((ls * gid + i) + j))" |
| NL, " {" |
| NL, " res[gid] = 2;" |
| NL, " break;" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, "}" |
| NL, "" |
| NL, "kernel void enqueue_flags_wait_work_group_local(__global int* res, __global int* array)" |
| NL, "{" |
| NL, " size_t ls = get_local_size(0);" |
| NL, " size_t gs = get_global_size(0);" |
| NL, " size_t tid = get_global_id(0);" |
| NL, " size_t gid = get_group_id(0);" |
| NL, " size_t lid = get_local_id(0);" |
| NL, "" |
| NL, " res[tid] = 0;" |
| NL, " array[tid] = tid;" |
| NL, "" |
| NL, " if(lid == 0)" |
| NL, " {" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP, ndrange_1D(gs, ls), " |
| NL, " ^(__local void* sub_array){" |
| NL, " block_fn(array, 1, ls, res, sub_array, gid);" |
| NL, " }, ls * sizeof(int));" |
| NL, " }" |
| NL, "}" |
| NL |
| }; |
| |
| static const char* enqueue_flags_wait_work_group_event_local[] = |
| { |
| NL, "#define BITS_DEPTH " STRINGIFY_VALUE(BITS_DEPTH) |
| NL, "" |
| NL, "void block_fn(__global int* array, int index, size_t ls, __global int* res, __local int* sub_array, int group_id)" |
| NL, "{" |
| NL, " int val = 0;" |
| NL, " size_t gid = get_group_id(0);" |
| NL, " size_t lid = get_local_id(0);" |
| NL, " size_t tid = get_global_id(0);" |
| NL, " size_t gs = get_global_size(0);" |
| NL, "" |
| NL, " sub_array[lid] = array[(index - 1) * gs + tid];" |
| NL, " barrier(CLK_LOCAL_MEM_FENCE);" |
| NL, "" |
| NL, " for(int i = 0; i < ls; i++)" |
| NL, " {" |
| NL, " int id = gid * ls + i;" |
| NL, " val += sub_array[i];" |
| NL, " val -= (tid == id)? 0: (id + index - 1);" |
| NL, " }" |
| NL, "" |
| NL, " if(gid == group_id)" |
| NL, " {" |
| NL, " if((index + 1) < BITS_DEPTH && lid == 0)" |
| NL, " {" |
| NL, " clk_event_t block_evt;" |
| NL, " clk_event_t user_evt = create_user_event();" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP, ndrange_1D(gs, ls), 1, &user_evt, &block_evt, " |
| NL, " ^(__local void* sub_array){" |
| NL, " block_fn(array, index + 1, ls, res, sub_array, gid);" |
| NL, " }, ls * sizeof(int));" |
| NL, " set_user_event_status(user_evt, CL_COMPLETE);" |
| NL, " release_event(user_evt);" |
| NL, " release_event(block_evt);" |
| NL, " }" |
| NL, " " |
| NL, " array[index * gs + tid] = val + 1;" |
| NL, " }" |
| NL, "" |
| NL, " if((index + 1) == BITS_DEPTH)" |
| NL, " {" |
| NL, " barrier(CLK_GLOBAL_MEM_FENCE);" |
| NL, "" |
| NL, " if(lid == 0 && gid == group_id)" |
| NL, " {" |
| NL, " res[gid] = 1;" |
| NL, "" |
| NL, " for(int j = 0; j < BITS_DEPTH; j++)" |
| NL, " {" |
| NL, " for(int i = 0; i < ls; i++)" |
| NL, " {" |
| NL, " if(array[j * gs + ls * gid + i] != ((ls * gid + i) + j))" |
| NL, " {" |
| NL, " res[gid] = 2;" |
| NL, " break;" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, " }" |
| NL, "}" |
| NL, "" |
| NL, "kernel void enqueue_flags_wait_work_group_event_local(__global int* res, __global int* array)" |
| NL, "{" |
| NL, " size_t ls = get_local_size(0);" |
| NL, " size_t gs = get_global_size(0);" |
| NL, " size_t tid = get_global_id(0);" |
| NL, " size_t gid = get_group_id(0);" |
| NL, " size_t lid = get_local_id(0);" |
| NL, "" |
| NL, " res[tid] = 0;" |
| NL, " array[tid] = tid;" |
| NL, "" |
| NL, " if(lid == 0)" |
| NL, " {" |
| NL, " clk_event_t block_evt;" |
| NL, " clk_event_t user_evt = create_user_event();" |
| NL, " enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP, ndrange_1D(gs, ls), 1, &user_evt, &block_evt, " |
| NL, " ^(__local void* sub_array){" |
| NL, " block_fn(array, 1, ls, res, sub_array, gid);" |
| NL, " }, ls * sizeof(int));" |
| NL, " set_user_event_status(user_evt, CL_COMPLETE);" |
| NL, " release_event(user_evt);" |
| NL, " release_event(block_evt);" |
| NL, " }" |
| NL, "}" |
| NL |
| }; |
| |
| static const kernel_src sources_enqueue_block_flags[] = |
| { |
| KERNEL(enqueue_flags_wait_kernel_simple), |
| KERNEL(enqueue_flags_wait_kernel_event), |
| KERNEL(enqueue_flags_wait_kernel_local), |
| KERNEL(enqueue_flags_wait_kernel_event_local), |
| KERNEL(enqueue_flags_wait_work_group_simple), |
| KERNEL(enqueue_flags_wait_work_group_event), |
| KERNEL(enqueue_flags_wait_work_group_local), |
| KERNEL(enqueue_flags_wait_work_group_event_local) |
| }; |
| static const size_t num_enqueue_block_flags = arr_size(sources_enqueue_block_flags); |
| |
| |
| int test_enqueue_flags(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) |
| { |
| cl_uint i; |
| cl_int err_ret, res = 0; |
| clCommandQueueWrapper dev_queue; |
| cl_int kernel_results[MAX_GWS] = { -1 }; |
| int buff[MAX_GWS * BITS_DEPTH] = { 0 }; |
| |
| size_t ret_len; |
| size_t max_local_size = 1; |
| cl_uint maxQueueSize = 0; |
| err_ret = clGetDeviceInfo(device, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, sizeof(maxQueueSize), &maxQueueSize, 0); |
| test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE) failed"); |
| |
| err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, &ret_len); |
| test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed"); |
| |
| cl_queue_properties queue_prop_def[] = |
| { |
| CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|CL_QUEUE_ON_DEVICE|CL_QUEUE_ON_DEVICE_DEFAULT, |
| CL_QUEUE_SIZE, maxQueueSize, |
| 0 |
| }; |
| |
| dev_queue = clCreateCommandQueueWithProperties(context, device, queue_prop_def, &err_ret); |
| test_error(err_ret, "clCreateCommandQueueWithProperties(CL_QUEUE_DEVICE|CL_QUEUE_DEFAULT) failed"); |
| |
| size_t global_size = MAX_GWS; |
| size_t local_size = (max_local_size > global_size/16) ? global_size/16 : max_local_size; |
| if(gWimpyMode) |
| { |
| global_size = 4; |
| local_size = 2; |
| } |
| |
| size_t failCnt = 0; |
| for(i = 0; i < num_enqueue_block_flags; ++i) |
| { |
| if (!gKernelName.empty() && gKernelName != sources_enqueue_block_flags[i].kernel_name) |
| continue; |
| |
| log_info("Running '%s' kernel (%d of %d) ...\n", sources_enqueue_block_flags[i].kernel_name, i + 1, num_enqueue_block_flags); |
| |
| clMemWrapper mem = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, global_size * BITS_DEPTH * sizeof(cl_int), buff, &err_ret); |
| test_error(err_ret, "clCreateBuffer() failed"); |
| |
| kernel_arg args[] = |
| { |
| { sizeof(cl_mem), &mem } |
| }; |
| |
| err_ret = run_n_kernel_args(context, queue, sources_enqueue_block_flags[i].lines, sources_enqueue_block_flags[i].num_lines, sources_enqueue_block_flags[i].kernel_name, local_size, global_size, kernel_results, sizeof(kernel_results), arr_size(args), args); |
| if(check_error(err_ret, "'%s' kernel execution failed", sources_enqueue_block_flags[i].kernel_name)) { ++failCnt; res = -1; } |
| else |
| { |
| int r = 0; |
| for (int j=0; j<global_size; j++) |
| { |
| if (kernel_results[j] != 1 && j < (global_size / local_size) && check_error(-1, "'%s' kernel result[idx: %d] validation failed (test) %d != (expected) 1", sources_enqueue_block_flags[i].kernel_name, j, kernel_results[j])) |
| { |
| r = -1; |
| break; |
| } |
| else if (kernel_results[j] != 0 && j >= (global_size / local_size) && check_error(-1, "'%s' kernel result[idx: %d] validation failed (test) %d != (expected) 0", sources_enqueue_block_flags[i].kernel_name, j, kernel_results[j])) |
| { |
| r = -1; |
| break; |
| } |
| } |
| if(r == 0) log_info("'%s' kernel is OK.\n", sources_enqueue_block_flags[i].kernel_name); |
| else res = -1; |
| } |
| } |
| |
| if (failCnt > 0) |
| { |
| log_error("ERROR: %d of %d kernels failed.\n", failCnt, num_enqueue_block_flags); |
| } |
| |
| return res; |
| } |
| |
| |
| |
| #endif |