| // |
| // Copyright (c) 2017 The Khronos Group Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| #include "common.h" |
| |
| static char hash_table_kernel[] = |
| "#if 0\n" |
| "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n" |
| "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n" |
| "#endif\n" |
| "typedef struct BinNode {\n" |
| " int value;\n" |
| " atomic_uintptr_t pNext;\n" |
| "} BinNode;\n" |
| |
| "__kernel void build_hash_table(__global uint* input, __global BinNode* pNodes, volatile __global atomic_uint* pNumNodes, uint numBins)\n" |
| "{\n" |
| " __global BinNode *pNew = &pNodes[ atomic_fetch_add_explicit(pNumNodes, 1, memory_order_relaxed, memory_scope_all_svm_devices) ];\n" |
| " uint i = get_global_id(0);\n" |
| " uint b = input[i] % numBins;\n" |
| " pNew->value = input[i];\n" |
| " uintptr_t next = atomic_load_explicit(&(pNodes[b].pNext), memory_order_seq_cst, memory_scope_all_svm_devices);\n" |
| " do\n" |
| " {\n" |
| " atomic_store_explicit(&(pNew->pNext), next, memory_order_seq_cst, memory_scope_all_svm_devices);\n" // always inserting at head of list |
| " } while(!atomic_compare_exchange_strong_explicit(&(pNodes[b].pNext), &next, (uintptr_t)pNew, memory_order_seq_cst, memory_order_relaxed, memory_scope_all_svm_devices));\n" |
| "}\n"; |
| |
| typedef struct BinNode{ |
| cl_uint value; |
| struct BinNode* pNext; |
| } BinNode; |
| |
| void build_hash_table_on_host(cl_context c, cl_uint* input, size_t inputSize, BinNode* pNodes, cl_int volatile *pNumNodes, cl_uint numBins) |
| { |
| for(cl_uint i = 0; i < inputSize; i++) |
| { |
| BinNode *pNew = &pNodes[ AtomicFetchAddExplicit(pNumNodes, 1, memory_order_relaxed) ]; |
| cl_uint b = input[i] % numBins; |
| pNew->value = input[i]; |
| |
| BinNode *next = pNodes[b].pNext; |
| do { |
| pNew->pNext = next; // always inserting at head of list |
| } while(!AtomicCompareExchangeStrongExplicit(&(pNodes[b].pNext), &next, pNew, memory_order_relaxed, memory_order_seq_cst)); |
| } |
| } |
| |
| |
| int launch_kernels_and_verify(clContextWrapper &context, clCommandQueueWrapper* queues, clKernelWrapper &kernel, cl_uint num_devices, cl_uint numBins, size_t num_pixels) |
| { |
| int err = CL_SUCCESS; |
| cl_uint *pInputImage = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_uint) * num_pixels, 0); |
| BinNode *pNodes = (BinNode*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(BinNode) * (num_pixels * (num_devices + 1) + numBins), 0); |
| cl_int *pNumNodes = (cl_int*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(cl_int), 0); |
| |
| *pNumNodes = numBins; // using the first numBins nodes to hold the list heads. |
| for(cl_uint i=0;i<numBins;i++) { |
| pNodes[i].pNext = NULL; |
| }; |
| |
| for(cl_uint i=0; i < num_pixels; i++) pInputImage[i] = i; |
| |
| err |= clSetKernelArgSVMPointer(kernel, 0, pInputImage); |
| err |= clSetKernelArgSVMPointer(kernel, 1, pNodes); |
| err |= clSetKernelArgSVMPointer(kernel, 2, pNumNodes); |
| err |= clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*) &numBins); |
| |
| test_error(err, "clSetKernelArg failed"); |
| |
| cl_event done; |
| // get all the devices going simultaneously, each device (and the host) will insert all the pixels. |
| for(cl_uint d=0; d<num_devices; d++) |
| { |
| err = clEnqueueNDRangeKernel(queues[d], kernel, 1, NULL, &num_pixels, 0, 0, NULL, &done); |
| test_error(err,"clEnqueueNDRangeKernel failed"); |
| } |
| for(cl_uint d=0; d<num_devices; d++) clFlush(queues[d]); |
| |
| // wait until we see some activity from a device (try to run host side simultaneously). |
| while(numBins == AtomicLoadExplicit(pNumNodes, memory_order_relaxed)); |
| |
| build_hash_table_on_host(context, pInputImage, num_pixels, pNodes, pNumNodes, numBins); |
| |
| for(cl_uint d=0; d<num_devices; d++) clFinish(queues[d]); |
| |
| cl_uint num_items = 0; |
| // check correctness of each bin in the hash table. |
| for(cl_uint i = 0; i < numBins; i++) |
| { |
| BinNode *pNode = pNodes[i].pNext; |
| while(pNode) |
| { |
| if((pNode->value % numBins) != i) |
| { |
| log_error("Something went wrong, item is in wrong hash bucket\n"); |
| break; |
| } |
| num_items++; |
| pNode = pNode->pNext; |
| } |
| } |
| |
| clReleaseEvent(done); |
| clSVMFree(context, pInputImage); |
| clSVMFree(context, pNodes); |
| clSVMFree(context, pNumNodes); |
| // each device and the host inserted all of the pixels, check that none are missing. |
| if(num_items != num_pixels * (num_devices + 1) ) |
| { |
| log_error("The hash table is not correct, num items %d, expected num items: %d\n", num_items, num_pixels * (num_devices + 1)); |
| return -1; // test did not pass |
| } |
| return 0; |
| } |
| |
| // This tests for memory consistency across devices and the host. |
| // Each device and the host simultaneously insert values into a single hash table. |
| // Each bin in the hash table is a linked list. Each bin is protected against simultaneous |
| // update using a lock free technique. The correctness of the list is verfied on the host. |
| // This test requires the new OpenCL 2.0 atomic operations that implement the new seq_cst memory ordering. |
| int test_svm_fine_grain_memory_consistency(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements) |
| { |
| clContextWrapper context; |
| clProgramWrapper program; |
| clKernelWrapper kernel; |
| clCommandQueueWrapper queues[MAXQ]; |
| |
| cl_uint num_devices = 0; |
| cl_int err = CL_SUCCESS; |
| std::vector<std::string> required_extensions; |
| required_extensions.push_back("cl_khr_int64_base_atomics"); |
| required_extensions.push_back("cl_khr_int64_extended_atomics"); |
| |
| // Make pragmas visible for 64-bit addresses |
| hash_table_kernel[4] = sizeof(void *) == 8 ? '1' : '0'; |
| |
| char *source[] = { hash_table_kernel }; |
| |
| err = create_cl_objects(deviceID, (const char**)source, &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_ATOMICS, required_extensions); |
| if(err == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing. |
| if(err < 0) return -1; // fail test. |
| |
| kernel = clCreateKernel(program, "build_hash_table", &err); |
| test_error(err, "clCreateKernel failed"); |
| size_t num_pixels = num_elements; |
| |
| int result; |
| cl_uint numBins = 1; // all work groups in all devices and the host code will hammer on this one lock. |
| result = launch_kernels_and_verify(context, queues, kernel, num_devices, numBins, num_pixels); |
| if(result == -1) return result; |
| |
| numBins = 2; // 2 locks within in same cache line will get hit from different devices and host. |
| result = launch_kernels_and_verify(context, queues, kernel, num_devices, numBins, num_pixels); |
| if(result == -1) return result; |
| |
| numBins = 29; // locks span a few cache lines. |
| result = launch_kernels_and_verify(context, queues, kernel, num_devices, numBins, num_pixels); |
| if(result == -1) return result; |
| |
| return result; |
| } |