test_conformance/atomics/test_indexed_cases.cpp - external/github.com/KhronosGroup/OpenCL-CTS - Git at Google

 //
 // Copyright (c) 2017 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 #include "testBase.h"
 #include "harness/conversions.h"

 const char * atomic_index_source =
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "// Counter keeps track of which index in counts we are using.\n"
 "// We get that value, increment it, and then set that index in counts to our thread ID.\n"
 "// At the end of this we should have all thread IDs in some random location in counts\n"
 "// exactly once. If atom_add failed then we will write over various thread IDs and we\n"
 "// will be missing some.\n"
 "\n"
 "__kernel void add_index_test(__global int *counter, __global int *counts) {\n"
 "    int tid = get_global_id(0);\n"
 "    \n"
 "    int counter_to_use = atom_add(counter, 1);\n"
 "    counts[counter_to_use] = tid;\n"
 "}";

 int test_atomic_add_index(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     clProgramWrapper program;
     clKernelWrapper kernel;
     clMemWrapper counter, counters;
     size_t numGlobalThreads, numLocalThreads;
     int fail = 0, succeed = 0, err;

   /* Check if atomics are supported. */
   if (!is_extension_available(deviceID, "cl_khr_global_int32_base_atomics")) {
     log_info("Base atomics not supported (cl_khr_global_int32_base_atomics). Skipping test.\n");
     return 0;
   }

     //===== add_index test
     // The index test replicates what particles does.
     // It uses one memory location to keep track of the current index and then each thread
     // does an atomic add to it to get its new location. The threads then write to their
     // assigned location. At the end we check to make sure that each thread's ID shows up
     // exactly once in the output.

     numGlobalThreads = 2048;

     if( create_single_kernel_helper( context, &program, &kernel, 1, &atomic_index_source, "add_index_test" ) )
         return -1;

     if( get_max_common_work_group_size( context, kernel, numGlobalThreads, &numLocalThreads ) )
         return -1;

     log_info("Execute global_threads:%d local_threads:%d\n",
              (int)numGlobalThreads, (int)numLocalThreads);

     // Create the counter that will keep track of where each thread writes.
     counter = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int) * 1,
                              NULL, NULL);
     // Create the counters that will hold the results of each thread writing
     // its ID into a (hopefully) unique location.
     counters = clCreateBuffer(context, CL_MEM_READ_WRITE,
                               sizeof(cl_int) * numGlobalThreads, NULL, NULL);

     // Reset all those locations to -1 to indciate they have not been used.
     cl_int *values = (cl_int*) malloc(sizeof(cl_int)*numGlobalThreads);
     if (values == NULL) {
         log_error("add_index_test FAILED to allocate memory for initial values.\n");
         fail = 1; succeed = -1;
     } else {
         memset(values, -1, numLocalThreads);
         unsigned int i=0;
         for (i=0; i<numGlobalThreads; i++)
             values[i] = -1;
         int init=0;
         err = clEnqueueWriteBuffer(queue, counters, true, 0, numGlobalThreads*sizeof(cl_int), values, 0, NULL, NULL);
         err |= clEnqueueWriteBuffer(queue, counter, true, 0,1*sizeof(cl_int), &init, 0, NULL, NULL);
         if (err) {
             log_error("add_index_test FAILED to write initial values to arrays: %d\n", err);
             fail=1; succeed=-1;
         } else {
             err = clSetKernelArg(kernel, 0, sizeof(counter), &counter);
             err |= clSetKernelArg(kernel, 1, sizeof(counters), &counters);
             if (err) {
                 log_error("add_index_test FAILED to set kernel arguments: %d\n", err);
                 fail=1; succeed=-1;
             } else {
                 err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, &numGlobalThreads, &numLocalThreads, 0, NULL, NULL );
                 if (err) {
                     log_error("add_index_test FAILED to execute kernel: %d\n", err);
                     fail=1; succeed=-1;
                 } else {
                     err = clEnqueueReadBuffer( queue, counters, true, 0, sizeof(cl_int)*numGlobalThreads, values, 0, NULL, NULL );
                     if (err) {
                         log_error("add_index_test FAILED to read back results: %d\n", err);
                         fail = 1; succeed=-1;
                     } else {
                         unsigned int looking_for, index;
                         for (looking_for=0; looking_for<numGlobalThreads; looking_for++) {
                             int instances_found=0;
                             for (index=0; index<numGlobalThreads; index++) {
                                 if (values[index]==(int)looking_for)
                                     instances_found++;
                             }
                             if (instances_found != 1) {
                                 log_error("add_index_test FAILED: wrong number of instances (%d!=1) for counter %d.\n", instances_found, looking_for);
                                 fail = 1; succeed=-1;
                             }
                         }
                     }
                 }
             }
         }
         if (!fail) {
             log_info("add_index_test passed. Each thread used exactly one index.\n");
         }
         free(values);
     }
     return fail;
 }

 const char *add_index_bin_kernel[] = {
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "// This test assigns a bunch of values to bins and then tries to put them in the bins in parallel\n"
 "// using an atomic add to keep track of the current location to write into in each bin.\n"
 "// This is the same as the memory update for the particles demo.\n"
 "\n"
 "__kernel void add_index_bin_test(__global int *bin_counters, __global int *bins, __global int *bin_assignments, int max_counts_per_bin) {\n"
 "    int tid = get_global_id(0);\n"
 "\n"
 "    int location = bin_assignments[tid];\n"
 "    int counter = atom_add(&bin_counters[location], 1);\n"
 "    bins[location*max_counts_per_bin + counter] = tid;\n"
 "}" };

 // This test assigns a bunch of values to bins and then tries to put them in the bins in parallel
 // using an atomic add to keep track of the current location to write into in each bin.
 // This is the same as the memory update for the particles demo.
 int add_index_bin_test(size_t *global_threads, cl_command_queue queue, cl_context context, MTdata d)
 {
     int number_of_items = (int)global_threads[0];
     size_t local_threads[1];
     int divisor = 12;
     int number_of_bins = number_of_items/divisor;
     int max_counts_per_bin = divisor*2;

     int fail = 0;
     int succeed = 0;
     int err;

     clProgramWrapper program;
     clKernelWrapper kernel;

     //  log_info("add_index_bin_test: %d items, into %d bins, with a max of %d items per bin (bins is %d long).\n",
     //           number_of_items, number_of_bins, max_counts_per_bin, number_of_bins*max_counts_per_bin);

     //===== add_index_bin test
     // The index test replicates what particles does.
     err = create_single_kernel_helper(context, &program, &kernel, 1, add_index_bin_kernel, "add_index_bin_test" );
     test_error( err, "Unable to create testing kernel" );

     if( get_max_common_work_group_size( context, kernel, global_threads[0], &local_threads[0] ) )
         return -1;

     log_info("Execute global_threads:%d local_threads:%d\n",
              (int)global_threads[0], (int)local_threads[0]);

     // Allocate our storage
     cl_mem bin_counters =
         clCreateBuffer(context, CL_MEM_READ_WRITE,
                        sizeof(cl_int) * number_of_bins, NULL, NULL);
     cl_mem bins = clCreateBuffer(
         context, CL_MEM_READ_WRITE,
         sizeof(cl_int) * number_of_bins * max_counts_per_bin, NULL, NULL);
     cl_mem bin_assignments =
         clCreateBuffer(context, CL_MEM_READ_ONLY,
                        sizeof(cl_int) * number_of_items, NULL, NULL);

     if (bin_counters == NULL) {
         log_error("add_index_bin_test FAILED to allocate bin_counters.\n");
         return -1;
     }
     if (bins == NULL) {
         log_error("add_index_bin_test FAILED to allocate bins.\n");
         return -1;
     }
     if (bin_assignments == NULL) {
         log_error("add_index_bin_test FAILED to allocate bin_assignments.\n");
         return -1;
     }

     // Initialize our storage
     cl_int *l_bin_counts = (cl_int*)malloc(sizeof(cl_int)*number_of_bins);
     if (!l_bin_counts) {
         log_error("add_index_bin_test FAILED to allocate initial values for bin_counters.\n");
         return -1;
     }
     int i;
     for (i=0; i<number_of_bins; i++)
         l_bin_counts[i] = 0;
     err = clEnqueueWriteBuffer(queue, bin_counters, true, 0, sizeof(cl_int)*number_of_bins, l_bin_counts, 0, NULL, NULL);
     if (err) {
         log_error("add_index_bin_test FAILED to set initial values for bin_counters: %d\n", err);
         return -1;
     }

     cl_int *values = (cl_int*)malloc(sizeof(cl_int)*number_of_bins*max_counts_per_bin);
     if (!values) {
         log_error("add_index_bin_test FAILED to allocate initial values for bins.\n");
         return -1;
     }
     for (i=0; i<number_of_bins*max_counts_per_bin; i++)
         values[i] = -1;
     err = clEnqueueWriteBuffer(queue, bins, true, 0, sizeof(cl_int)*number_of_bins*max_counts_per_bin, values, 0, NULL, NULL);
     if (err) {
         log_error("add_index_bin_test FAILED to set initial values for bins: %d\n", err);
         return -1;
     }
     free(values);

     cl_int *l_bin_assignments = (cl_int*)malloc(sizeof(cl_int)*number_of_items);
     if (!l_bin_assignments) {
         log_error("add_index_bin_test FAILED to allocate initial values for l_bin_assignments.\n");
         return -1;
     }
     for (i=0; i<number_of_items; i++) {
         int bin = random_in_range(0, number_of_bins-1, d);
         while (l_bin_counts[bin] >= max_counts_per_bin) {
             bin = random_in_range(0, number_of_bins-1, d);
         }
         if (bin >= number_of_bins)
             log_error("add_index_bin_test internal error generating bin assignments: bin %d >= number_of_bins %d.\n", bin, number_of_bins);
         if (l_bin_counts[bin]+1 > max_counts_per_bin)
             log_error("add_index_bin_test internal error generating bin assignments: bin %d has more entries (%d) than max_counts_per_bin (%d).\n", bin, l_bin_counts[bin], max_counts_per_bin);
         l_bin_counts[bin]++;
         l_bin_assignments[i] = bin;
         //     log_info("item %d assigned to bin %d (%d items)\n", i, bin, l_bin_counts[bin]);
     }
     err = clEnqueueWriteBuffer(queue, bin_assignments, true, 0, sizeof(cl_int)*number_of_items, l_bin_assignments, 0, NULL, NULL);
     if (err) {
         log_error("add_index_bin_test FAILED to set initial values for bin_assignments: %d\n", err);
         return -1;
     }
     // Setup the kernel
     err = clSetKernelArg(kernel, 0, sizeof(bin_counters), &bin_counters);
     err |= clSetKernelArg(kernel, 1, sizeof(bins), &bins);
     err |= clSetKernelArg(kernel, 2, sizeof(bin_assignments), &bin_assignments);
     err |= clSetKernelArg(kernel, 3, sizeof(max_counts_per_bin), &max_counts_per_bin);
     if (err) {
         log_error("add_index_bin_test FAILED to set kernel arguments: %d\n", err);
         fail=1; succeed=-1;
         return -1;
     }

     err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL );
     if (err) {
         log_error("add_index_bin_test FAILED to execute kernel: %d\n", err);
         fail=1; succeed=-1;
     }

     cl_int *final_bin_assignments = (cl_int*)malloc(sizeof(cl_int)*number_of_bins*max_counts_per_bin);
     if (!final_bin_assignments) {
         log_error("add_index_bin_test FAILED to allocate initial values for final_bin_assignments.\n");
         return -1;
     }
     err = clEnqueueReadBuffer( queue, bins, true, 0, sizeof(cl_int)*number_of_bins*max_counts_per_bin, final_bin_assignments, 0, NULL, NULL );
     if (err) {
         log_error("add_index_bin_test FAILED to read back bins: %d\n", err);
         fail = 1; succeed=-1;
     }

     cl_int *final_bin_counts = (cl_int*)malloc(sizeof(cl_int)*number_of_bins);
     if (!final_bin_counts) {
         log_error("add_index_bin_test FAILED to allocate initial values for final_bin_counts.\n");
         return -1;
     }
     err = clEnqueueReadBuffer( queue, bin_counters, true, 0, sizeof(cl_int)*number_of_bins, final_bin_counts, 0, NULL, NULL );
     if (err) {
         log_error("add_index_bin_test FAILED to read back bin_counters: %d\n", err);
         fail = 1; succeed=-1;
     }

     // Verification.
     int errors=0;
     int current_bin;
     int search;
     //  Print out all the contents of the bins.
     //  for (current_bin=0; current_bin<number_of_bins; current_bin++)
     //        for (search=0; search<max_counts_per_bin; search++)
     //      log_info("[bin %d, entry %d] = %d\n", current_bin, search, final_bin_assignments[current_bin*max_counts_per_bin+search]);

     // First verify that there are the correct number in each bin.
     for (current_bin=0; current_bin<number_of_bins; current_bin++) {
         int expected_number = l_bin_counts[current_bin];
         int actual_number = final_bin_counts[current_bin];
         if (expected_number != actual_number) {
             log_error("add_index_bin_test FAILED: bin %d reported %d entries when %d were expected.\n", current_bin, actual_number, expected_number);
             errors++;
         }
         for (search=0; search<expected_number; search++) {
             if (final_bin_assignments[current_bin*max_counts_per_bin+search] == -1) {
                 log_error("add_index_bin_test FAILED: bin %d had no entry at position %d when it should have had %d entries.\n", current_bin, search, expected_number);
                 errors++;
             }
         }
         for (search=expected_number; search<max_counts_per_bin; search++) {
             if (final_bin_assignments[current_bin*max_counts_per_bin+search] != -1) {
                 log_error("add_index_bin_test FAILED: bin %d had an extra entry at position %d when it should have had only %d entries.\n", current_bin, search, expected_number);
                 errors++;
             }
         }
     }
     // Now verify that the correct ones are in each bin
     int index;
     for (index=0; index<number_of_items; index++) {
         int expected_bin = l_bin_assignments[index];
         int found_it = 0;
         for (search=0; search<l_bin_counts[expected_bin]; search++) {
             if (final_bin_assignments[expected_bin*max_counts_per_bin+search] == index) {
                 found_it = 1;
             }
         }
         if (found_it == 0) {
             log_error("add_index_bin_test FAILED: did not find item %d in bin %d.\n", index, expected_bin);
             errors++;
         }
     }
     free(l_bin_counts);
     free(l_bin_assignments);
     free(final_bin_assignments);
     free(final_bin_counts);
     clReleaseMemObject(bin_counters);
     clReleaseMemObject(bins);
     clReleaseMemObject(bin_assignments);
     if (errors == 0) {
         log_info("add_index_bin_test passed. Each item was put in the correct bin in parallel.\n");
         return 0;
     } else {
         log_error("add_index_bin_test FAILED: %d errors.\n", errors);
         return -1;
     }
 }

 int test_atomic_add_index_bin(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     //===== add_index_bin test
     size_t numGlobalThreads = 2048;
     int iteration=0;
     int err, failed = 0;
     MTdata d = init_genrand( gRandomSeed );

   /* Check if atomics are supported. */
   if (!is_extension_available(deviceID, "cl_khr_global_int32_base_atomics")) {
     log_info("Base atomics not supported (cl_khr_global_int32_base_atomics). Skipping test.\n");
     free_mtdata( d );
     return 0;
   }

     for(iteration=0; iteration<10; iteration++) {
         log_info("add_index_bin_test with %d elements:\n", (int)numGlobalThreads);
         err = add_index_bin_test(&numGlobalThreads,  queue,  context, d);
         if (err) {
             failed++;
             break;
         }
         numGlobalThreads*=2;
     }
     free_mtdata( d );
     return failed;
 }
	//
	// Copyright (c) 2017 The Khronos Group Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//
	#include "testBase.h"
	#include "harness/conversions.h"

	const char * atomic_index_source =
	"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
	"// Counter keeps track of which index in counts we are using.\n"
	"// We get that value, increment it, and then set that index in counts to our thread ID.\n"
	"// At the end of this we should have all thread IDs in some random location in counts\n"
	"// exactly once. If atom_add failed then we will write over various thread IDs and we\n"
	"// will be missing some.\n"
	"\n"
	"__kernel void add_index_test(__global int counter, __global int counts) {\n"
	" int tid = get_global_id(0);\n"
	" \n"
	" int counter_to_use = atom_add(counter, 1);\n"
	" counts[counter_to_use] = tid;\n"
	"}";

	int test_atomic_add_index(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
	{
	clProgramWrapper program;
	clKernelWrapper kernel;
	clMemWrapper counter, counters;
	size_t numGlobalThreads, numLocalThreads;
	int fail = 0, succeed = 0, err;

	/* Check if atomics are supported. */
	if (!is_extension_available(deviceID, "cl_khr_global_int32_base_atomics")) {
	log_info("Base atomics not supported (cl_khr_global_int32_base_atomics). Skipping test.\n");
	return 0;
	}

	//===== add_index test
	// The index test replicates what particles does.
	// It uses one memory location to keep track of the current index and then each thread
	// does an atomic add to it to get its new location. The threads then write to their
	// assigned location. At the end we check to make sure that each thread's ID shows up
	// exactly once in the output.

	numGlobalThreads = 2048;

	if( create_single_kernel_helper( context, &program, &kernel, 1, &atomic_index_source, "add_index_test" ) )
	return -1;

	if( get_max_common_work_group_size( context, kernel, numGlobalThreads, &numLocalThreads ) )
	return -1;

	log_info("Execute global_threads:%d local_threads:%d\n",
	(int)numGlobalThreads, (int)numLocalThreads);

	// Create the counter that will keep track of where each thread writes.
	counter = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int) * 1,
	NULL, NULL);
	// Create the counters that will hold the results of each thread writing
	// its ID into a (hopefully) unique location.
	counters = clCreateBuffer(context, CL_MEM_READ_WRITE,
	sizeof(cl_int) * numGlobalThreads, NULL, NULL);

	// Reset all those locations to -1 to indciate they have not been used.
	cl_int values = (cl_int) malloc(sizeof(cl_int)*numGlobalThreads);
	if (values == NULL) {
	log_error("add_index_test FAILED to allocate memory for initial values.\n");
	fail = 1; succeed = -1;
	} else {
	memset(values, -1, numLocalThreads);
	unsigned int i=0;
	for (i=0; i<numGlobalThreads; i++)
	values[i] = -1;
	int init=0;
	err = clEnqueueWriteBuffer(queue, counters, true, 0, numGlobalThreads*sizeof(cl_int), values, 0, NULL, NULL);
	err \|= clEnqueueWriteBuffer(queue, counter, true, 0,1*sizeof(cl_int), &init, 0, NULL, NULL);
	if (err) {
	log_error("add_index_test FAILED to write initial values to arrays: %d\n", err);
	fail=1; succeed=-1;
	} else {
	err = clSetKernelArg(kernel, 0, sizeof(counter), &counter);
	err \|= clSetKernelArg(kernel, 1, sizeof(counters), &counters);
	if (err) {
	log_error("add_index_test FAILED to set kernel arguments: %d\n", err);
	fail=1; succeed=-1;
	} else {
	err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, &numGlobalThreads, &numLocalThreads, 0, NULL, NULL );
	if (err) {
	log_error("add_index_test FAILED to execute kernel: %d\n", err);
	fail=1; succeed=-1;
	} else {
	err = clEnqueueReadBuffer( queue, counters, true, 0, sizeof(cl_int)*numGlobalThreads, values, 0, NULL, NULL );
	if (err) {
	log_error("add_index_test FAILED to read back results: %d\n", err);
	fail = 1; succeed=-1;
	} else {
	unsigned int looking_for, index;
	for (looking_for=0; looking_for<numGlobalThreads; looking_for++) {
	int instances_found=0;
	for (index=0; index<numGlobalThreads; index++) {
	if (values[index]==(int)looking_for)
	instances_found++;
	}
	if (instances_found != 1) {
	log_error("add_index_test FAILED: wrong number of instances (%d!=1) for counter %d.\n", instances_found, looking_for);
	fail = 1; succeed=-1;
	}
	}
	}
	}
	}
	}
	if (!fail) {
	log_info("add_index_test passed. Each thread used exactly one index.\n");
	}
	free(values);
	}
	return fail;
	}

	const char *add_index_bin_kernel[] = {
	"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
	"// This test assigns a bunch of values to bins and then tries to put them in the bins in parallel\n"
	"// using an atomic add to keep track of the current location to write into in each bin.\n"
	"// This is the same as the memory update for the particles demo.\n"
	"\n"
	"__kernel void add_index_bin_test(__global int bin_counters, __global int bins, __global int *bin_assignments, int max_counts_per_bin) {\n"
	" int tid = get_global_id(0);\n"
	"\n"
	" int location = bin_assignments[tid];\n"
	" int counter = atom_add(&bin_counters[location], 1);\n"
	" bins[location*max_counts_per_bin + counter] = tid;\n"
	"}" };

	// This test assigns a bunch of values to bins and then tries to put them in the bins in parallel
	// using an atomic add to keep track of the current location to write into in each bin.
	// This is the same as the memory update for the particles demo.
	int add_index_bin_test(size_t *global_threads, cl_command_queue queue, cl_context context, MTdata d)
	{
	int number_of_items = (int)global_threads[0];
	size_t local_threads[1];
	int divisor = 12;
	int number_of_bins = number_of_items/divisor;
	int max_counts_per_bin = divisor*2;

	int fail = 0;
	int succeed = 0;
	int err;

	clProgramWrapper program;
	clKernelWrapper kernel;

	// log_info("add_index_bin_test: %d items, into %d bins, with a max of %d items per bin (bins is %d long).\n",
	// number_of_items, number_of_bins, max_counts_per_bin, number_of_bins*max_counts_per_bin);

	//===== add_index_bin test
	// The index test replicates what particles does.
	err = create_single_kernel_helper(context, &program, &kernel, 1, add_index_bin_kernel, "add_index_bin_test" );
	test_error( err, "Unable to create testing kernel" );

	if( get_max_common_work_group_size( context, kernel, global_threads[0], &local_threads[0] ) )
	return -1;

	log_info("Execute global_threads:%d local_threads:%d\n",
	(int)global_threads[0], (int)local_threads[0]);

	// Allocate our storage
	cl_mem bin_counters =
	clCreateBuffer(context, CL_MEM_READ_WRITE,
	sizeof(cl_int) * number_of_bins, NULL, NULL);
	cl_mem bins = clCreateBuffer(
	context, CL_MEM_READ_WRITE,
	sizeof(cl_int) * number_of_bins * max_counts_per_bin, NULL, NULL);
	cl_mem bin_assignments =
	clCreateBuffer(context, CL_MEM_READ_ONLY,
	sizeof(cl_int) * number_of_items, NULL, NULL);

	if (bin_counters == NULL) {
	log_error("add_index_bin_test FAILED to allocate bin_counters.\n");
	return -1;
	}
	if (bins == NULL) {
	log_error("add_index_bin_test FAILED to allocate bins.\n");
	return -1;
	}
	if (bin_assignments == NULL) {
	log_error("add_index_bin_test FAILED to allocate bin_assignments.\n");
	return -1;
	}

	// Initialize our storage
	cl_int l_bin_counts = (cl_int)malloc(sizeof(cl_int)*number_of_bins);
	if (!l_bin_counts) {
	log_error("add_index_bin_test FAILED to allocate initial values for bin_counters.\n");
	return -1;
	}
	int i;
	for (i=0; i<number_of_bins; i++)
	l_bin_counts[i] = 0;
	err = clEnqueueWriteBuffer(queue, bin_counters, true, 0, sizeof(cl_int)*number_of_bins, l_bin_counts, 0, NULL, NULL);
	if (err) {
	log_error("add_index_bin_test FAILED to set initial values for bin_counters: %d\n", err);
	return -1;
	}

	cl_int values = (cl_int)malloc(sizeof(cl_int)number_of_binsmax_counts_per_bin);
	if (!values) {
	log_error("add_index_bin_test FAILED to allocate initial values for bins.\n");
	return -1;
	}
	for (i=0; i<number_of_bins*max_counts_per_bin; i++)
	values[i] = -1;
	err = clEnqueueWriteBuffer(queue, bins, true, 0, sizeof(cl_int)number_of_binsmax_counts_per_bin, values, 0, NULL, NULL);
	if (err) {
	log_error("add_index_bin_test FAILED to set initial values for bins: %d\n", err);
	return -1;
	}
	free(values);

	cl_int l_bin_assignments = (cl_int)malloc(sizeof(cl_int)*number_of_items);
	if (!l_bin_assignments) {
	log_error("add_index_bin_test FAILED to allocate initial values for l_bin_assignments.\n");
	return -1;
	}
	for (i=0; i<number_of_items; i++) {
	int bin = random_in_range(0, number_of_bins-1, d);
	while (l_bin_counts[bin] >= max_counts_per_bin) {
	bin = random_in_range(0, number_of_bins-1, d);
	}
	if (bin >= number_of_bins)
	log_error("add_index_bin_test internal error generating bin assignments: bin %d >= number_of_bins %d.\n", bin, number_of_bins);
	if (l_bin_counts[bin]+1 > max_counts_per_bin)
	log_error("add_index_bin_test internal error generating bin assignments: bin %d has more entries (%d) than max_counts_per_bin (%d).\n", bin, l_bin_counts[bin], max_counts_per_bin);
	l_bin_counts[bin]++;
	l_bin_assignments[i] = bin;
	// log_info("item %d assigned to bin %d (%d items)\n", i, bin, l_bin_counts[bin]);
	}
	err = clEnqueueWriteBuffer(queue, bin_assignments, true, 0, sizeof(cl_int)*number_of_items, l_bin_assignments, 0, NULL, NULL);
	if (err) {
	log_error("add_index_bin_test FAILED to set initial values for bin_assignments: %d\n", err);
	return -1;
	}
	// Setup the kernel
	err = clSetKernelArg(kernel, 0, sizeof(bin_counters), &bin_counters);
	err \|= clSetKernelArg(kernel, 1, sizeof(bins), &bins);
	err \|= clSetKernelArg(kernel, 2, sizeof(bin_assignments), &bin_assignments);
	err \|= clSetKernelArg(kernel, 3, sizeof(max_counts_per_bin), &max_counts_per_bin);
	if (err) {
	log_error("add_index_bin_test FAILED to set kernel arguments: %d\n", err);
	fail=1; succeed=-1;
	return -1;
	}

	err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL );
	if (err) {
	log_error("add_index_bin_test FAILED to execute kernel: %d\n", err);
	fail=1; succeed=-1;
	}

	cl_int final_bin_assignments = (cl_int)malloc(sizeof(cl_int)number_of_binsmax_counts_per_bin);
	if (!final_bin_assignments) {
	log_error("add_index_bin_test FAILED to allocate initial values for final_bin_assignments.\n");
	return -1;
	}
	err = clEnqueueReadBuffer( queue, bins, true, 0, sizeof(cl_int)number_of_binsmax_counts_per_bin, final_bin_assignments, 0, NULL, NULL );
	if (err) {
	log_error("add_index_bin_test FAILED to read back bins: %d\n", err);
	fail = 1; succeed=-1;
	}

	cl_int final_bin_counts = (cl_int)malloc(sizeof(cl_int)*number_of_bins);
	if (!final_bin_counts) {
	log_error("add_index_bin_test FAILED to allocate initial values for final_bin_counts.\n");
	return -1;
	}
	err = clEnqueueReadBuffer( queue, bin_counters, true, 0, sizeof(cl_int)*number_of_bins, final_bin_counts, 0, NULL, NULL );
	if (err) {
	log_error("add_index_bin_test FAILED to read back bin_counters: %d\n", err);
	fail = 1; succeed=-1;
	}

	// Verification.
	int errors=0;
	int current_bin;
	int search;
	// Print out all the contents of the bins.
	// for (current_bin=0; current_bin<number_of_bins; current_bin++)
	// for (search=0; search<max_counts_per_bin; search++)
	// log_info("[bin %d, entry %d] = %d\n", current_bin, search, final_bin_assignments[current_bin*max_counts_per_bin+search]);

	// First verify that there are the correct number in each bin.
	for (current_bin=0; current_bin<number_of_bins; current_bin++) {
	int expected_number = l_bin_counts[current_bin];
	int actual_number = final_bin_counts[current_bin];
	if (expected_number != actual_number) {
	log_error("add_index_bin_test FAILED: bin %d reported %d entries when %d were expected.\n", current_bin, actual_number, expected_number);
	errors++;
	}
	for (search=0; search<expected_number; search++) {
	if (final_bin_assignments[current_bin*max_counts_per_bin+search] == -1) {
	log_error("add_index_bin_test FAILED: bin %d had no entry at position %d when it should have had %d entries.\n", current_bin, search, expected_number);
	errors++;
	}
	}
	for (search=expected_number; search<max_counts_per_bin; search++) {
	if (final_bin_assignments[current_bin*max_counts_per_bin+search] != -1) {
	log_error("add_index_bin_test FAILED: bin %d had an extra entry at position %d when it should have had only %d entries.\n", current_bin, search, expected_number);
	errors++;
	}
	}
	}
	// Now verify that the correct ones are in each bin
	int index;
	for (index=0; index<number_of_items; index++) {
	int expected_bin = l_bin_assignments[index];
	int found_it = 0;
	for (search=0; search<l_bin_counts[expected_bin]; search++) {
	if (final_bin_assignments[expected_bin*max_counts_per_bin+search] == index) {
	found_it = 1;
	}
	}
	if (found_it == 0) {
	log_error("add_index_bin_test FAILED: did not find item %d in bin %d.\n", index, expected_bin);
	errors++;
	}
	}
	free(l_bin_counts);
	free(l_bin_assignments);
	free(final_bin_assignments);
	free(final_bin_counts);
	clReleaseMemObject(bin_counters);
	clReleaseMemObject(bins);
	clReleaseMemObject(bin_assignments);
	if (errors == 0) {
	log_info("add_index_bin_test passed. Each item was put in the correct bin in parallel.\n");
	return 0;
	} else {
	log_error("add_index_bin_test FAILED: %d errors.\n", errors);
	return -1;
	}
	}

	int test_atomic_add_index_bin(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
	{
	//===== add_index_bin test
	size_t numGlobalThreads = 2048;
	int iteration=0;
	int err, failed = 0;
	MTdata d = init_genrand( gRandomSeed );

	/* Check if atomics are supported. */
	if (!is_extension_available(deviceID, "cl_khr_global_int32_base_atomics")) {
	log_info("Base atomics not supported (cl_khr_global_int32_base_atomics). Skipping test.\n");
	free_mtdata( d );
	return 0;
	}

	for(iteration=0; iteration<10; iteration++) {
	log_info("add_index_bin_test with %d elements:\n", (int)numGlobalThreads);
	err = add_index_bin_test(&numGlobalThreads, queue, context, d);
	if (err) {
	failed++;
	break;
	}
	numGlobalThreads*=2;
	}
	free_mtdata( d );
	return failed;
	}