test_conformance/half/Test_roundTrip.cpp - external/github.com/KhronosGroup/OpenCL-CTS - Git at Google

 //
 // Copyright (c) 2017 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 #include <string.h>
 #include "cl_utils.h"
 #include "tests.h"
 #include "harness/testHarness.h"

 int test_roundTrip( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
 {
     int vectorSize, error;
     uint64_t i, j;
     cl_program  programs[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
     cl_kernel   kernels[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
     cl_program  doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
     cl_kernel   doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
     uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
     uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
     uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
     uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
     memset( min_time, -1, sizeof( min_time ) );
     memset( min_double_time, -1, sizeof( min_double_time ) );

     for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
     {
         const char *source[] = {
             "__kernel void test( const __global half *in, __global half *out )\n"
             "{\n"
             "   size_t i = get_global_id(0);\n"
             "   vstore_half",vector_size_name_extensions[vectorSize],"( vload_half",vector_size_name_extensions[vectorSize],"(i, in),  i, out);\n"
             "}\n"
         };

         const char *doubleSource[] = {
             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
             "__kernel void test( const __global half *in, __global half *out )\n"
             "{\n"
             "   size_t i = get_global_id(0);\n"
             "   vstore_half",vector_size_name_extensions[vectorSize],"( convert_double", vector_size_name_extensions[vectorSize], "( vload_half",vector_size_name_extensions[vectorSize],"(i, in)),  i, out);\n"
             "}\n"
         };

         const char *sourceV3[] = {
             "__kernel void test( const __global half *in, __global half *out,"
             "                    uint extra_last_thread  )\n"
             "{\n"
             "   size_t i = get_global_id(0);\n"
             "   size_t last_i = get_global_size(0)-1;\n"
             "   size_t adjust = 0;\n"
             "   if(i == last_i && extra_last_thread != 0) { \n"
             "     adjust = 3-extra_last_thread;\n"
             "   }\n"
             "   vstore_half3( vload_half3(i, in-adjust),  i, out-adjust);\n"
             "}\n"
         };

         const char *doubleSourceV3[] = {
             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
             "__kernel void test( const __global half *in, __global half *out,"
             "                    uint extra_last_thread  )\n"
             "{\n"
             "   size_t i = get_global_id(0);\n"
             "   size_t last_i = get_global_size(0)-1;\n"
             "   size_t adjust = 0;\n"
             "   if(i == last_i && extra_last_thread != 0) { \n"
             "     adjust = 3-extra_last_thread;\n"
             "   }\n"
             "   vstore_half3( vload_half3(i, in-adjust),  i, out-adjust);\n"
             "}\n"
         };

 /*
         const char *sourceV3aligned[] = {
             "__kernel void test( const __global half *in, __global half *out )\n"
             "{\n"
             "   size_t i = get_global_id(0);\n"
             "   vstorea_half3( vloada_half3(i, in),  i, out);\n"
             "   vstore_half(vload_half(4*i+3, in), 4*i+3, out);\n"
             "}\n"
         };

         const char *doubleSourceV3aligned[] = {
             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
             "__kernel void test( const __global half *in, __global half *out )\n"
             "{\n"
             "   size_t i = get_global_id(0);\n"
             "   vstorea_half3( vloada_half3(i, in),  i, out);\n"
             "   vstore_half(vload_half(4*i+3, in), 4*i+3, out);\n"
             "}\n"
         };
 */

         if(g_arrVecSizes[vectorSize] == 3) {
             programs[vectorSize] = MakeProgram( device, sourceV3, sizeof( sourceV3) / sizeof( sourceV3[0])  );
             if( NULL == programs[ vectorSize ] )
             {
                 gFailCount++;

                 return -1;
             }
         } else {
             programs[vectorSize] = MakeProgram( device, source, sizeof( source) / sizeof( source[0])  );
             if( NULL == programs[ vectorSize ] )
             {
                 gFailCount++;
                 return -1;
             }
         }

         kernels[ vectorSize ] = clCreateKernel( programs[ vectorSize ], "test", &error );
         if( NULL == kernels[vectorSize] )
         {
             gFailCount++;
             vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
             return error;
         }

         if( gTestDouble )
         {
             if(g_arrVecSizes[vectorSize] == 3) {
                 doublePrograms[vectorSize] = MakeProgram( device, doubleSourceV3, sizeof( doubleSourceV3) / sizeof( doubleSourceV3[0])  );
                 if( NULL == doublePrograms[ vectorSize ] )
                 {
                     gFailCount++;
                     return -1;
                 }
             } else {
                 doublePrograms[vectorSize] = MakeProgram( device, doubleSource, sizeof( doubleSource) / sizeof( doubleSource[0])  );
                 if( NULL == doublePrograms[ vectorSize ] )
                 {
                     gFailCount++;
                     return -1;
                 }
             }

             doubleKernels[ vectorSize ] = clCreateKernel( doublePrograms[ vectorSize ], "test", &error );
             if( NULL == doubleKernels[vectorSize] )
             {
                 gFailCount++;
                 vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
                 return error;
             }
         }
     }

     // Figure out how many elements are in a work block
     size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float));
     size_t blockCount = (size_t)getBufferSize(device) / elementSize; //elementSize is a power of two
     uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of cl_half
     size_t stride = blockCount;

     error = 0;
     uint64_t printMask = (lastCase >> 4) - 1;
     uint32_t count;
     size_t loopCount;

     for( i = 0; i < (uint64_t)lastCase; i += stride )
     {
         count = (uint32_t) MIN( blockCount, lastCase - i );

         //Init the input stream
         uint16_t *p = (uint16_t *)gIn_half;
         for( j = 0; j < count; j++ )
             p[j] = j + i;

         if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_half, CL_TRUE, 0, count * sizeof( cl_half ), gIn_half, 0, NULL, NULL)) )
         {
             vlog_error( "Failure in clWriteArray\n" );
             gFailCount++;
             goto exit;
         }

         //Check the vector lengths
         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
         { // here we loop through vector sizes -- 3 is last.
             uint32_t pattern = 0xdeaddead;
             memset_pattern4( gOut_half, &pattern, (size_t)getBufferSize(device)/2);

             if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
             {
                 vlog_error( "Failure in clWriteArray\n" );
                 gFailCount++;
                 goto exit;
             }


             // here is where "3" starts to cause problems.
             error = RunKernel(device, kernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
                               runsOverBy(count, vectorSize, false) );
             if(error)
             {
                 gFailCount++;
                 goto exit;
             }

             if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
             {
                 vlog_error( "Failure in clReadArray\n" );
                 gFailCount++;
                 goto exit;
             }

             if( (memcmp( gOut_half, gIn_half, count * sizeof(cl_half))) )
             {
                 uint16_t *u1 = (uint16_t *)gOut_half;
                 uint16_t *u2 = (uint16_t *)gIn_half;
                 for( j = 0; j < count; j++ )
                 {
                     if( u1[j] != u2[j] )
                     {
                         uint16_t abs1 = u1[j] & 0x7fff;
                         uint16_t abs2 = u2[j] & 0x7fff;
                         if( abs1 > 0x7c00 && abs2 > 0x7c00 )
                             continue; //any NaN is okay if NaN is input

                         // if reference result is sub normal, test if the output is flushed to zero
                         if( IsHalfSubnormal(u2[j]) && ( (u1[j] == 0) || (u1[j] == 0x8000) ) )
                             continue;

                         vlog_error( "%lld) (of %lld)  Failure at 0x%4.4x:  0x%4.4x   vector_size = %d \n", j, (uint64_t)count, u2[j], u1[j], (g_arrVecSizes[vectorSize]) );
                         gFailCount++;
                         error = -1;
                         goto exit;
                     }
                 }
             }

             if( gTestDouble )
             {
                 memset_pattern4( gOut_half, &pattern, (size_t)getBufferSize(device)/2);
                 if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
                 {
                     vlog_error( "Failure in clWriteArray\n" );
                     gFailCount++;
                     goto exit;
                 }


                 if( (error = RunKernel(device, doubleKernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
                                        runsOverBy(count, vectorSize, false) ) ) )
                 {
                     gFailCount++;
                     goto exit;
                 }

                 if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
                 {
                     vlog_error( "Failure in clReadArray\n" );
                     gFailCount++;
                     goto exit;
                 }

                 if( (memcmp( gOut_half, gIn_half, count * sizeof(cl_half))) )
                 {
                     uint16_t *u1 = (uint16_t *)gOut_half;
                     uint16_t *u2 = (uint16_t *)gIn_half;
                     for( j = 0; j < count; j++ )
                     {
                         if( u1[j] != u2[j] )
                         {
                             uint16_t abs1 = u1[j] & 0x7fff;
                             uint16_t abs2 = u2[j] & 0x7fff;
                             if( abs1 > 0x7c00 && abs2 > 0x7c00 )
                                 continue; //any NaN is okay if NaN is input

                             // if reference result is sub normal, test if the output is flushed to zero
                             if( IsHalfSubnormal(u2[j]) && ( (u1[j] == 0) || (u1[j] == 0x8000) ) )
                                 continue;

                             vlog_error( "%lld) Failure at 0x%4.4x:  0x%4.4x   vector_size = %d (double precsion)\n", j, u2[j], u1[j], (g_arrVecSizes[vectorSize]) );
                             gFailCount++;
                             error = -1;
                             goto exit;
                         }
                     }
                 }
             }
         }

         if( ((i+blockCount) & ~printMask) == (i+blockCount) )
         {
             vlog( "." );
             fflush( stdout );
         }
     }

     vlog( "\n" );

     loopCount = 100;
     if( gReportTimes )
     {
         //Run again for timing
         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
         {
             uint64_t bestTime = -1ULL;

             for( j = 0; j < loopCount; j++ )
             {
                 uint64_t startTime = ReadTime();
                 if( (error = RunKernel(device, kernels[vectorSize], gInBuffer_half, gOutBuffer_half,numVecs(count, vectorSize, false) ,
                                        runsOverBy(count, vectorSize, false)) ) )
                 {
                     gFailCount++;
                     goto exit;
                 }

                 if( (error = clFinish(gQueue)) )
                 {
                     vlog_error( "Failure in clFinish\n" );
                     gFailCount++;
                     goto exit;
                 }
                 uint64_t currentTime = ReadTime() - startTime;
                 if( currentTime < bestTime )
                     bestTime = currentTime;
                 time[ vectorSize ] += currentTime;
             }
             if( bestTime < min_time[ vectorSize ] )
                 min_time[ vectorSize ] = bestTime;

             if( gTestDouble )
             {
                 bestTime = -1ULL;
                 for( j = 0; j < loopCount; j++ )
                 {
                     uint64_t startTime = ReadTime();
                     if( (error = RunKernel(device, doubleKernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
                                            runsOverBy(count, vectorSize, false)) ) )
                     {
                         gFailCount++;
                         goto exit;
                     }

                     if( (error = clFinish(gQueue)) )
                     {
                         vlog_error( "Failure in clFinish\n" );
                         gFailCount++;
                         goto exit;
                     }
                     uint64_t currentTime = ReadTime() - startTime;
                     if( currentTime < bestTime )
                         bestTime = currentTime;
                     doubleTime[ vectorSize ] += currentTime;
                 }
                 if( bestTime < min_double_time[ vectorSize ] )
                     min_double_time[ vectorSize ] = bestTime;
             }
         }
     }

     if( gReportTimes )
     {
         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
             vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, "average us/elem", "roundTrip avg. (vector size: %d)", (g_arrVecSizes[vectorSize]) );
         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
             vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, "best us/elem", "roundTrip best (vector size: %d)", (g_arrVecSizes[vectorSize])  );
         if( gTestDouble )
         {
             for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
                 vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, "average us/elem (double)", "roundTrip avg. d (vector size: %d)", (g_arrVecSizes[vectorSize])  );
             for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
                 vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, "best us/elem (double)", "roundTrip best d (vector size: %d)", (g_arrVecSizes[vectorSize]) );
         }
     }

 exit:
     //clean up
     for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
     {
         clReleaseKernel( kernels[ vectorSize ] );
         clReleaseProgram( programs[ vectorSize ] );
         if( gTestDouble )
         {
             clReleaseKernel( doubleKernels[ vectorSize ] );
             clReleaseProgram( doublePrograms[ vectorSize ] );
         }
     }

     return error;
 }
	//
	// Copyright (c) 2017 The Khronos Group Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//
	#include <string.h>
	#include "cl_utils.h"
	#include "tests.h"
	#include "harness/testHarness.h"

	int test_roundTrip( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
	{
	int vectorSize, error;
	uint64_t i, j;
	cl_program programs[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
	cl_kernel kernels[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
	cl_program doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
	cl_kernel doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
	uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
	uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
	uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
	uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
	memset( min_time, -1, sizeof( min_time ) );
	memset( min_double_time, -1, sizeof( min_double_time ) );

	for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
	{
	const char *source[] = {
	"__kernel void test( const __global half in, __global half out )\n"
	"{\n"
	" size_t i = get_global_id(0);\n"
	" vstore_half",vector_size_name_extensions[vectorSize],"( vload_half",vector_size_name_extensions[vectorSize],"(i, in), i, out);\n"
	"}\n"
	};

	const char *doubleSource[] = {
	"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
	"__kernel void test( const __global half in, __global half out )\n"
	"{\n"
	" size_t i = get_global_id(0);\n"
	" vstore_half",vector_size_name_extensions[vectorSize],"( convert_double", vector_size_name_extensions[vectorSize], "( vload_half",vector_size_name_extensions[vectorSize],"(i, in)), i, out);\n"
	"}\n"
	};

	const char *sourceV3[] = {
	"__kernel void test( const __global half in, __global half out,"
	" uint extra_last_thread )\n"
	"{\n"
	" size_t i = get_global_id(0);\n"
	" size_t last_i = get_global_size(0)-1;\n"
	" size_t adjust = 0;\n"
	" if(i == last_i && extra_last_thread != 0) { \n"
	" adjust = 3-extra_last_thread;\n"
	" }\n"
	" vstore_half3( vload_half3(i, in-adjust), i, out-adjust);\n"
	"}\n"
	};

	const char *doubleSourceV3[] = {
	"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
	"__kernel void test( const __global half in, __global half out,"
	" uint extra_last_thread )\n"
	"{\n"
	" size_t i = get_global_id(0);\n"
	" size_t last_i = get_global_size(0)-1;\n"
	" size_t adjust = 0;\n"
	" if(i == last_i && extra_last_thread != 0) { \n"
	" adjust = 3-extra_last_thread;\n"
	" }\n"
	" vstore_half3( vload_half3(i, in-adjust), i, out-adjust);\n"
	"}\n"
	};

	/*
	const char *sourceV3aligned[] = {
	"__kernel void test( const __global half in, __global half out )\n"
	"{\n"
	" size_t i = get_global_id(0);\n"
	" vstorea_half3( vloada_half3(i, in), i, out);\n"
	" vstore_half(vload_half(4i+3, in), 4i+3, out);\n"
	"}\n"
	};

	const char *doubleSourceV3aligned[] = {
	"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
	"__kernel void test( const __global half in, __global half out )\n"
	"{\n"
	" size_t i = get_global_id(0);\n"
	" vstorea_half3( vloada_half3(i, in), i, out);\n"
	" vstore_half(vload_half(4i+3, in), 4i+3, out);\n"
	"}\n"
	};
	*/

	if(g_arrVecSizes[vectorSize] == 3) {
	programs[vectorSize] = MakeProgram( device, sourceV3, sizeof( sourceV3) / sizeof( sourceV3[0]) );
	if( NULL == programs[ vectorSize ] )
	{
	gFailCount++;

	return -1;
	}
	} else {
	programs[vectorSize] = MakeProgram( device, source, sizeof( source) / sizeof( source[0]) );
	if( NULL == programs[ vectorSize ] )
	{
	gFailCount++;
	return -1;
	}
	}

	kernels[ vectorSize ] = clCreateKernel( programs[ vectorSize ], "test", &error );
	if( NULL == kernels[vectorSize] )
	{
	gFailCount++;
	vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
	return error;
	}

	if( gTestDouble )
	{
	if(g_arrVecSizes[vectorSize] == 3) {
	doublePrograms[vectorSize] = MakeProgram( device, doubleSourceV3, sizeof( doubleSourceV3) / sizeof( doubleSourceV3[0]) );
	if( NULL == doublePrograms[ vectorSize ] )
	{
	gFailCount++;
	return -1;
	}
	} else {
	doublePrograms[vectorSize] = MakeProgram( device, doubleSource, sizeof( doubleSource) / sizeof( doubleSource[0]) );
	if( NULL == doublePrograms[ vectorSize ] )
	{
	gFailCount++;
	return -1;
	}
	}

	doubleKernels[ vectorSize ] = clCreateKernel( doublePrograms[ vectorSize ], "test", &error );
	if( NULL == doubleKernels[vectorSize] )
	{
	gFailCount++;
	vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
	return error;
	}
	}
	}

	// Figure out how many elements are in a work block
	size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float));
	size_t blockCount = (size_t)getBufferSize(device) / elementSize; //elementSize is a power of two
	uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of cl_half
	size_t stride = blockCount;

	error = 0;
	uint64_t printMask = (lastCase >> 4) - 1;
	uint32_t count;
	size_t loopCount;

	for( i = 0; i < (uint64_t)lastCase; i += stride )
	{
	count = (uint32_t) MIN( blockCount, lastCase - i );

	//Init the input stream
	uint16_t p = (uint16_t )gIn_half;
	for( j = 0; j < count; j++ )
	p[j] = j + i;

	if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_half, CL_TRUE, 0, count * sizeof( cl_half ), gIn_half, 0, NULL, NULL)) )
	{
	vlog_error( "Failure in clWriteArray\n" );
	gFailCount++;
	goto exit;
	}

	//Check the vector lengths
	for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
	{ // here we loop through vector sizes -- 3 is last.
	uint32_t pattern = 0xdeaddead;
	memset_pattern4( gOut_half, &pattern, (size_t)getBufferSize(device)/2);

	if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
	{
	vlog_error( "Failure in clWriteArray\n" );
	gFailCount++;
	goto exit;
	}


	// here is where "3" starts to cause problems.
	error = RunKernel(device, kernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
	runsOverBy(count, vectorSize, false) );
	if(error)
	{
	gFailCount++;
	goto exit;
	}

	if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
	{
	vlog_error( "Failure in clReadArray\n" );
	gFailCount++;
	goto exit;
	}

	if( (memcmp( gOut_half, gIn_half, count * sizeof(cl_half))) )
	{
	uint16_t u1 = (uint16_t )gOut_half;
	uint16_t u2 = (uint16_t )gIn_half;
	for( j = 0; j < count; j++ )
	{
	if( u1[j] != u2[j] )
	{
	uint16_t abs1 = u1[j] & 0x7fff;
	uint16_t abs2 = u2[j] & 0x7fff;
	if( abs1 > 0x7c00 && abs2 > 0x7c00 )
	continue; //any NaN is okay if NaN is input

	// if reference result is sub normal, test if the output is flushed to zero
	if( IsHalfSubnormal(u2[j]) && ( (u1[j] == 0) \|\| (u1[j] == 0x8000) ) )
	continue;

	vlog_error( "%lld) (of %lld) Failure at 0x%4.4x: 0x%4.4x vector_size = %d \n", j, (uint64_t)count, u2[j], u1[j], (g_arrVecSizes[vectorSize]) );
	gFailCount++;
	error = -1;
	goto exit;
	}
	}
	}

	if( gTestDouble )
	{
	memset_pattern4( gOut_half, &pattern, (size_t)getBufferSize(device)/2);
	if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
	{
	vlog_error( "Failure in clWriteArray\n" );
	gFailCount++;
	goto exit;
	}


	if( (error = RunKernel(device, doubleKernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
	runsOverBy(count, vectorSize, false) ) ) )
	{
	gFailCount++;
	goto exit;
	}

	if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
	{
	vlog_error( "Failure in clReadArray\n" );
	gFailCount++;
	goto exit;
	}

	if( (memcmp( gOut_half, gIn_half, count * sizeof(cl_half))) )
	{
	uint16_t u1 = (uint16_t )gOut_half;
	uint16_t u2 = (uint16_t )gIn_half;
	for( j = 0; j < count; j++ )
	{
	if( u1[j] != u2[j] )
	{
	uint16_t abs1 = u1[j] & 0x7fff;
	uint16_t abs2 = u2[j] & 0x7fff;
	if( abs1 > 0x7c00 && abs2 > 0x7c00 )
	continue; //any NaN is okay if NaN is input

	// if reference result is sub normal, test if the output is flushed to zero
	if( IsHalfSubnormal(u2[j]) && ( (u1[j] == 0) \|\| (u1[j] == 0x8000) ) )
	continue;

	vlog_error( "%lld) Failure at 0x%4.4x: 0x%4.4x vector_size = %d (double precsion)\n", j, u2[j], u1[j], (g_arrVecSizes[vectorSize]) );
	gFailCount++;
	error = -1;
	goto exit;
	}
	}
	}
	}
	}

	if( ((i+blockCount) & ~printMask) == (i+blockCount) )
	{
	vlog( "." );
	fflush( stdout );
	}
	}

	vlog( "\n" );

	loopCount = 100;
	if( gReportTimes )
	{
	//Run again for timing
	for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
	{
	uint64_t bestTime = -1ULL;

	for( j = 0; j < loopCount; j++ )
	{
	uint64_t startTime = ReadTime();
	if( (error = RunKernel(device, kernels[vectorSize], gInBuffer_half, gOutBuffer_half,numVecs(count, vectorSize, false) ,
	runsOverBy(count, vectorSize, false)) ) )
	{
	gFailCount++;
	goto exit;
	}

	if( (error = clFinish(gQueue)) )
	{
	vlog_error( "Failure in clFinish\n" );
	gFailCount++;
	goto exit;
	}
	uint64_t currentTime = ReadTime() - startTime;
	if( currentTime < bestTime )
	bestTime = currentTime;
	time[ vectorSize ] += currentTime;
	}
	if( bestTime < min_time[ vectorSize ] )
	min_time[ vectorSize ] = bestTime;

	if( gTestDouble )
	{
	bestTime = -1ULL;
	for( j = 0; j < loopCount; j++ )
	{
	uint64_t startTime = ReadTime();
	if( (error = RunKernel(device, doubleKernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
	runsOverBy(count, vectorSize, false)) ) )
	{
	gFailCount++;
	goto exit;
	}

	if( (error = clFinish(gQueue)) )
	{
	vlog_error( "Failure in clFinish\n" );
	gFailCount++;
	goto exit;
	}
	uint64_t currentTime = ReadTime() - startTime;
	if( currentTime < bestTime )
	bestTime = currentTime;
	doubleTime[ vectorSize ] += currentTime;
	}
	if( bestTime < min_double_time[ vectorSize ] )
	min_double_time[ vectorSize ] = bestTime;
	}
	}
	}

	if( gReportTimes )
	{
	for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
	vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, "average us/elem", "roundTrip avg. (vector size: %d)", (g_arrVecSizes[vectorSize]) );
	for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
	vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, "best us/elem", "roundTrip best (vector size: %d)", (g_arrVecSizes[vectorSize]) );
	if( gTestDouble )
	{
	for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
	vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, "average us/elem (double)", "roundTrip avg. d (vector size: %d)", (g_arrVecSizes[vectorSize]) );
	for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
	vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, "best us/elem (double)", "roundTrip best d (vector size: %d)", (g_arrVecSizes[vectorSize]) );
	}
	}

	exit:
	//clean up
	for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
	{
	clReleaseKernel( kernels[ vectorSize ] );
	clReleaseProgram( programs[ vectorSize ] );
	if( gTestDouble )
	{
	clReleaseKernel( doubleKernels[ vectorSize ] );
	clReleaseProgram( doublePrograms[ vectorSize ] );
	}
	}

	return error;
	}