test_conformance/contractions/contractions.cpp - external/github.com/KhronosGroup/OpenCL-CTS - Git at Google

 //
 // Copyright (c) 2017 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 #include "harness/compat.h"

 #include <string.h>
 #include <stdio.h>

 #if !defined(_WIN32)
 #include <libgen.h>
 #include <sys/param.h>
 #endif

 #include "mingw_compat.h"
 #if defined (__MINGW32__)
 #include <sys/param.h>
 #endif

 #include <time.h>
 #include "errorHelpers.h"
 #include "harness/compat.h"
 #include "harness/mt19937.h"
 #include "harness/kernelHelpers.h"
 #include "harness/rounding_mode.h"
 #include "harness/fpcontrol.h"
 #include "harness/testHarness.h"
 #include "harness/parseParameters.h"
 #if defined( __APPLE__ )
 #include <sys/sysctl.h>
 #endif
 #if defined( __linux__ )
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <linux/sysctl.h>
 #endif

 #if defined (_WIN32)
 #include <string.h>
 #endif

 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
 #include <emmintrin.h>
 #endif

 #if defined(__PPC__)
 // Global varaiable used to hold the FPU control register state. The FPSCR register can not
 // be used because not all Power implementations retain or observed the NI (non-IEEE
 // mode) bit.
 __thread fpu_control_t fpu_control = 0;
 #endif

 #ifndef MAXPATHLEN
 #define MAXPATHLEN  2048
 #endif

 char                appName[ MAXPATHLEN ] = "";
 cl_context          gContext = NULL;
 cl_command_queue    gQueue = NULL;
 cl_program          gProgram[5] = { NULL, NULL, NULL, NULL, NULL };
 cl_program          gProgram_double[5] = { NULL, NULL, NULL, NULL, NULL };
 int                 gForceFTZ = 0;
 int                 gSeed = 0;
 int                 gSeedSpecified = 0;
 int                 gHasDouble = 0;
 MTdata              gMTdata = NULL;
 int                 gSkipNanInf = 0;
 int                     gIgnoreZeroSign = 0;

 cl_mem              bufA = NULL;
 cl_mem              bufB = NULL;
 cl_mem              bufC = NULL;
 cl_mem              bufD = NULL;
 cl_mem              bufE = NULL;
 cl_mem              bufC_double = NULL;
 cl_mem              bufD_double = NULL;
 float               *buf1, *buf2, *buf3, *buf4, *buf5, *buf6;
 float               *correct[8];
 int                     *skipTest[8];

 double              *buf3_double, *buf4_double, *buf5_double, *buf6_double;
 double              *correct_double[8];

 static const char   **gArgList;
 static size_t       gArgCount;

 #define BUFFER_SIZE         (1024*1024)


 static int ParseArgs( int argc, const char **argv );
 static void PrintUsage( void );
 test_status InitCL( cl_device_id device );
 static void ReleaseCL( void );
 static int RunTest( int testNumber );
 static int RunTest_Double( int testNumber );

 #if defined(__ANDROID__)
 #define nanf( X ) strtof( "NAN", ( char ** ) NULL )
 #define nan( X )  strtod( "NAN", ( char ** ) NULL )
 #endif

 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
 // defeat x87 on MSVC
 float sse_add(float x, float y)
 {
     volatile float a = x;
     volatile float b = y;

     // defeat x87
     __m128 va = _mm_set_ss( (float) a );
     __m128 vb = _mm_set_ss( (float) b );
     va = _mm_add_ss( va, vb );
     _mm_store_ss( (float*) &a, va );
     return a;
 }

 double sse_add_sd(double x, double y)
 {
     volatile double a = x;
     volatile double b = y;

     // defeat x87
     __m128d va = _mm_set_sd( (double) a );
     __m128d vb = _mm_set_sd( (double) b );
     va = _mm_add_sd( va, vb );
     _mm_store_sd( (double*) &a, va );
     return a;
 }

 float sse_sub(float x, float y)
 {
     volatile float a = x;
     volatile float b = y;

     // defeat x87
     __m128 va = _mm_set_ss( (float) a );
     __m128 vb = _mm_set_ss( (float) b );
     va = _mm_sub_ss( va, vb );
     _mm_store_ss( (float*) &a, va );
     return a;
 }

 double sse_sub_sd(double x, double y)
 {
     volatile double a = x;
     volatile double b = y;

     // defeat x87
     __m128d va = _mm_set_sd( (double) a );
     __m128d vb = _mm_set_sd( (double) b );
     va = _mm_sub_sd( va, vb );
     _mm_store_sd( (double*) &a, va );
     return a;
 }

 float sse_mul(float x, float y)
 {
     volatile float a = x;
     volatile float b = y;

     // defeat x87
     __m128 va = _mm_set_ss( (float) a );
     __m128 vb = _mm_set_ss( (float) b );
     va = _mm_mul_ss( va, vb );
     _mm_store_ss( (float*) &a, va );
     return a;
 }

 double sse_mul_sd(double x, double y)
 {
     volatile double a = x;
     volatile double b = y;

     // defeat x87
     __m128d va = _mm_set_sd( (double) a );
     __m128d vb = _mm_set_sd( (double) b );
     va = _mm_mul_sd( va, vb );
     _mm_store_sd( (double*) &a, va );
     return a;
 }
 #endif

 #ifdef __PPC__
 float ppc_mul(float a, float b)
 {
     float p;

     if (gForceFTZ) {
         // Flush input a to zero if it is sub-normal
         if (fabsf(a) < FLT_MIN) {
             a = copysignf(0.0, a);
         }
         // Flush input b to zero if it is sub-normal
         if (fabsf(b) < FLT_MIN) {
             b = copysignf(0.0, b);
         }
         // Perform multiply
         p = a * b;
         // Flush the product if it is a sub-normal
         if (fabs((double)a * (double)b) < FLT_MIN) {
             p = copysignf(0.0, p);
         }
     } else {
         p = a * b;
     }
     return p;
 }
 #endif

 int test_contractions_float_0(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest(0);
 }

 int test_contractions_float_1(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest(1);
 }

 int test_contractions_float_2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest(2);
 }

 int test_contractions_float_3(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest(3);
 }

 int test_contractions_float_4(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest(4);
 }

 int test_contractions_float_5(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest(5);
 }

 int test_contractions_float_6(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest(6);
 }

 int test_contractions_float_7(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest(7);
 }

 int test_contractions_double_0(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest_Double(0);
 }

 int test_contractions_double_1(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest_Double(1);
 }

 int test_contractions_double_2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest_Double(2);
 }

 int test_contractions_double_3(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest_Double(3);
 }

 int test_contractions_double_4(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest_Double(4);
 }

 int test_contractions_double_5(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest_Double(5);
 }

 int test_contractions_double_6(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest_Double(6);
 }

 int test_contractions_double_7(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
 {
     return RunTest_Double(7);
 }

 test_definition test_list[] = {
     ADD_TEST( contractions_float_0 ),
     ADD_TEST( contractions_float_1 ),
     ADD_TEST( contractions_float_2 ),
     ADD_TEST( contractions_float_3 ),
     ADD_TEST( contractions_float_4 ),
     ADD_TEST( contractions_float_5 ),
     ADD_TEST( contractions_float_6 ),
     ADD_TEST( contractions_float_7 ),
     ADD_TEST( contractions_double_0 ),
     ADD_TEST( contractions_double_1 ),
     ADD_TEST( contractions_double_2 ),
     ADD_TEST( contractions_double_3 ),
     ADD_TEST( contractions_double_4 ),
     ADD_TEST( contractions_double_5 ),
     ADD_TEST( contractions_double_6 ),
     ADD_TEST( contractions_double_7 ),
 };

 const int test_num = ARRAY_SIZE( test_list );

 int main( int argc, const char **argv )
 {
     argc = parseCustomParam(argc, argv);
     if (argc == -1)
     {
         return -1;
     }

     int error = ParseArgs( argc, argv );

     if( !error )
     {
         error = runTestHarnessWithCheck( gArgCount, gArgList, test_num, test_list, true, 0, InitCL );
     }

     if( gQueue )
     {
         int flush_error = clFinish( gQueue );
         if( flush_error )
             log_error( "clFinish failed: %d\n", flush_error );
     }

     ReleaseCL();
     free( gArgList );

     return error;
 }


 static int ParseArgs( int argc, const char **argv )
 {
     gArgList = (const char **)calloc( argc, sizeof( char*) );

     if( NULL == gArgList )
     {
         vlog_error( "Failed to allocate memory for argList\n" );
         return 1;
     }

     gArgList[0] = argv[0];
     gArgCount = 1;

     int length_of_seed = 0;

     { // Extract the app name
         strncpy( appName, argv[0], MAXPATHLEN );

 #if (defined( __APPLE__ ) || defined(__linux__) || defined(__MINGW32__))
         char baseName[MAXPATHLEN];
         char *base = NULL;
         strncpy( baseName, argv[0], MAXPATHLEN );
         base = basename( baseName );
         if( NULL != base )
         {
             strncpy( appName, base, sizeof( appName )  );
             appName[ sizeof( appName ) -1 ] = '\0';
         }
 #elif defined (_WIN32)
         char fname[_MAX_FNAME + _MAX_EXT + 1];
         char ext[_MAX_EXT];

         errno_t err = _splitpath_s( argv[0], NULL, 0, NULL, 0,
                                    fname, _MAX_FNAME, ext, _MAX_EXT );
         if (err == 0) { // no error
             strcat (fname, ext); //just cat them, size of frame can keep both
             strncpy (appName, fname, sizeof(appName));
             appName[ sizeof( appName ) -1 ] = '\0';
         }
 #endif
     }

     for( int i = 1; i < argc; i++ )
     {
         const char *arg = argv[i];
         if( NULL == arg )
             break;

         if( arg[0] == '-' )
         {
             while( arg[1] != '\0' )
             {
                 arg++;
                 switch( *arg )
                 {
                     case 'h':
                         PrintUsage();
                         return -1;

                     case 's':
                         arg++;
                         gSeed = atoi( arg );
                         while (arg[length_of_seed] >='0' && arg[length_of_seed]<='9')
                             length_of_seed++;
                         gSeedSpecified = 1;
                         arg+=length_of_seed-1;
                         break;

                     case 'z':
                         gForceFTZ ^= 1;
                         break;

                     default:
                         vlog( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg );
                         PrintUsage();
                         return -1;
                 }
             }
         }
         else
         {
             gArgList[gArgCount] = arg;
             gArgCount++;
         }
     }
     vlog( "\n\nTest binary built %s %s\n", __DATE__, __TIME__ );

     PrintArch();

     return 0;
 }

 static void PrintUsage( void )
 {
     vlog( "%s [-z]: <optional: test names>\n", appName );
     vlog( "\tOptions:\n" );
     vlog( "\t\t-z\tToggle FTZ mode (Section 6.5.3) for all functions. (Set by device capabilities by default.)\n" );
     vlog( "\t\t-sNUMBER set random seed.\n");
     vlog( "\n" );
     vlog( "\tTest names:\n" );
     for( int i = 0; i < test_num; i++ )
     {
         vlog( "\t\t%s\n", test_list[i].name );
     }
 }

 const char *sizeNames[] = { "float", "float2", "float4", "float8", "float16" };
 const char *sizeNames_double[] = { "double", "double2", "double4", "double8", "double16" };

 test_status InitCL( cl_device_id device )
 {
     int error;
     uint32_t i, j;
     int *bufSkip = NULL;
     int isRTZ = 0;
     RoundingMode oldRoundMode = kDefaultRoundingMode;

     cl_device_fp_config floatCapabilities = 0;
     if( (error = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(floatCapabilities), &floatCapabilities, NULL)))
         floatCapabilities = 0;
     if(0 == (CL_FP_DENORM & floatCapabilities) )
         gForceFTZ ^= 1;

     // check for cl_khr_fp64
     gHasDouble = is_extension_available(device, "cl_khr_fp64" );

     if(0 == (CL_FP_INF_NAN & floatCapabilities) )
         gSkipNanInf = 1;

     // Embedded devices that flush to zero are allowed to have an undefined sign.
     if (gIsEmbedded && gForceFTZ)
         gIgnoreZeroSign = 1;

     gContext = clCreateContext( NULL, 1, &device, notify_callback, NULL, &error );
     if( NULL == gContext || error )
     {
         vlog_error( "clCreateDeviceGroup failed. %d\n", error );
         return TEST_FAIL;
     }

     gQueue = clCreateCommandQueue( gContext, device, 0, &error );
     if( NULL == gQueue || error )
     {
         vlog_error( "clCreateContext failed. %d\n", error );
         return TEST_FAIL;
     }

     // setup input buffers
     bufA = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
     bufB = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
     bufC = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
     bufD = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
     bufE = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );

     if( bufA == NULL    ||
        bufB == NULL    ||
        bufC == NULL    ||
        bufD == NULL    ||
        bufE == NULL    )
     {
         vlog_error( "clCreateArray failed for input\n" );
         return TEST_FAIL;
     }

     if( gHasDouble )
     {
         bufC_double = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
         bufD_double = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
         if( bufC_double == NULL    ||
            bufD_double == NULL    )
         {
             vlog_error( "clCreateArray failed for input DP\n" );
             return TEST_FAIL;
         }
     }

     const char *kernels[] = {
         "", "#pragma OPENCL FP_CONTRACT OFF\n"
         "__kernel void kernel1( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
         "{\n"
         "   int i = get_global_id(0);\n"
         "   out[i] = a[i] * b[i] + c[i];\n"
         "}\n"
         "\n"
         "__kernel void kernel2( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
         "{\n"
         "   int i = get_global_id(0);\n"
         "   out[i] = a[i] * b[i] - c[i];\n"
         "}\n"
         "\n"
         "__kernel void kernel3( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
         "{\n"
         "   int i = get_global_id(0);\n"
         "   out[i] = c[i] + a[i] * b[i];\n"
         "}\n"
         "\n"
         "__kernel void kernel4( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
         "{\n"
         "   int i = get_global_id(0);\n"
         "   out[i] = c[i] - a[i] * b[i];\n"
         "}\n"
         "\n"
         "__kernel void kernel5( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
         "{\n"
         "   int i = get_global_id(0);\n"
         "   out[i] = -(a[i] * b[i] + c[i]);\n"
         "}\n"
         "\n"
         "__kernel void kernel6( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
         "{\n"
         "   int i = get_global_id(0);\n"
         "   out[i] = -(a[i] * b[i] - c[i]);\n"
         "}\n"
         "\n"
         "__kernel void kernel7( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
         "{\n"
         "   int i = get_global_id(0);\n"
         "   out[i] = -(c[i] + a[i] * b[i]);\n"
         "}\n"
         "\n"
         "__kernel void kernel8( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
         "{\n"
         "   int i = get_global_id(0);\n"
         "   out[i] = -(c[i] - a[i] * b[i]);\n"
         "}\n"
         "\n" };

     for (i = 0; i < sizeof(sizeNames) / sizeof(sizeNames[0]); i++)
     {
         size_t strCount = sizeof(kernels) / sizeof(kernels[0]);
         kernels[0] = "";

         for (j = 2; j < strCount; j += 2) kernels[j] = sizeNames[i];
         error = create_single_kernel_helper(gContext, &gProgram[i], nullptr,
                                             strCount, kernels, nullptr);
         if (CL_SUCCESS != error || nullptr == gProgram[i])
         {
             log_error("Error: Unable to create test program! (%s) (in %s:%d)\n",
                       IGetErrorString(error), __FILE__, __LINE__);
             return TEST_FAIL;
         }
     }

     if (gHasDouble)
     {
         kernels[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
         for (i = 0; i < sizeof(sizeNames_double) / sizeof(sizeNames_double[0]);
              i++)
         {
             size_t strCount = sizeof(kernels) / sizeof(kernels[0]);

             for (j = 2; j < strCount; j += 2) kernels[j] = sizeNames_double[i];
             error = create_single_kernel_helper(gContext, &gProgram_double[i],
                                                 nullptr, strCount, kernels,
                                                 nullptr);
             if (CL_SUCCESS != error || nullptr == gProgram_double[i])
             {
                 log_error(
                     "Error: Unable to create test program! (%s) (in %s:%d)\n",
                     IGetErrorString(error), __FILE__, __LINE__);
                 return TEST_FAIL;
             }
         }
     }

     if( 0 == gSeedSpecified )
     {
         time_t currentTime = time( NULL );
         struct tm *t = localtime(&currentTime);
         gSeed = t->tm_sec + 60 * ( t->tm_min + 60 * (t->tm_hour + 24 * (t->tm_yday + 365 * t->tm_year)));
         gSeed = (uint32_t) (((uint64_t) gSeed * (uint64_t) gSeed ) >> 16);
     }
     gMTdata = init_genrand( gSeed );


     // Init bufA and bufB
     {
         buf1 = (float *)malloc( BUFFER_SIZE );
         buf2 = (float *)malloc( BUFFER_SIZE );
         buf3 = (float *)malloc( BUFFER_SIZE );
         buf4 = (float *)malloc( BUFFER_SIZE );
         buf5 = (float *)malloc( BUFFER_SIZE );
         buf6 = (float *)malloc( BUFFER_SIZE );

         bufSkip = (int *)malloc( BUFFER_SIZE );

         if( NULL == buf1 || NULL == buf2 || NULL == buf3 || NULL == buf4 || NULL == buf5 || NULL == buf6 || NULL == bufSkip)
         {
             vlog_error( "Out of memory initializing buffers\n" );
             return TEST_FAIL;
         }
         for( i = 0; i < sizeof( correct ) / sizeof( correct[0] ); i++ )
         {
             correct[i] = (float *)malloc( BUFFER_SIZE );
             skipTest[i] = (int *)malloc( BUFFER_SIZE );
             if(( NULL == correct[i] ) || ( NULL == skipTest[i]))
             {
                 vlog_error( "Out of memory initializing buffers 2\n" );
                 return TEST_FAIL;
             }
         }

         for( i = 0; i < BUFFER_SIZE / sizeof(float); i++ )
             ((uint32_t*) buf1)[i] = genrand_int32( gMTdata );

         if( (error = clEnqueueWriteBuffer(gQueue, bufA, CL_FALSE, 0, BUFFER_SIZE, buf1, 0, NULL, NULL) ))
         {
             vlog_error( "Failure %d at clEnqueueWriteBuffer1\n", error );
             return TEST_FAIL;
         }

         for( i = 0; i < BUFFER_SIZE / sizeof(float); i++ )
             ((uint32_t*) buf2)[i] = genrand_int32( gMTdata );

         if( (error = clEnqueueWriteBuffer(gQueue, bufB, CL_FALSE, 0, BUFFER_SIZE, buf2, 0, NULL, NULL) ))
         {
             vlog_error( "Failure %d at clEnqueueWriteBuffer2\n", error );
             return TEST_FAIL;
         }

         void *ftzInfo = NULL;
         if( gForceFTZ )
             ftzInfo = FlushToZero();
         if ((CL_FP_ROUND_TO_ZERO == get_default_rounding_mode(device)) && gIsEmbedded) {
             oldRoundMode = set_round(kRoundTowardZero, kfloat);
             isRTZ = 1;
         }
         float *f = (float*) buf1;
         float *f2 = (float*) buf2;
         float *f3 = (float*) buf3;
         float *f4 = (float*) buf4;
         for( i = 0; i < BUFFER_SIZE / sizeof(float); i++ )
         {
             float q = f[i];
             float q2 = f2[i];

             feclearexcept(FE_OVERFLOW);
 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
             // VS2005 might use x87 for straight multiplies, and we can't
             // turn that off
             f3[i] = sse_mul(q, q2);
             f4[i] = sse_mul(-q, q2);
 #elif defined(__PPC__)
             // None of the current generation PPC processors support HW
             // FTZ, emulate it in sw.
             f3[i] = ppc_mul(q, q2);
             f4[i] = ppc_mul(-q, q2);
 #else
             f3[i] = q * q2;
             f4[i] = -q * q2;
 #endif
             // Skip test if the device doesn't support infinities and NaN AND the result overflows
             // or either input is an infinity of NaN
             bufSkip[i] = (gSkipNanInf && ((FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW))) ||
                                           (fabsf(q)  == FLT_MAX) || (q  != q)  ||
                                           (fabsf(q2) == FLT_MAX) || (q2 != q2)));
         }

         if( gForceFTZ )
             UnFlushToZero(ftzInfo);

     if (isRTZ)
       (void)set_round(oldRoundMode, kfloat);


         if( (error = clEnqueueWriteBuffer(gQueue, bufC, CL_FALSE, 0, BUFFER_SIZE, buf3, 0, NULL, NULL) ))
         {
             vlog_error( "Failure %d at clEnqueueWriteBuffer3\n", error );
             return TEST_FAIL;
         }
         if( (error = clEnqueueWriteBuffer(gQueue, bufD, CL_FALSE, 0, BUFFER_SIZE, buf4, 0, NULL, NULL) ))
         {
             vlog_error( "Failure %d at clEnqueueWriteBuffer4\n", error );
             return TEST_FAIL;
         }

         // Fill the buffers with NaN
         float *f5 = (float*) buf5;
         float nan_val = nanf("");
         for( i = 0; i < BUFFER_SIZE / sizeof( float ); i++ )
             f5[i] = nan_val;

         // calculate reference results
         for( i = 0; i < BUFFER_SIZE / sizeof( float ); i++ )
         {
             for ( j=0; j<8; j++)
             {
                 feclearexcept(FE_OVERFLOW);
                 switch (j)
                 {
 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
                         // VS2005 might use x87 for straight add/sub, and we can't
                         // turn that off
                     case 0:
                         correct[0][i] = sse_add(buf3[i],buf4[i]); break;
                     case 1:
                         correct[1][i] = sse_sub(buf3[i],buf3[i]); break;
                     case 2:
                         correct[2][i] = sse_add(buf4[i],buf3[i]); break;
                     case 3:
                         correct[3][i] = sse_sub(buf3[i],buf3[i]); break;
                     case 4:
                         correct[4][i] = -sse_add(buf3[i],buf4[i]); break;
                     case 5:
                         correct[5][i] = -sse_sub(buf3[i],buf3[i]); break;
                     case 6:
                         correct[6][i] = -sse_add(buf4[i],buf3[i]); break;
                     case 7:
                         correct[7][i] = -sse_sub(buf3[i],buf3[i]); break;
 #else
                     case 0:
                         correct[0][i] = buf3[i] + buf4[i]; break;
                     case 1:
                         correct[1][i] = buf3[i] - buf3[i]; break;
                     case 2:
                         correct[2][i] = buf4[i] + buf3[i]; break;
                     case 3:
                         correct[3][i] = buf3[i] - buf3[i]; break;
                     case 4:
                         correct[4][i] = -(buf3[i] + buf4[i]); break;
                     case 5:
                         correct[5][i] = -(buf3[i] - buf3[i]); break;
                     case 6:
                         correct[6][i] = -(buf4[i] + buf3[i]); break;
                     case 7:
                         correct[7][i] = -(buf3[i] - buf3[i]); break;
 #endif
                 }
                 // Further skip test inputs if the device doesn support infinities AND NaNs
                 // resulting sum overflows
                 skipTest[j][i] = (bufSkip[i] ||
                                   (gSkipNanInf && (FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)))));

 #if defined(__PPC__)
                 // Since the current Power processors don't emulate flush to zero in HW,
                 // it must be emulated in SW instead.
                 if (gForceFTZ)
                 {
                     if ((fabsf(correct[j][i]) < FLT_MIN) && (correct[j][i] != 0.0f))
                         correct[j][i] = copysignf(0.0f, correct[j][i]);
                 }
 #endif
             }
         }
         if( gHasDouble )
         {
             // Spec requires correct non-flushed results
             // for doubles. We disable FTZ if this is default on
             // the platform (like ARM) for reference result computation
             // It is no-op if platform default is not FTZ (e.g. x86)
             FPU_mode_type oldMode;
             DisableFTZ( &oldMode );

             buf3_double = (double *)malloc( BUFFER_SIZE );
             buf4_double = (double *)malloc( BUFFER_SIZE );
             buf5_double = (double *)malloc( BUFFER_SIZE );
             buf6_double = (double *)malloc( BUFFER_SIZE );
             if( NULL == buf3_double || NULL == buf4_double || NULL == buf5_double || NULL == buf6_double )
             {
                 vlog_error( "Out of memory initializing DP buffers\n" );
                 return TEST_FAIL;
             }
             for( i = 0; i < sizeof( correct_double ) / sizeof( correct_double[0] ); i++ )
             {
                 correct_double[i] = (double *)malloc( BUFFER_SIZE );
                 if( NULL == correct_double[i] )
                 {
                     vlog_error( "Out of memory initializing DP buffers 2\n" );
                     return TEST_FAIL;
                 }
             }


             double *f  = (double*) buf1;
             double *f2 = (double*) buf2;
             double *f3 = (double*) buf3_double;
             double *f4 = (double*) buf4_double;
             for( i = 0; i < BUFFER_SIZE / sizeof(double); i++ )
             {
                 double q = f[i];
                 double q2 = f2[i];
 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
                 // VS2005 might use x87 for straight multiplies, and we can't
                 // turn that off
                 f3[i] = sse_mul_sd(q, q2);
                 f4[i] = sse_mul_sd(-q, q2);
 #else
                 f3[i] = q * q2;
                 f4[i] = -q * q2;
 #endif
             }

             if( (error = clEnqueueWriteBuffer(gQueue, bufC_double, CL_FALSE, 0, BUFFER_SIZE, buf3_double, 0, NULL, NULL) ))
             {
                 vlog_error( "Failure %d at clEnqueueWriteBuffer3\n", error );
                 return TEST_FAIL;
             }
             if( (error = clEnqueueWriteBuffer(gQueue, bufD_double, CL_FALSE, 0, BUFFER_SIZE, buf4_double, 0, NULL, NULL) ))
             {
                 vlog_error( "Failure %d at clEnqueueWriteBuffer4\n", error );
                 return TEST_FAIL;
             }

             // Fill the buffers with NaN
             double *f5 = (double*) buf5_double;
             double nan_val = nanf("");
             for( i = 0; i < BUFFER_SIZE / sizeof( double ); i++ )
                 f5[i] = nan_val;

             // calculate reference results
             for( i = 0; i < BUFFER_SIZE / sizeof( double ); i++ )
             {
 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
                 // VS2005 might use x87 for straight add/sub, and we can't
                 // turn that off
                 correct_double[0][i] = sse_add_sd(buf3_double[i],buf4_double[i]);
                 correct_double[1][i] = sse_sub_sd(buf3_double[i],buf3_double[i]);
                 correct_double[2][i] = sse_add_sd(buf4_double[i],buf3_double[i]);
                 correct_double[3][i] = sse_sub_sd(buf3_double[i],buf3_double[i]);
                 correct_double[4][i] = -sse_add_sd(buf3_double[i],buf4_double[i]);
                 correct_double[5][i] = -sse_sub_sd(buf3_double[i],buf3_double[i]);
                 correct_double[6][i] = -sse_add_sd(buf4_double[i],buf3_double[i]);
                 correct_double[7][i] = -sse_sub_sd(buf3_double[i],buf3_double[i]);
 #else
                 correct_double[0][i] = buf3_double[i] + buf4_double[i];
                 correct_double[1][i] = buf3_double[i] - buf3_double[i];
                 correct_double[2][i] = buf4_double[i] + buf3_double[i];
                 correct_double[3][i] = buf3_double[i] - buf3_double[i];
                 correct_double[4][i] = -(buf3_double[i] + buf4_double[i]);
                 correct_double[5][i] = -(buf3_double[i] - buf3_double[i]);
                 correct_double[6][i] = -(buf4_double[i] + buf3_double[i]);
                 correct_double[7][i] = -(buf3_double[i] - buf3_double[i]);
 #endif
             }

             // Restore previous FP state since we modified it for
             // reference result computation (see DisableFTZ call above)
             RestoreFPState(&oldMode);
         }
     }

     char c[1000];
     static const char *no_yes[] = { "NO", "YES" };
     vlog( "\nCompute Device info:\n" );
     clGetDeviceInfo( device,  CL_DEVICE_NAME, sizeof(c), (void *)&c, NULL);
     vlog( "\tDevice Name: %s\n", c );
     clGetDeviceInfo( device,  CL_DEVICE_VENDOR, sizeof(c), (void *)&c, NULL);
     vlog( "\tVendor: %s\n", c );
     clGetDeviceInfo( device,  CL_DEVICE_VERSION, sizeof(c), (void *)&c, NULL);
     vlog( "\tDevice Version: %s\n", c );
     clGetDeviceInfo( device, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL);
     vlog( "\tCL C Version: %s\n", c );
     clGetDeviceInfo( device,  CL_DRIVER_VERSION, sizeof(c), (void *)&c, NULL);
     vlog( "\tDriver Version: %s\n", c );
     vlog( "\tSubnormal values supported? %s\n", no_yes[0 != (CL_FP_DENORM & floatCapabilities)] );
     vlog( "\tTesting with FTZ mode ON? %s\n", no_yes[0 != gForceFTZ] );
     vlog( "\tTesting Doubles? %s\n", no_yes[0 != gHasDouble] );
     vlog( "\tRandom Number seed: 0x%8.8x\n", gSeed );
     vlog( "\n\n" );

     return TEST_PASS;
 }

 static void ReleaseCL( void )
 {
     clReleaseMemObject(bufA);
     clReleaseMemObject(bufB);
     clReleaseMemObject(bufC);
     clReleaseMemObject(bufD);
     clReleaseMemObject(bufE);
     clReleaseProgram(gProgram[0]);
     clReleaseProgram(gProgram[1]);
     clReleaseProgram(gProgram[2]);
     clReleaseProgram(gProgram[3]);
     clReleaseProgram(gProgram[4]);
     if( gHasDouble )
     {
         clReleaseMemObject(bufC_double);
         clReleaseMemObject(bufD_double);
         clReleaseProgram(gProgram_double[0]);
         clReleaseProgram(gProgram_double[1]);
         clReleaseProgram(gProgram_double[2]);
         clReleaseProgram(gProgram_double[3]);
         clReleaseProgram(gProgram_double[4]);
     }
     clReleaseCommandQueue(gQueue);
     clReleaseContext(gContext);
 }


 static int RunTest( int testNumber )
 {
     size_t i;
     int error = 0;
     cl_mem args[4];
     float *c;
     const char *kernelName[] = { "kernel1", "kernel2", "kernel3", "kernel4",
         "kernel5", "kernel6", "kernel7", "kernel8" };
     switch( testNumber )
     {
         case 0:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD;     c = buf4;   break;      // a * b + c
         case 1:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC;     c = buf3;   break;
         case 2:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD;     c = buf4;   break;
         case 3:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC;     c = buf3;   break;
         case 4:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD;     c = buf4;   break;
         case 5:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC;     c = buf3;   break;
         case 6:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD;     c = buf4;   break;
         case 7:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC;     c = buf3;   break;
         default:
             vlog_error( "Unknown test case %d passed to RunTest\n", testNumber );
             return -1;
     }


     int vectorSize;
     for( vectorSize = 0; vectorSize < 5; vectorSize++ )
     {
         cl_kernel k = clCreateKernel( gProgram[ vectorSize ], kernelName[ testNumber ], &error );
         if( NULL == k || error )
         {
             vlog_error( "%d) Unable to find kernel \"%s\" for vector size: %d\n", error, kernelName[ testNumber ], 1 << vectorSize );
             return -2;
         }

         // set the kernel args
         for( i = 0; i < sizeof(args ) / sizeof( args[0]); i++ )
             if( (error = clSetKernelArg(k, i, sizeof( cl_mem ), args + i) ))
             {
                 vlog_error( "Error %d setting kernel arg # %ld\n", error, i );
                 return error;
             }

         // write NaNs to the result array
         if( (error = clEnqueueWriteBuffer(gQueue, bufE, CL_TRUE, 0, BUFFER_SIZE, buf5, 0, NULL, NULL) ))
         {
             vlog_error( "Failure %d at clWriteArray %d\n", error, testNumber );
             return error;
         }

         // execute the kernel
         size_t gDim[3] = { BUFFER_SIZE / (sizeof( cl_float ) * (1<<vectorSize)), 0, 0 };
         if( ((error = clEnqueueNDRangeKernel(gQueue, k, 1, NULL, gDim, NULL, 0, NULL, NULL) )))
         {
             vlog_error( "Got Error # %d trying to execture kernel\n", error );
             return error;
         }

         // read the data back
         if( (error = clEnqueueReadBuffer(gQueue, bufE, CL_TRUE, 0, BUFFER_SIZE, buf6, 0, NULL, NULL ) ))
         {
             vlog_error( "Failure %d at clReadArray %d\n", error, testNumber );
             return error;
         }

         // verify results
         float *test = (float*) buf6;
         float *a = (float*) buf1;
         float *b = (float*) buf2;
         for( i = 0; i < BUFFER_SIZE / sizeof( float ); i++ )
         {
             if( isnan(test[i]) && isnan(correct[testNumber][i] ) )
                 continue;

             if( skipTest[testNumber][i] )
                 continue;

             // sign of zero must be correct
             if(( ((uint32_t*) test)[i] != ((uint32_t*) correct[testNumber])[i] ) &&
                !(gIgnoreZeroSign && (test[i] == 0.0f) && (correct[testNumber][i] == 0.0f)) )
             {
                 switch( testNumber )
                 {
                         // Zeros for these should be positive
                     case 0:     vlog_error( "%ld) Error for %s %s: %a * %a + %a =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
                     case 1:     vlog_error( "%ld) Error for %s %s: %a * %a - %a =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
                     case 2:     vlog_error( "%ld) Error for %s %s: %a + %a * %a =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
                     case 3:     vlog_error( "%ld) Error for %s %s: %a - %a * %a =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;

                         // Zeros for these should be negative
                     case 4:     vlog_error( "%ld) Error for %s %s: -(%a * %a + %a) =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
                     case 5:     vlog_error( "%ld) Error for %s %s: -(%a * %a - %a) =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
                     case 6:     vlog_error( "%ld) Error for %s %s: -(%a + %a * %a) =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
                     case 7:     vlog_error( "%ld) Error for %s %s: -(%a - %a * %a) =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
                     default:
                         vlog_error( "error: Unknown test number!\n" );
                         clReleaseKernel(k);
                         return -2;
                 }
             }
         }

         clReleaseKernel(k);
     }

     return error;
 }

 static int RunTest_Double( int testNumber )
 {
     if( !gHasDouble )
     {
         vlog("Double is not supported, test not run.\n");
         return 0;
     }

     size_t i;
     int error = 0;
     cl_mem args[4];
     double *c;
     const char *kernelName[] = { "kernel1", "kernel2", "kernel3", "kernel4",
         "kernel5", "kernel6", "kernel7", "kernel8" };

     switch( testNumber )
     {
         case 0:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD_double;     c = buf4_double;   break;      // a * b + c
         case 1:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC_double;     c = buf3_double;   break;
         case 2:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD_double;     c = buf4_double;   break;
         case 3:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC_double;     c = buf3_double;   break;
         case 4:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD_double;     c = buf4_double;   break;
         case 5:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC_double;     c = buf3_double;   break;
         case 6:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD_double;     c = buf4_double;   break;
         case 7:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC_double;     c = buf3_double;   break;
         default:
             vlog_error( "Unknown test case %d passed to RunTest\n", testNumber );
             return -1;
     }

     int vectorSize;
     for( vectorSize = 0; vectorSize < 5; vectorSize++ )
     {
         cl_kernel k = clCreateKernel( gProgram_double[ vectorSize ], kernelName[ testNumber ], &error );
         if( NULL == k || error )
         {
             vlog_error( "%d) Unable to find kernel \"%s\" for vector size: %d\n", error, kernelName[ testNumber ], 1 << vectorSize );
             return -2;
         }

         // set the kernel args
         for( i = 0; i < sizeof(args ) / sizeof( args[0]); i++ )
             if( (error = clSetKernelArg(k, i, sizeof( cl_mem ), args + i) ))
             {
                 vlog_error( "Error %d setting kernel arg # %ld\n", error, i );
                 return error;
             }

         // write NaNs to the result array
         if( (error = clEnqueueWriteBuffer(gQueue, bufE, CL_FALSE, 0, BUFFER_SIZE, buf5_double, 0, NULL, NULL) ))
         {
             vlog_error( "Failure %d at clWriteArray %d\n", error, testNumber );
             return error;
         }

         // execute the kernel
         size_t gDim[3] = { BUFFER_SIZE / (sizeof( cl_double ) * (1<<vectorSize)), 0, 0 };
         if( ((error = clEnqueueNDRangeKernel(gQueue, k, 1, NULL, gDim, NULL, 0, NULL, NULL) )))
         {
             vlog_error( "Got Error # %d trying to execture kernel\n", error );
             return error;
         }

         // read the data back
         if( (error = clEnqueueReadBuffer(gQueue, bufE, CL_TRUE, 0, BUFFER_SIZE, buf6_double, 0, NULL, NULL ) ))
         {
             vlog_error( "Failure %d at clReadArray %d\n", error, testNumber );
             return error;
         }

         // verify results
         double *test = (double*) buf6_double;
         double *a = (double*) buf1;
         double *b = (double*) buf2;
         for( i = 0; i < BUFFER_SIZE / sizeof( double ); i++ )
         {
             if( isnan(test[i]) && isnan(correct_double[testNumber][i] ) )
                 continue;

             // sign of zero must be correct
             if( ((uint64_t*) test)[i] != ((uint64_t*) correct_double[testNumber])[i] )
             {
                 switch( testNumber )
                 {
                         // Zeros for these should be positive
                     case 0:     vlog_error( "%ld) Error for %s %s: %a * %a + %a =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       return -1;
                     case 1:     vlog_error( "%ld) Error for %s %s: %a * %a - %a =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       return -1;
                     case 2:     vlog_error( "%ld) Error for %s %s: %a + %a * %a =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       return -1;
                     case 3:     vlog_error( "%ld) Error for %s %s: %a - %a * %a =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       return -1;

                         // Zeros for these should be negative
                     case 4:     vlog_error( "%ld) Error for %s %s: -(%a * %a + %a) =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       return -1;
                     case 5:     vlog_error( "%ld) Error for %s %s: -(%a * %a - %a) =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       return -1;
                     case 6:     vlog_error( "%ld) Error for %s %s: -(%a + %a * %a) =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       return -1;
                     case 7:     vlog_error( "%ld) Error for %s %s: -(%a - %a * %a) =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       return -1;
                     default:
                         vlog_error( "error: Unknown test number!\n" );
                         return -2;
                 }
             }
         }

         clReleaseKernel(k);
     }

     return error;
 }