| // |
| // Copyright (c) 2017 The Khronos Group Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| #include "harness/compat.h" |
| #include "harness/rounding_mode.h" |
| #include "harness/ThreadPool.h" |
| #include "harness/testHarness.h" |
| #include "harness/kernelHelpers.h" |
| #include "harness/parseParameters.h" |
| #if defined(__APPLE__) |
| #include <sys/sysctl.h> |
| #endif |
| |
| #if defined( __linux__ ) |
| #include <unistd.h> |
| #include <sys/syscall.h> |
| #include <linux/sysctl.h> |
| #endif |
| #if defined(__linux__) |
| #include <sys/param.h> |
| #include <libgen.h> |
| #endif |
| |
| #include "mingw_compat.h" |
| #if defined(__MINGW32__) |
| #include <sys/param.h> |
| #endif |
| |
| #include <stdarg.h> |
| #include <stdio.h> |
| #include <string.h> |
| #if !defined(_WIN32) |
| #include <libgen.h> |
| #include <sys/mman.h> |
| #endif |
| #include <time.h> |
| |
| #include "Sleep.h" |
| #include "basic_test_conversions.h" |
| |
| #if (defined(_WIN32) && defined (_MSC_VER)) |
| // need for _controlfp_s and rouinding modes in RoundingMode |
| #include "harness/testHarness.h" |
| #endif |
| |
| #pragma mark - |
| #pragma mark globals |
| |
| #define BUFFER_SIZE (1024*1024) |
| #define kPageSize 4096 |
| #define EMBEDDED_REDUCTION_FACTOR 16 |
| #define PERF_LOOP_COUNT 100 |
| |
| #define kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */) |
| |
| #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__) |
| #include "fplib.h" |
| extern bool qcom_sat; |
| extern roundingMode qcom_rm; |
| #endif |
| |
| const char ** argList = NULL; |
| int argCount = 0; |
| cl_context gContext = NULL; |
| cl_command_queue gQueue = NULL; |
| char appName[64] = "ctest"; |
| int gStartTestNumber = -1; |
| int gEndTestNumber = 0; |
| #if defined( __APPLE__ ) |
| int gTimeResults = 1; |
| #else |
| int gTimeResults = 0; |
| #endif |
| int gReportAverageTimes = 0; |
| void *gIn = NULL; |
| void *gRef = NULL; |
| void *gAllowZ = NULL; |
| void *gOut[ kCallStyleCount ] = { NULL }; |
| cl_mem gInBuffer; |
| cl_mem gOutBuffers[ kCallStyleCount ]; |
| size_t gComputeDevices = 0; |
| uint32_t gDeviceFrequency = 0; |
| int gWimpyMode = 0; |
| int gWimpyReductionFactor = 128; |
| int gSkipTesting = 0; |
| int gForceFTZ = 0; |
| int gMultithread = 1; |
| int gIsRTZ = 0; |
| uint32_t gSimdSize = 1; |
| int gHasDouble = 0; |
| int gTestDouble = 1; |
| const char * sizeNames[] = { "", "", "2", "3", "4", "8", "16" }; |
| const int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 }; |
| int gMinVectorSize = 0; |
| int gMaxVectorSize = sizeof(vectorSizes) / sizeof( vectorSizes[0] ); |
| static MTdata gMTdata; |
| |
| #pragma mark - |
| #pragma mark Declarations |
| |
| static int ParseArgs( int argc, const char **argv ); |
| static void PrintUsage( void ); |
| test_status InitCL( cl_device_id device ); |
| static int GetTestCase( const char *name, Type *outType, Type *inType, SaturationMode *sat, RoundingMode *round ); |
| static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMode sat, RoundingMode round, MTdata d ); |
| static cl_program MakeProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel ); |
| static int RunKernel( cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount ); |
| |
| void *FlushToZero( void ); |
| void UnFlushToZero( void *); |
| |
| static cl_program CreateImplicitConvertProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, char testName[256], cl_int *error ); |
| static cl_program CreateStandardProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, char testName[256], cl_int *error ); |
| |
| |
| // Windows (since long double got deprecated) sets the x87 to 53-bit precision |
| // (that's x87 default state). This causes problems with the tests that |
| // convert long and ulong to float and double or otherwise deal with values |
| // that need more precision than 53-bit. So, set the x87 to 64-bit precision. |
| static inline void Force64BitFPUPrecision(void) |
| { |
| #if __MINGW32__ |
| // The usual method is to use _controlfp as follows: |
| // #include <float.h> |
| // _controlfp(_PC_64, _MCW_PC); |
| // |
| // _controlfp is available on MinGW32 but not on MinGW64. Instead of having |
| // divergent code just use inline assembly which works for both. |
| unsigned short int orig_cw = 0; |
| unsigned short int new_cw = 0; |
| __asm__ __volatile__ ("fstcw %0":"=m" (orig_cw)); |
| new_cw = orig_cw | 0x0300; // set precision to 64-bit |
| __asm__ __volatile__ ("fldcw %0"::"m" (new_cw)); |
| #else |
| /* Implement for other platforms if needed */ |
| #endif |
| } |
| |
| int test_conversions( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements ) |
| { |
| int error, i, testNumber = -1; |
| int startMinVectorSize = gMinVectorSize; |
| Type inType, outType; |
| RoundingMode round; |
| SaturationMode sat; |
| |
| if( argCount ) |
| { |
| for( i = 0; i < argCount; i++ ) |
| { |
| if( GetTestCase( argList[i], &outType, &inType, &sat, &round ) ) |
| { |
| vlog_error( "\n\t\t**** ERROR: Unable to parse function name %s. Skipping.... *****\n\n", argList[i] ); |
| continue; |
| } |
| |
| // skip double if we don't have it |
| if( !gTestDouble && (inType == kdouble || outType == kdouble ) ) |
| { |
| if( gHasDouble ) |
| { |
| vlog_error( "\t *** convert_%sn%s%s( %sn ) FAILED ** \n", gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] ); |
| vlog( "\t\tcl_khr_fp64 enabled, but double testing turned off.\n" ); |
| } |
| |
| continue; |
| } |
| |
| // skip longs on embedded |
| if( !gHasLong && (inType == klong || outType == klong || inType == kulong || outType == kulong) ) |
| { |
| continue; |
| } |
| |
| // Skip the implicit converts if the rounding mode is not default or test is saturated |
| if( 0 == startMinVectorSize ) |
| { |
| if( sat || round != kDefaultRoundingMode ) |
| gMinVectorSize = 1; |
| else |
| gMinVectorSize = 0; |
| } |
| |
| if( ( error = DoTest( device, outType, inType, sat, round, gMTdata ) ) ) |
| { |
| vlog_error( "\t *** convert_%sn%s%s( %sn ) FAILED ** \n", gTypeNames[outType], gSaturationNames[sat], gRoundingModeNames[round], gTypeNames[inType] ); |
| } |
| } |
| } |
| else |
| { |
| for( outType = (Type)0; outType < kTypeCount; outType = (Type)(outType+1) ) |
| { |
| for( inType = (Type)0; inType < kTypeCount; inType = (Type)(inType+1) ) |
| { |
| // skip longs on embedded |
| if( !gHasLong && (inType == klong || outType == klong || inType == kulong || outType == kulong) ) |
| { |
| continue; |
| } |
| |
| for( sat = (SaturationMode)0; sat < kSaturationModeCount; sat = (SaturationMode)(sat+1) ) |
| { |
| //skip illegal saturated conversions to float type |
| if( kSaturated == sat && ( outType == kfloat || outType == kdouble ) ) |
| { |
| continue; |
| } |
| |
| for( round = (RoundingMode)0; round < kRoundingModeCount; round = (RoundingMode)(round+1) ) |
| { |
| if( ++testNumber < gStartTestNumber ) |
| { |
| // vlog( "%d) skipping convert_%sn%s%s( %sn )\n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] ); |
| continue; |
| } |
| else |
| { |
| if( gEndTestNumber > 0 && testNumber >= gEndTestNumber ) |
| { |
| goto exit; |
| } |
| } |
| |
| vlog( "%d) Testing convert_%sn%s%s( %sn ):\n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] ); |
| |
| // skip double if we don't have it |
| if( ! gTestDouble && (inType == kdouble || outType == kdouble ) ) |
| { |
| if( gHasDouble ) |
| { |
| vlog_error( "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] ); |
| vlog( "\t\tcl_khr_fp64 enabled, but double testing turned off.\n" ); |
| } |
| continue; |
| } |
| |
| // Skip the implicit converts if the rounding mode is not default or test is saturated |
| if( 0 == startMinVectorSize ) |
| { |
| if( sat || round != kDefaultRoundingMode ) |
| gMinVectorSize = 1; |
| else |
| gMinVectorSize = 0; |
| } |
| |
| if( ( error = DoTest( device, outType, inType, sat, round, gMTdata ) ) ) |
| { |
| vlog_error( "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n", testNumber, gTypeNames[outType], gSaturationNames[sat], gRoundingModeNames[round], gTypeNames[inType] ); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| exit: |
| return gFailCount; |
| } |
| |
| test_definition test_list[] = { |
| ADD_TEST( conversions ), |
| }; |
| |
| const int test_num = ARRAY_SIZE( test_list ); |
| |
| #pragma mark - |
| |
| int main (int argc, const char **argv ) |
| { |
| int error; |
| cl_uint seed = (cl_uint) time( NULL ); |
| |
| argc = parseCustomParam(argc, argv); |
| if (argc == -1) |
| { |
| return 1; |
| } |
| |
| if( (error = ParseArgs( argc, argv )) ) |
| return error; |
| |
| //Turn off sleep so our tests run to completion |
| PreventSleep(); |
| atexit( ResumeSleep ); |
| |
| if(!gMultithread) |
| SetThreadCount(1); |
| |
| #if defined(_MSC_VER) && defined(_M_IX86) |
| // VS2005 (and probably others, since long double got deprecated) sets |
| // the x87 to 53-bit precision. This causes problems with the tests |
| // that convert long and ulong to float and double, since they deal |
| // with values that need more precision than that. So, set the x87 |
| // to 64-bit precision. |
| unsigned int ignored; |
| _controlfp_s(&ignored, _PC_64, _MCW_PC); |
| #endif |
| |
| vlog( "===========================================================\n" ); |
| vlog( "Random seed: %u\n", seed ); |
| gMTdata = init_genrand( seed ); |
| |
| const char* arg[] = {argv[0]}; |
| int ret = runTestHarnessWithCheck( 1, arg, test_num, test_list, true, 0, InitCL ); |
| |
| free_mtdata( gMTdata ); |
| if (gQueue) |
| { |
| error = clFinish(gQueue); |
| if (error) vlog_error("clFinish failed: %d\n", error); |
| } |
| |
| clReleaseMemObject(gInBuffer); |
| |
| for( int i = 0; i < kCallStyleCount; i++ ) { |
| clReleaseMemObject(gOutBuffers[i]); |
| } |
| clReleaseCommandQueue(gQueue); |
| clReleaseContext(gContext); |
| |
| return ret; |
| } |
| |
| #pragma mark - |
| #pragma mark setup |
| |
| static int ParseArgs( int argc, const char **argv ) |
| { |
| int i; |
| argList = (const char **)calloc( argc - 1, sizeof( char*) ); |
| argCount = 0; |
| |
| if( NULL == argList && argc > 1 ) |
| return -1; |
| |
| #if (defined( __APPLE__ ) || defined(__linux__) || defined (__MINGW32__)) |
| { // Extract the app name |
| char baseName[ MAXPATHLEN ]; |
| strncpy( baseName, argv[0], MAXPATHLEN ); |
| char *base = basename( baseName ); |
| if( NULL != base ) |
| { |
| strncpy( appName, base, sizeof( appName ) ); |
| appName[ sizeof( appName ) -1 ] = '\0'; |
| } |
| } |
| #elif defined (_WIN32) |
| { |
| char fname[_MAX_FNAME + _MAX_EXT + 1]; |
| char ext[_MAX_EXT]; |
| |
| errno_t err = _splitpath_s( argv[0], NULL, 0, NULL, 0, |
| fname, _MAX_FNAME, ext, _MAX_EXT ); |
| if (err == 0) { // no error |
| strcat (fname, ext); //just cat them, size of frame can keep both |
| strncpy (appName, fname, sizeof(appName)); |
| appName[ sizeof( appName ) -1 ] = '\0'; |
| } |
| } |
| #endif |
| |
| vlog( "\n%s", appName ); |
| for( i = 1; i < argc; i++ ) |
| { |
| const char *arg = argv[i]; |
| if( NULL == arg ) |
| break; |
| |
| vlog( "\t%s", arg ); |
| if( arg[0] == '-' ) |
| { |
| arg++; |
| while( *arg != '\0' ) |
| { |
| switch( *arg ) |
| { |
| case 'd': |
| gTestDouble ^= 1; |
| break; |
| case 'l': |
| gSkipTesting ^= 1; |
| break; |
| case 'm': |
| gMultithread ^= 1; |
| break; |
| case 'w': |
| gWimpyMode ^= 1; |
| break; |
| case '[': |
| parseWimpyReductionFactor(arg, gWimpyReductionFactor); |
| break; |
| case 'z': |
| gForceFTZ ^= 1; |
| break; |
| case 't': |
| gTimeResults ^= 1; |
| break; |
| case 'a': |
| gReportAverageTimes ^= 1; |
| break; |
| case '1': |
| if( arg[1] == '6' ) |
| { |
| gMinVectorSize = 6; |
| gMaxVectorSize = 7; |
| arg++; |
| } |
| else |
| { |
| gMinVectorSize = 0; |
| gMaxVectorSize = 2; |
| } |
| break; |
| |
| case '2': |
| gMinVectorSize = 2; |
| gMaxVectorSize = 3; |
| break; |
| |
| case '3': |
| gMinVectorSize = 3; |
| gMaxVectorSize = 4; |
| break; |
| |
| case '4': |
| gMinVectorSize = 4; |
| gMaxVectorSize = 5; |
| break; |
| |
| case '8': |
| gMinVectorSize = 5; |
| gMaxVectorSize = 6; |
| break; |
| |
| default: |
| vlog( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg ); |
| PrintUsage(); |
| return -1; |
| } |
| arg++; |
| } |
| } |
| else |
| { |
| char *t = NULL; |
| long number = strtol( arg, &t, 0 ); |
| if( t != arg ) |
| { |
| if( gStartTestNumber != -1 ) |
| gEndTestNumber = gStartTestNumber + (int) number; |
| else |
| gStartTestNumber = (int) number; |
| } |
| else |
| { |
| argList[ argCount ] = arg; |
| argCount++; |
| } |
| } |
| } |
| |
| // Check for the wimpy mode environment variable |
| if (getenv("CL_WIMPY_MODE")) { |
| vlog( "\n" ); |
| vlog( "*** Detected CL_WIMPY_MODE env ***\n" ); |
| gWimpyMode = 1; |
| } |
| |
| vlog( "\n" ); |
| |
| vlog( "Test binary built %s %s\n", __DATE__, __TIME__ ); |
| |
| PrintArch(); |
| |
| if( gWimpyMode ) |
| { |
| vlog( "\n" ); |
| vlog( "*** WARNING: Testing in Wimpy mode! ***\n" ); |
| vlog( "*** Wimpy mode is not sufficient to verify correctness. ***\n" ); |
| vlog( "*** It gives warm fuzzy feelings and then nevers calls. ***\n\n" ); |
| vlog("*** Wimpy Reduction Factor: %-27u ***\n\n", gWimpyReductionFactor); |
| } |
| |
| return 0; |
| } |
| |
| static void PrintUsage( void ) |
| { |
| int i; |
| vlog( "%s [-wz#]: <optional: test names>\n", appName ); |
| vlog( "\ttest names:\n" ); |
| vlog( "\t\tdestFormat<_sat><_round>_sourceFormat\n" ); |
| vlog( "\t\t\tPossible format types are:\n\t\t\t\t" ); |
| for( i = 0; i < kTypeCount; i++ ) |
| vlog( "%s, ", gTypeNames[i] ); |
| vlog( "\n\n\t\t\tPossible saturation values are: (empty) and _sat\n" ); |
| vlog( "\t\t\tPossible rounding values are:\n\t\t\t\t(empty), " ); |
| for( i = 1; i < kRoundingModeCount; i++ ) |
| vlog( "%s, ", gRoundingModeNames[i] ); |
| vlog( "\n\t\t\tExamples:\n" ); |
| vlog( "\t\t\t\tulong_short converts short to ulong\n" ); |
| vlog( "\t\t\t\tchar_sat_rte_float converts float to char with saturated clipping in round to nearest rounding mode\n\n" ); |
| vlog( "\toptions:\n" ); |
| vlog( "\t\t-d\tToggle testing of double precision. On by default if cl_khr_fp64 is enabled, ignored otherwise.\n" ); |
| vlog( "\t\t-l\tToggle link check mode. When on, testing is skipped, and we just check to see that the kernels build. (Off by default.)\n" ); |
| vlog( "\t\t-m\tToggle Multithreading. (On by default.)\n" ); |
| vlog( "\t\t-w\tToggle wimpy mode. When wimpy mode is on, we run a very small subset of the tests for each fn. NOT A VALID TEST! (Off by default.)\n" ); |
| vlog(" \t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is 1-12, default factor(%u)\n", gWimpyReductionFactor); |
| vlog( "\t\t-z\tToggle flush to zero mode (Default: per device)\n" ); |
| vlog( "\t\t-#\tTest just vector size given by #, where # is an element of the set {1,2,3,4,8,16}\n" ); |
| vlog( "\n" ); |
| vlog( "You may also pass the number of the test on which to start.\nA second number can be then passed to indicate how many tests to run\n\n" ); |
| } |
| |
| |
| static int GetTestCase( const char *name, Type *outType, Type *inType, SaturationMode *sat, RoundingMode *round ) |
| { |
| int i; |
| |
| //Find the return type |
| for( i = 0; i < kTypeCount; i++ ) |
| if( name == strstr( name, gTypeNames[i] ) ) |
| { |
| *outType = (Type)i; |
| name += strlen( gTypeNames[i] ); |
| |
| break; |
| } |
| |
| if( i == kTypeCount ) |
| return -1; |
| |
| // Check to see if _sat appears next |
| *sat = (SaturationMode)0; |
| for( i = 1; i < kSaturationModeCount; i++ ) |
| if( name == strstr( name, gSaturationNames[i] ) ) |
| { |
| *sat = (SaturationMode)i; |
| name += strlen( gSaturationNames[i] ); |
| break; |
| } |
| |
| *round = (RoundingMode)0; |
| for( i = 1; i < kRoundingModeCount; i++ ) |
| if( name == strstr( name, gRoundingModeNames[i] ) ) |
| { |
| *round = (RoundingMode)i; |
| name += strlen( gRoundingModeNames[i] ); |
| break; |
| } |
| |
| if( *name != '_' ) |
| return -2; |
| name++; |
| |
| for( i = 0; i < kTypeCount; i++ ) |
| if( name == strstr( name, gTypeNames[i] ) ) |
| { |
| *inType = (Type)i; |
| name += strlen( gTypeNames[i] ); |
| |
| break; |
| } |
| |
| if( i == kTypeCount ) |
| return -3; |
| |
| if( *name != '\0' ) |
| return -4; |
| |
| return 0; |
| } |
| |
| #pragma mark - |
| #pragma mark OpenCL |
| |
| test_status InitCL( cl_device_id device ) |
| { |
| int error, i; |
| size_t configSize = sizeof( gComputeDevices ); |
| |
| if( (error = clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, configSize, &gComputeDevices, NULL )) ) |
| gComputeDevices = 1; |
| |
| configSize = sizeof( gDeviceFrequency ); |
| if( (error = clGetDeviceInfo( device, CL_DEVICE_MAX_CLOCK_FREQUENCY, configSize, &gDeviceFrequency, NULL )) ) |
| gDeviceFrequency = 0; |
| |
| cl_device_fp_config floatCapabilities = 0; |
| if( (error = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(floatCapabilities), &floatCapabilities, NULL))) |
| floatCapabilities = 0; |
| if(0 == (CL_FP_DENORM & floatCapabilities) ) |
| gForceFTZ ^= 1; |
| |
| if( 0 == (floatCapabilities & CL_FP_ROUND_TO_NEAREST ) ) |
| { |
| char profileStr[128] = ""; |
| // Verify that we are an embedded profile device |
| if( (error = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof( profileStr ), profileStr, NULL ) ) ) |
| { |
| vlog_error( "FAILURE: Could not get device profile: error %d\n", error ); |
| return TEST_FAIL; |
| } |
| |
| if( strcmp( profileStr, "EMBEDDED_PROFILE" ) ) |
| { |
| vlog_error( "FAILURE: non-embedded profile device does not support CL_FP_ROUND_TO_NEAREST\n" ); |
| return TEST_FAIL; |
| } |
| |
| if( 0 == (floatCapabilities & CL_FP_ROUND_TO_ZERO ) ) |
| { |
| vlog_error( "FAILURE: embedded profile device supports neither CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n" ); |
| return TEST_FAIL; |
| } |
| |
| gIsRTZ = 1; |
| } |
| |
| else if(is_extension_available(device, "cl_khr_fp64")) |
| { |
| gHasDouble = 1; |
| } |
| gTestDouble &= gHasDouble; |
| |
| //detect whether profile of the device is embedded |
| char profile[1024] = ""; |
| if( (error = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof(profile), profile, NULL ) ) ){} |
| else if( strstr(profile, "EMBEDDED_PROFILE" ) ) |
| { |
| gIsEmbedded = 1; |
| if( !is_extension_available(device, "cles_khr_int64" ) ) |
| gHasLong = 0; |
| } |
| |
| |
| gContext = clCreateContext( NULL, 1, &device, notify_callback, NULL, &error ); |
| if( NULL == gContext || error ) |
| { |
| vlog_error( "clCreateContext failed. (%d)\n", error ); |
| return TEST_FAIL; |
| } |
| |
| gQueue = clCreateCommandQueue(gContext, device, 0, &error); |
| if( NULL == gQueue || error ) |
| { |
| vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); |
| return TEST_FAIL; |
| } |
| |
| //Allocate buffers |
| //FIXME: use clProtectedArray for guarded allocations? |
| gIn = malloc( BUFFER_SIZE + 2 * kPageSize ); |
| gAllowZ = malloc( BUFFER_SIZE + 2 * kPageSize ); |
| gRef = malloc( BUFFER_SIZE + 2 * kPageSize ); |
| for( i = 0; i < kCallStyleCount; i++ ) |
| { |
| gOut[i] = malloc( BUFFER_SIZE + 2 * kPageSize ); |
| if( NULL == gOut[i] ) |
| return TEST_FAIL; |
| } |
| |
| // setup input buffers |
| gInBuffer = clCreateBuffer(gContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, BUFFER_SIZE, NULL, &error); |
| if( gInBuffer == NULL || error) |
| { |
| vlog_error( "clCreateBuffer failed for input (%d)\n", error ); |
| return TEST_FAIL; |
| } |
| |
| // setup output buffers |
| for( i = 0; i < kCallStyleCount; i++ ) |
| { |
| gOutBuffers[i] = clCreateBuffer( gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, BUFFER_SIZE, NULL, &error ); |
| if( gOutBuffers[i] == NULL || error ) |
| { |
| vlog_error( "clCreateArray failed for output (%d)\n", error ); |
| return TEST_FAIL; |
| } |
| } |
| |
| |
| gMTdata = init_genrand( gRandomSeed ); |
| |
| |
| char c[1024]; |
| static const char *no_yes[] = { "NO", "YES" }; |
| vlog( "\nCompute Device info:\n" ); |
| clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(c), c, NULL); |
| vlog( "\tDevice Name: %s\n", c ); |
| clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(c), c, NULL); |
| vlog( "\tVendor: %s\n", c ); |
| clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(c), c, NULL); |
| vlog( "\tDevice Version: %s\n", c ); |
| clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL); |
| vlog( "\tCL C Version: %s\n", c ); |
| clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(c), c, NULL); |
| vlog( "\tDriver Version: %s\n", c ); |
| vlog( "\tProcessing with %ld devices\n", gComputeDevices ); |
| vlog( "\tDevice Frequency: %d MHz\n", gDeviceFrequency ); |
| vlog( "\tSubnormal values supported for floats? %s\n", no_yes[0 != (CL_FP_DENORM & floatCapabilities)] ); |
| vlog( "\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ] ); |
| vlog( "\tTesting with default RTZ mode for floats? %s\n", no_yes[0 != gIsRTZ] ); |
| vlog( "\tHas Double? %s\n", no_yes[0 != gHasDouble] ); |
| if( gHasDouble ) |
| vlog( "\tTest Double? %s\n", no_yes[0 != gTestDouble] ); |
| vlog( "\tHas Long? %s\n", no_yes[0 != gHasLong] ); |
| vlog( "\tTesting vector sizes: " ); |
| for( i = gMinVectorSize; i < gMaxVectorSize; i++ ) |
| vlog("\t%d", vectorSizes[i]); |
| vlog( "\n" ); |
| return TEST_PASS; |
| } |
| |
| static int RunKernel( cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount ) |
| { |
| // The global dimensions are just the blockCount to execute since we haven't set up multiple queues for multiple devices. |
| int error; |
| |
| error = clSetKernelArg(kernel, 0, sizeof( inBuf ), &inBuf); |
| error |= clSetKernelArg(kernel, 1, sizeof(outBuf), &outBuf); |
| |
| if( error ) |
| { |
| vlog_error( "FAILED -- could not set kernel args (%d)\n", error ); |
| return error; |
| } |
| |
| if( (error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount, NULL, 0, NULL, NULL))) |
| { |
| vlog_error( "FAILED -- could not execute kernel (%d)\n", error ); |
| return error; |
| } |
| |
| return 0; |
| } |
| |
| #if ! defined( __APPLE__ ) |
| void memset_pattern4(void *dest, const void *src_pattern, size_t bytes ); |
| #endif |
| |
| #if defined( __APPLE__ ) |
| #include <mach/mach_time.h> |
| #endif |
| |
| uint64_t GetTime( void ); |
| uint64_t GetTime( void ) |
| { |
| #if defined( __APPLE__ ) |
| return mach_absolute_time(); |
| #elif defined(_MSC_VER) |
| return ReadTime(); |
| #else |
| //mach_absolute_time is a high precision timer with precision < 1 microsecond. |
| #warning need accurate clock here. Times are invalid. |
| return 0; |
| #endif |
| } |
| |
| |
| #if defined (_MSC_VER) |
| /* function is defined in "compat.h" */ |
| #else |
| double SubtractTime( uint64_t endTime, uint64_t startTime ); |
| double SubtractTime( uint64_t endTime, uint64_t startTime ) |
| { |
| uint64_t diff = endTime - startTime; |
| static double conversion = 0.0; |
| |
| if( 0.0 == conversion ) |
| { |
| #if defined( __APPLE__ ) |
| mach_timebase_info_data_t info = {0,0}; |
| kern_return_t err = mach_timebase_info( &info ); |
| if( 0 == err ) |
| conversion = 1e-9 * (double) info.numer / (double) info.denom; |
| #else |
| // This function consumes output from GetTime() above, and converts the time to secionds. |
| #warning need accurate ticks to seconds conversion factor here. Times are invalid. |
| #endif |
| } |
| |
| // strictly speaking we should also be subtracting out timer latency here |
| return conversion * (double) diff; |
| } |
| #endif |
| |
| typedef struct CalcReferenceValuesInfo |
| { |
| struct WriteInputBufferInfo *parent; // pointer back to the parent WriteInputBufferInfo struct |
| cl_kernel kernel; // the kernel for this vector size |
| cl_program program; // the program for this vector size |
| cl_uint vectorSize; // the vector size for this callback chain |
| void *p; // the pointer to mapped result data for this vector size |
| cl_int result; |
| }CalcReferenceValuesInfo; |
| |
| typedef struct WriteInputBufferInfo |
| { |
| volatile cl_event calcReferenceValues; // user event which signals when main thread is done calculating reference values |
| volatile cl_event doneBarrier; // user event which signals when worker threads are done |
| cl_uint count; // the number of elements in the array |
| Type outType; // the data type of the conversion result |
| Type inType; // the data type of the conversion input |
| volatile int barrierCount; |
| CalcReferenceValuesInfo calcInfo[kCallStyleCount]; |
| }WriteInputBufferInfo; |
| |
| cl_uint RoundUpToNextPowerOfTwo( cl_uint x ); |
| cl_uint RoundUpToNextPowerOfTwo( cl_uint x ) |
| { |
| if( 0 == (x & (x-1))) |
| return x; |
| |
| while( x & (x-1) ) |
| x &= x-1; |
| |
| return x + x; |
| } |
| |
| void CL_CALLBACK WriteInputBufferComplete( cl_event, cl_int, void * ); |
| |
| typedef struct DataInitInfo |
| { |
| cl_ulong start; |
| cl_uint size; |
| Type outType; |
| Type inType; |
| SaturationMode sat; |
| RoundingMode round; |
| MTdata *d; |
| }DataInitInfo; |
| |
| cl_int InitData( cl_uint job_id, cl_uint thread_id, void *p ); |
| cl_int InitData( cl_uint job_id, cl_uint thread_id, void *p ) |
| { |
| DataInitInfo *info = (DataInitInfo*) p; |
| |
| gInitFunctions[ info->inType ]( (char*)gIn + job_id * info->size * gTypeSizes[info->inType], info->sat, info->round, |
| info->outType, info->start + job_id * info->size, info->size, info->d[thread_id] ); |
| return CL_SUCCESS; |
| } |
| |
| static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count) |
| { |
| cl_uint i; |
| for (i = 0; i < count; ++i) |
| allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0); |
| } |
| |
| cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p ); |
| cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p ) |
| { |
| DataInitInfo *info = (DataInitInfo*) p; |
| cl_uint count = info->size; |
| Type inType = info->inType; |
| Type outType = info->outType; |
| RoundingMode round = info->round; |
| size_t j; |
| |
| Force64BitFPUPrecision(); |
| |
| void *s = (cl_uchar*) gIn + job_id * count * gTypeSizes[info->inType]; |
| void *a = (cl_uchar*) gAllowZ + job_id * count; |
| void *d = (cl_uchar*) gRef + job_id * count * gTypeSizes[info->outType]; |
| |
| if (outType != inType) |
| { |
| //create the reference while we wait |
| Convert f = gConversions[ outType ][ inType ]; |
| if( info->sat ) |
| f = gSaturatedConversions[ outType ][ inType ]; |
| |
| #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__) |
| /* ARM VFP doesn't have hardware instruction for converting from 64-bit |
| * integer to float types, hence GCC ARM uses the floating-point |
| * emulation code despite which -mfloat-abi setting it is. But the |
| * emulation code in libgcc.a has only one rounding mode (round to |
| * nearest even in this case) and ignores the user rounding mode setting |
| * in hardware. As a result setting rounding modes in hardware won't |
| * give correct rounding results for type covert from 64-bit integer to |
| * float using GCC for ARM compiler so for testing different rounding |
| * modes, we need to use alternative reference function. ARM64 does have |
| * an instruction, however we cannot guarantee the compiler will use it. |
| * On all ARM architechures use emulation to calculate reference.*/ |
| switch (round) |
| { |
| /* conversions to floating-point type use the current rounding mode. |
| * The only default floating-point rounding mode supported is round to nearest even |
| * i.e the current rounding mode will be _rte for floating-point types. */ |
| case kDefaultRoundingMode: |
| qcom_rm = qcomRTE; |
| break; |
| case kRoundToNearestEven: |
| qcom_rm = qcomRTE; |
| break; |
| case kRoundUp: |
| qcom_rm = qcomRTP; |
| break; |
| case kRoundDown: |
| qcom_rm = qcomRTN; |
| break; |
| case kRoundTowardZero: |
| qcom_rm = qcomRTZ; |
| break; |
| default: |
| vlog_error("ERROR: undefined rounding mode %d\n", round); |
| break; |
| } |
| qcom_sat = info->sat; |
| #endif |
| |
| RoundingMode oldRound = set_round( round, outType ); |
| f( d, s, count ); |
| set_round( oldRound, outType ); |
| |
| // Decide if we allow a zero result in addition to the correctly rounded one |
| memset(a, 0, count); |
| if (gForceFTZ) { |
| if (inType == kfloat) |
| setAllowZ((uint8_t*)a, (uint32_t*)s, count); |
| if (outType == kfloat) |
| setAllowZ((uint8_t*)a, (uint32_t*)d, count); |
| } |
| } |
| else |
| { |
| // Copy the input to the reference |
| memcpy(d, s, info->size * gTypeSizes[inType]); |
| } |
| |
| //Patch up NaNs conversions to integer to zero -- these can be converted to any integer |
| if( info->outType != kfloat && info->outType != kdouble ) |
| { |
| if( inType == kfloat ) |
| { |
| float *inp = (float*) s; |
| for( j = 0; j < count; j++ ) |
| { |
| if( isnan( inp[j] ) ) |
| memset( (char*) d + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] ); |
| } |
| } |
| if( inType == kdouble ) |
| { |
| double *inp = (double*) s; |
| for( j = 0; j < count; j++ ) |
| { |
| if( isnan( inp[j] ) ) |
| memset( (char*) d + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] ); |
| } |
| } |
| } |
| else if( inType == kfloat || inType == kdouble ) |
| { // outtype and intype is float or double. NaN conversions for float <-> double can be any NaN |
| if( inType == kfloat && outType == kdouble ) |
| { |
| float *inp = (float*) s; |
| for( j = 0; j < count; j++ ) |
| { |
| if( isnan( inp[j] ) ) |
| ((double*) d)[j] = NAN; |
| } |
| } |
| if( inType == kdouble && outType == kfloat ) |
| { |
| double *inp = (double*) s; |
| for( j = 0; j < count; j++ ) |
| { |
| if( isnan( inp[j] ) ) |
| ((float*) d)[j] = NAN; |
| } |
| } |
| } |
| |
| return CL_SUCCESS; |
| } |
| |
| static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMode sat, RoundingMode round, MTdata d ) |
| { |
| #ifdef __APPLE__ |
| cl_ulong wall_start = mach_absolute_time(); |
| #endif |
| |
| DataInitInfo init_info = { 0, 0, outType, inType, sat, round, NULL }; |
| WriteInputBufferInfo writeInputBufferInfo; |
| int vectorSize; |
| int error = 0; |
| cl_uint threads = GetThreadCount(); |
| uint64_t i; |
| |
| gTestCount++; |
| size_t blockCount = BUFFER_SIZE / MAX( gTypeSizes[ inType ], gTypeSizes[ outType ] ); |
| size_t step = blockCount; |
| uint64_t lastCase = 1ULL << (8*gTypeSizes[ inType ]); |
| cl_event writeInputBuffer = NULL; |
| |
| memset( &writeInputBufferInfo, 0, sizeof( writeInputBufferInfo ) ); |
| init_info.d = (MTdata*)malloc( threads * sizeof( MTdata ) ); |
| if( NULL == init_info.d ) |
| { |
| vlog_error( "ERROR: Unable to allocate storage for random number generator!\n" ); |
| return -1; |
| } |
| for( i = 0; i < threads; i++ ) |
| { |
| init_info.d[i] = init_genrand( genrand_int32( d ) ); |
| if( NULL == init_info.d[i] ) |
| { |
| vlog_error( "ERROR: Unable to allocate storage for random number generator!\n" ); |
| return -1; |
| } |
| } |
| |
| writeInputBufferInfo.outType = outType; |
| writeInputBufferInfo.inType = inType; |
| |
| for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) |
| { |
| writeInputBufferInfo.calcInfo[vectorSize].program = MakeProgram( outType, inType, sat, round, vectorSize, |
| &writeInputBufferInfo.calcInfo[vectorSize].kernel ); |
| if( NULL == writeInputBufferInfo.calcInfo[vectorSize].program ) |
| { |
| gFailCount++; |
| return -1; |
| } |
| if( NULL == writeInputBufferInfo.calcInfo[vectorSize].kernel ) |
| { |
| gFailCount++; |
| vlog_error( "\t\tFAILED -- Failed to create kernel.\n" ); |
| return -2; |
| } |
| |
| writeInputBufferInfo.calcInfo[vectorSize].parent = &writeInputBufferInfo; |
| writeInputBufferInfo.calcInfo[vectorSize].vectorSize = vectorSize; |
| writeInputBufferInfo.calcInfo[vectorSize].result = -1; |
| } |
| |
| if( gSkipTesting ) |
| goto exit; |
| |
| // Patch up rounding mode if default is RTZ |
| // We leave the part above in default rounding mode so that the right kernel is compiled. |
| if( round == kDefaultRoundingMode && gIsRTZ && (outType == kfloat) ) |
| init_info.round = round = kRoundTowardZero; |
| |
| // Figure out how many elements are in a work block |
| |
| // we handle 64-bit types a bit differently. |
| if( 8*gTypeSizes[ inType ] > 32 ) |
| lastCase = 0x100000000ULL; |
| |
| if ( !gWimpyMode && gIsEmbedded ) |
| step = blockCount * EMBEDDED_REDUCTION_FACTOR; |
| |
| if ( gWimpyMode ) |
| step = (size_t)blockCount * (size_t)gWimpyReductionFactor; |
| vlog( "Testing... " ); |
| fflush(stdout); |
| for( i = 0; i < (uint64_t)lastCase; i += step ) |
| { |
| |
| if( 0 == ( i & ((lastCase >> 3) -1))) { |
| vlog("."); |
| fflush(stdout); |
| } |
| |
| cl_uint count = (uint32_t) MIN( blockCount, lastCase - i ); |
| writeInputBufferInfo.count = count; |
| |
| // Crate a user event to represent the status of the reference value computation completion |
| writeInputBufferInfo.calcReferenceValues = clCreateUserEvent( gContext, &error); |
| if( error || NULL == writeInputBufferInfo.calcReferenceValues ) |
| { |
| vlog_error( "ERROR: Unable to create user event. (%d)\n", error ); |
| gFailCount++; |
| goto exit; |
| } |
| |
| // retain for consumption by MapOutputBufferComplete |
| for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) |
| { |
| if( (error = clRetainEvent(writeInputBufferInfo.calcReferenceValues) )) |
| { |
| vlog_error( "ERROR: Unable to retain user event. (%d)\n", error ); |
| gFailCount++; |
| goto exit; |
| } |
| } |
| |
| // Crate a user event to represent when the callbacks are done verifying correctness |
| writeInputBufferInfo.doneBarrier = clCreateUserEvent( gContext, &error); |
| if( error || NULL == writeInputBufferInfo.calcReferenceValues ) |
| { |
| vlog_error( "ERROR: Unable to create user event for barrier. (%d)\n", error ); |
| gFailCount++; |
| goto exit; |
| } |
| |
| // retain for use by the callback that calls this |
| if( (error = clRetainEvent(writeInputBufferInfo.doneBarrier) )) |
| { |
| vlog_error( "ERROR: Unable to retain user event doneBarrier. (%d)\n", error ); |
| gFailCount++; |
| goto exit; |
| } |
| |
| // Call this in a multithreaded manner |
| // gInitFunctions[ inType ]( gIn, sat, round, outType, i, count, d ); |
| cl_uint chunks = RoundUpToNextPowerOfTwo(threads) * 2; |
| init_info.start = i; |
| init_info.size = count / chunks; |
| if( init_info.size < 16384 ) |
| { |
| chunks = RoundUpToNextPowerOfTwo(threads); |
| init_info.size = count / chunks; |
| if( init_info.size < 16384 ) |
| { |
| init_info.size = count; |
| chunks = 1; |
| } |
| } |
| ThreadPool_Do(InitData, chunks, &init_info); |
| |
| // Copy the results to the device |
| writeInputBuffer = NULL; |
| if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, count * gTypeSizes[inType], gIn, 0, NULL, &writeInputBuffer ))) |
| { |
| vlog_error( "ERROR: clEnqueueWriteBuffer failed. (%d)\n", error ); |
| gFailCount++; |
| goto exit; |
| } |
| |
| // Setup completion callback for the write, which will enqueue the rest of the work |
| // This is somewhat gratuitous. Because this is an in order queue, we didn't really need to |
| // do this work in a callback. We could have done it from the main thread. Here we are |
| // verifying that the implementation can enqueue work from a callback, while at the same time |
| // also checking to make sure that the conversions work. |
| // |
| // Because the verification code is also moved to a callback, it is hoped that implementations will |
| // achieve a test performance improvement because they can verify the results in parallel. If the |
| // implementation serializes callbacks however, that won't happen. Consider it some motivation |
| // to do the right thing! :-) |
| if( (error = clSetEventCallback( writeInputBuffer, CL_COMPLETE, WriteInputBufferComplete, &writeInputBufferInfo)) ) |
| { |
| vlog_error( "ERROR: clSetEventCallback failed. (%d)\n", error ); |
| gFailCount++; |
| goto exit; |
| } |
| |
| // The event can't be destroyed until the callback is called, so we can release it now. |
| if( (error = clReleaseEvent(writeInputBuffer) )) |
| { |
| vlog_error( "ERROR: clReleaseEvent failed. (%d)\n", error ); |
| gFailCount++; |
| goto exit; |
| } |
| |
| // Make sure the work is actually running, so we don't deadlock |
| if( (error = clFlush( gQueue ) ) ) |
| { |
| vlog_error( "clFlush failed with error %d\n", error ); |
| gFailCount++; |
| goto exit; |
| } |
| |
| ThreadPool_Do(PrepareReference, chunks, &init_info); |
| |
| // signal we are done calculating the reference results |
| if( (error = clSetUserEventStatus( writeInputBufferInfo.calcReferenceValues, CL_COMPLETE ) ) ) |
| { |
| vlog_error( "Error: Failed to set user event status to CL_COMPLETE: %d\n", error ); |
| gFailCount++; |
| goto exit; |
| } |
| |
| // Wait for the event callbacks to finish verifying correctness. |
| if( (error = clWaitForEvents( 1, (cl_event*) &writeInputBufferInfo.doneBarrier ) )) |
| { |
| vlog_error( "Error: Failed to wait for barrier: %d\n", error ); |
| gFailCount++; |
| goto exit; |
| } |
| |
| if( (error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues ) )) |
| { |
| vlog_error( "Error: Failed to release calcReferenceValues: %d\n", error ); |
| gFailCount++; |
| goto exit; |
| } |
| |
| if( (error = clReleaseEvent(writeInputBufferInfo.doneBarrier ) )) |
| { |
| vlog_error( "Error: Failed to release done barrier: %d\n", error ); |
| gFailCount++; |
| goto exit; |
| } |
| |
| |
| for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) |
| { |
| if( ( error = writeInputBufferInfo.calcInfo[ vectorSize ].result )) |
| { |
| switch( inType ) |
| { |
| case kuchar: |
| case kchar: |
| vlog( "Input value: 0x%2.2x ", ((unsigned char*)gIn)[error - 1] ); |
| break; |
| case kushort: |
| case kshort: |
| vlog( "Input value: 0x%4.4x ", ((unsigned short*)gIn)[error - 1] ); |
| break; |
| case kuint: |
| case kint: |
| vlog( "Input value: 0x%8.8x ", ((unsigned int*)gIn)[error - 1] ); |
| break; |
| case kfloat: |
| vlog( "Input value: %a ", ((float*)gIn)[error - 1] ); |
| break; |
| break; |
| case kulong: |
| case klong: |
| vlog( "Input value: 0x%16.16llx ", ((unsigned long long*)gIn)[error - 1] ); |
| break; |
| case kdouble: |
| vlog( "Input value: %a ", ((double*)gIn)[error - 1]); |
| break; |
| default: |
| vlog_error( "Internal error at %s: %d\n", __FILE__, __LINE__ ); |
| abort(); |
| break; |
| } |
| |
| // tell the user which conversion it was. |
| if( 0 == vectorSize ) |
| vlog( " (implicit scalar conversion from %s to %s)\n", gTypeNames[ inType ], gTypeNames[ outType ] ); |
| else |
| vlog( " (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType], sizeNames[vectorSize], gSaturationNames[ sat ], |
| gRoundingModeNames[ round ], gTypeNames[inType], sizeNames[vectorSize] ); |
| |
| gFailCount++; |
| goto exit; |
| } |
| } |
| } |
| |
| log_info( "done.\n" ); |
| |
| if( gTimeResults ) |
| { |
| //Kick off tests for the various vector lengths |
| for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) |
| { |
| size_t workItemCount = blockCount / vectorSizes[vectorSize]; |
| if( vectorSizes[vectorSize] * gTypeSizes[outType] < 4 ) |
| workItemCount /= 4 / (vectorSizes[vectorSize] * gTypeSizes[outType]); |
| |
| double sum = 0.0; |
| double bestTime = INFINITY; |
| cl_uint k; |
| for( k = 0; k < PERF_LOOP_COUNT; k++ ) |
| { |
| uint64_t startTime = GetTime(); |
| if( (error = RunKernel( writeInputBufferInfo.calcInfo[vectorSize].kernel, gInBuffer, gOutBuffers[ vectorSize ], workItemCount )) ) |
| { |
| gFailCount++; |
| goto exit; |
| } |
| |
| // Make sure OpenCL is done |
| if( (error = clFinish(gQueue) ) ) |
| { |
| vlog_error( "Error %d at clFinish\n", error ); |
| goto exit; |
| } |
| |
| uint64_t endTime = GetTime(); |
| double time = SubtractTime( endTime, startTime ); |
| sum += time; |
| if( time < bestTime ) |
| bestTime = time; |
| |
| } |
| |
| if( gReportAverageTimes ) |
| bestTime = sum / PERF_LOOP_COUNT; |
| double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (workItemCount * vectorSizes[vectorSize]); |
| if( 0 == vectorSize ) |
| vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "implicit convert %s -> %s", gTypeNames[ inType ], gTypeNames[ outType ] ); |
| else |
| vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "convert_%s%s%s%s( %s%s )", gTypeNames[ outType ], sizeNames[vectorSize], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType], sizeNames[vectorSize] ); |
| } |
| } |
| |
| if( gWimpyMode ) |
| vlog( "\tWimp pass" ); |
| else |
| vlog( "\tpassed" ); |
| |
| #ifdef __APPLE__ |
| // record the run time |
| vlog( "\t(%f s)", 1e-9 * ( mach_absolute_time() - wall_start ) ); |
| #endif |
| vlog( "\n\n" ); |
| fflush( stdout ); |
| |
| |
| exit: |
| //clean up |
| for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) |
| { |
| clReleaseProgram( writeInputBufferInfo.calcInfo[vectorSize].program ); |
| clReleaseKernel( writeInputBufferInfo.calcInfo[vectorSize].kernel ); |
| } |
| |
| if( init_info.d ) |
| { |
| for( i = 0; i < threads; i++ ) |
| free_mtdata(init_info.d[i]); |
| free(init_info.d); |
| } |
| |
| return error; |
| } |
| |
| void CL_CALLBACK MapResultValuesComplete( cl_event e, cl_int status, void *data ); |
| |
| // Note: not called reentrantly |
| void CL_CALLBACK WriteInputBufferComplete( cl_event e, cl_int status, void *data ) |
| { |
| WriteInputBufferInfo *info = (WriteInputBufferInfo*) data; |
| cl_uint count = info->count; |
| int vectorSize; |
| |
| if( CL_SUCCESS != status ) |
| { |
| vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status ); |
| gFailCount++; |
| return; |
| } |
| |
| info->barrierCount = gMaxVectorSize - gMinVectorSize; |
| |
| // now that we know that the write buffer is complete, enqueue callbacks to wait for the main thread to |
| // finish calculating the reference results. |
| for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) |
| { |
| size_t workItemCount = (count + vectorSizes[vectorSize] - 1) / ( vectorSizes[vectorSize]); |
| cl_event mapComplete = NULL; |
| |
| if( (status = RunKernel( info->calcInfo[ vectorSize ].kernel, gInBuffer, gOutBuffers[ vectorSize ], workItemCount )) ) |
| { |
| gFailCount++; |
| return; |
| } |
| |
| info->calcInfo[vectorSize].p = clEnqueueMapBuffer( gQueue, gOutBuffers[ vectorSize ], CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, |
| 0, count * gTypeSizes[ info->outType ], 0, NULL, &mapComplete, &status); |
| { |
| if( status ) |
| { |
| vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status ); |
| gFailCount++; |
| return; |
| } |
| } |
| |
| if( (status = clSetEventCallback( mapComplete, CL_COMPLETE, MapResultValuesComplete, info->calcInfo + vectorSize))) |
| { |
| vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status ); |
| gFailCount++; |
| return; |
| } |
| |
| if( (status = clReleaseEvent(mapComplete))) |
| { |
| vlog_error( "ERROR: clReleaseEvent calback failed in WriteInputBufferComplete for vector size %d with status: %d\n", vectorSize, status ); |
| gFailCount++; |
| return; |
| } |
| } |
| |
| // Make sure the work starts moving -- otherwise we may deadlock |
| if( (status = clFlush(gQueue))) |
| { |
| vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status ); |
| gFailCount++; |
| return; |
| } |
| |
| // e was already released by the main thread. It should be destroyed automatically soon after we exit. |
| } |
| |
| void CL_CALLBACK CalcReferenceValuesComplete( cl_event e, cl_int status, void *data ); |
| |
| // Note: May be called reentrantly |
| void CL_CALLBACK MapResultValuesComplete( cl_event e, cl_int status, void *data ) |
| { |
| CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo*) data; |
| cl_event calcReferenceValues = info->parent->calcReferenceValues; |
| |
| if( CL_SUCCESS != status ) |
| { |
| vlog_error( "ERROR: MapResultValuesComplete calback failed with status: %d\n", status ); |
| gFailCount++; // not thread safe -- being lazy here |
| clReleaseEvent(calcReferenceValues); |
| return; |
| } |
| |
| // we know that the map is done, wait for the main thread to finish calculating the reference values |
| if( (status = clSetEventCallback( calcReferenceValues, CL_COMPLETE, CalcReferenceValuesComplete, data ))) |
| { |
| vlog_error( "ERROR: clSetEventCallback failed in MapResultValuesComplete with status: %d\n", status ); |
| gFailCount++; // not thread safe -- being lazy here |
| } |
| |
| // this thread no longer needs its reference to info->calcReferenceValues, so release it |
| if( (status = clReleaseEvent(calcReferenceValues) )) |
| { |
| vlog_error( "ERROR: clReleaseEvent(info->calcReferenceValues) failed with status: %d\n", status ); |
| gFailCount++; // not thread safe -- being lazy here |
| } |
| |
| // no need to flush since we didn't enqueue anything |
| |
| // e was already released by WriteInputBufferComplete. It should be destroyed automatically soon after we exit. |
| } |
| |
| |
| void CL_CALLBACK CalcReferenceValuesComplete( cl_event e, cl_int status, void *data ) |
| { |
| CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo*) data; |
| cl_uint vectorSize = info->vectorSize; |
| cl_uint count = info->parent->count; |
| Type outType = info->parent->outType; // the data type of the conversion result |
| Type inType = info->parent->inType; // the data type of the conversion input |
| size_t j; |
| cl_int error; |
| cl_event doneBarrier = info->parent->doneBarrier; |
| |
| // report spurious error condition |
| if( CL_SUCCESS != status ) |
| { |
| vlog_error( "ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n", status ); |
| gFailCount++; // lazy about thread safety here |
| return; |
| } |
| |
| // Now we know that both results have been mapped back from the device, and the |
| // main thread is done calculating the reference results. It is now time to check |
| // the results. |
| |
| // verify results |
| void *mapped = info->p; |
| |
| //Patch up NaNs conversions to integer to zero -- these can be converted to any integer |
| if( outType != kfloat && outType != kdouble ) |
| { |
| if( inType == kfloat ) |
| { |
| float *inp = (float*) gIn; |
| for( j = 0; j < count; j++ ) |
| { |
| if( isnan( inp[j] ) ) |
| memset( (char*) mapped + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] ); |
| } |
| } |
| if( inType == kdouble ) |
| { |
| double *inp = (double*) gIn; |
| for( j = 0; j < count; j++ ) |
| { |
| if( isnan( inp[j] ) ) |
| memset( (char*) mapped + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] ); |
| } |
| } |
| } |
| else if( inType == kfloat || inType == kdouble ) |
| { // outtype and intype is float or double. NaN conversions for float <-> double can be any NaN |
| if( inType == kfloat && outType == kdouble ) |
| { |
| float *inp = (float*) gIn; |
| double *outp = (double*) mapped; |
| for( j = 0; j < count; j++ ) |
| { |
| if( isnan( inp[j] ) && isnan(outp[j]) ) |
| outp[j] = NAN; |
| } |
| } |
| if( inType == kdouble && outType == kfloat ) |
| { |
| double *inp = (double*) gIn; |
| float *outp = (float*) mapped; |
| for( j = 0; j < count; j++ ) |
| { |
| if( isnan( inp[j] ) && isnan(outp[j]) ) |
| outp[j] = NAN; |
| } |
| } |
| } |
| |
| if( memcmp( mapped, gRef, count * gTypeSizes[ outType ] ) ) |
| info->result = gCheckResults[outType]( mapped, gRef, gAllowZ, count, vectorSizes[vectorSize] ); |
| else |
| info->result = 0; |
| |
| // Fill the output buffer with junk and release it |
| { |
| cl_uint pattern = 0xffffdead; |
| memset_pattern4(mapped, &pattern, count * gTypeSizes[outType]); |
| if((error = clEnqueueUnmapMemObject(gQueue, gOutBuffers[ vectorSize ], mapped, 0, NULL, NULL))) |
| { |
| vlog_error( "ERROR: clEnqueueUnmapMemObject failed in CalcReferenceValuesComplete (%d)\n", error ); |
| gFailCount++; |
| } |
| } |
| |
| if( 1 == ThreadPool_AtomicAdd( &info->parent->barrierCount, -1) ) |
| { |
| if( (status = clSetUserEventStatus( doneBarrier, CL_COMPLETE) )) |
| { |
| vlog_error( "ERROR: clSetUserEventStatus failed in CalcReferenceValuesComplete (err: %d). We're probably going to deadlock.\n", status ); |
| gFailCount++; |
| return; |
| } |
| |
| if( (status = clReleaseEvent( doneBarrier ) ) ) |
| { |
| vlog_error( "ERROR: clReleaseEvent failed in CalcReferenceValuesComplete (err: %d).\n", status ); |
| gFailCount++; |
| return; |
| } |
| } |
| |
| |
| // e was already released by WriteInputBufferComplete. It should be destroyed automatically soon after |
| // all the calls to CalcReferenceValuesComplete exit. |
| } |
| |
| static cl_program MakeProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel ) |
| { |
| cl_program program; |
| char testName[256]; |
| int error = 0; |
| const char **strings; |
| size_t stringCount = 0; |
| |
| // Create the program. This is a bit complicated because we are trying to avoid byte and short stores. |
| if (0 == vectorSize) |
| { |
| char inName[32]; |
| char outName[32]; |
| const char *programSource[] = |
| { |
| "", // optional pragma |
| "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n" |
| "{\n" |
| " size_t i = get_global_id(0);\n" |
| " dest[i] = src[i];\n" |
| "}\n" |
| }; |
| stringCount = sizeof(programSource) / sizeof(programSource[0]); |
| strings = programSource; |
| |
| if (outType == kdouble || inType == kdouble) |
| programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; |
| |
| //create the type name |
| strncpy(inName, gTypeNames[inType], sizeof(inName)); |
| strncpy(outName, gTypeNames[outType], sizeof(outName)); |
| sprintf(testName, "test_implicit_%s_%s", outName, inName); |
| vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType], gTypeNames[outType]); |
| fflush(stdout); |
| } |
| else |
| { |
| int vectorSizetmp = vectorSizes[vectorSize]; |
| |
| char convertString[128]; |
| char inName[32]; |
| char outName[32]; |
| const char *programSource[] = |
| { |
| "", // optional pragma |
| "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n" |
| "{\n" |
| " size_t i = get_global_id(0);\n" |
| " dest[i] = ", convertString, "( src[i] );\n" |
| "}\n" |
| }; |
| const char *programSourceV3[] = |
| { |
| "", // optional pragma |
| "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n" |
| "{\n" |
| " size_t i = get_global_id(0);\n" |
| " if( i + 1 < get_global_size(0))\n" |
| " vstore3( ", convertString, "( vload3( i, src)), i, dest );\n" |
| " else\n" |
| " {\n" |
| " ", inName, "3 in;\n" |
| " ", outName, "3 out;\n" |
| " if( 0 == (i & 1) )\n" |
| " in.y = src[3*i+1];\n" |
| " in.x = src[3*i];\n" |
| " out = ", convertString, "( in ); \n" |
| " dest[3*i] = out.x;\n" |
| " if( 0 == (i & 1) )\n" |
| " dest[3*i+1] = out.y;\n" |
| " }\n" |
| "}\n" |
| }; |
| stringCount = 3 == vectorSizetmp ? sizeof(programSourceV3) / sizeof(programSourceV3[0]) : |
| sizeof(programSource) / sizeof(programSource[0]); |
| strings = 3 == vectorSizetmp ? programSourceV3 : programSource; |
| |
| if (outType == kdouble || inType == kdouble) { |
| programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; |
| programSourceV3[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; |
| } |
| |
| //create the type name |
| switch (vectorSizetmp) |
| { |
| case 1: |
| strncpy(inName, gTypeNames[inType], sizeof(inName)); |
| strncpy(outName, gTypeNames[outType], sizeof(outName)); |
| snprintf(convertString, sizeof(convertString), "convert_%s%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]); |
| snprintf(testName, 256, "test_%s_%s", convertString, inName); |
| vlog("Building %s( %s ) test\n", convertString, inName); |
| break; |
| case 3: |
| strncpy(inName, gTypeNames[inType], sizeof(inName)); |
| strncpy(outName, gTypeNames[outType], sizeof(outName)); |
| snprintf(convertString, sizeof(convertString), "convert_%s3%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]); |
| snprintf(testName, 256, "test_%s_%s3", convertString, inName); |
| vlog("Building %s( %s3 ) test\n", convertString, inName); |
| break; |
| default: |
| snprintf(inName, sizeof(inName), "%s%d", gTypeNames[inType], vectorSizetmp); |
| snprintf(outName, sizeof(outName), "%s%d", gTypeNames[outType], vectorSizetmp); |
| snprintf(convertString, sizeof(convertString), "convert_%s%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]); |
| snprintf(testName, 256, "test_%s_%s", convertString, inName); |
| vlog("Building %s( %s ) test\n", convertString, inName); |
| break; |
| } |
| |
| fflush(stdout); |
| } |
| *outKernel = NULL; |
| |
| const char *flags = NULL; |
| if( gForceFTZ ) |
| flags = "-cl-denorms-are-zero"; |
| |
| // build it |
| error = create_single_kernel_helper(gContext, &program, outKernel, (cl_uint)stringCount, strings, testName, flags); |
| if (error) |
| { |
| char buffer[2048] = ""; |
| |
| vlog_error("Failed to build kernel/program.\n", error); |
| clReleaseProgram(program); |
| return NULL; |
| } |
| |
| return program; |
| } |